In [1]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

main_url = "http://books.toscrape.com/index.html"

def getAndParseURL(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        ``requests.get`` has no timeout of its own, so without this a stalled
        connection would block the whole crawl indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection errors or when the timeout expires.
    """
    result = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of silently parsing an error page.
    result.raise_for_status()
    # Pass the raw bytes: when the server sends no charset, ``result.text`` is
    # decoded as ISO-8859-1 and non-ASCII characters come out garbled.
    # BeautifulSoup detects the encoding from the document itself.
    soup = BeautifulSoup(result.content, 'html.parser')
    return soup


def getBooksURLs(url):
    """Return the absolute URL of every book listed on one catalogue page.

    Parameters
    ----------
    url : str
        URL of a listing page; relative book links are resolved against it.

    Returns
    -------
    list of str
        One absolute URL per ``<article class="product_pod">`` card that
        contains a link, in page order.
    """
    soup = getAndParseURL(url)
    book_urls = []
    for article in soup.find_all("article", class_="product_pod"):
        # first anchor carrying an href inside the product card
        link = article.find("a", href=True)
        if link is None:
            # skip a malformed card rather than aborting the whole page
            continue
        # urljoin drops the trailing file name of ``url`` (e.g. index.html) and
        # also copes with absolute or ``../`` hrefs
        book_urls.append(urljoin(url, link["href"]))
    return book_urls



# Collect the URL of every catalogue page, starting from the landing page.
pages_urls = [main_url]

soup = getAndParseURL(pages_urls[0])

# Follow the "next" button from page to page until a page has none (the last page).
# Candidate links are still the anchors whose href contains "page", but the one to
# follow is chosen by its label instead of by counting matches: counting breaks as
# soon as any unrelated href contains "page", and needs a special case for page one.
# NOTE(review): assumes the button's visible text is "next" -- confirm against the site.
while True:
    pager_links = soup.find_all("a", href=re.compile("page"))
    next_link = next(
        (a for a in pager_links if a.get_text(strip=True).lower() == "next"),
        None,
    )
    if next_link is None:
        break

    # hrefs are relative to the page they appear on, so resolve against the current page
    new_url = urljoin(pages_urls[-1], next_link.get("href"))

    # never revisit a page: protects against a pager that loops back on itself
    if new_url in pages_urls:
        break

    # add the URL to the list
    pages_urls.append(new_url)

    # parse the next page
    soup = getAndParseURL(new_url)
    
# print(str(len(pages_urls)) + " fetched URLs")
# print("Some examples:")
# pages_urls[:5]

# Flatten the per-page product links into a single list of book URLs.
booksURLs = []
for page in pages_urls:
    booksURLs += getBooksURLs(page)


# print(str(len(booksURLs)) + " fetched URLs")
# print("Some examples:")
# booksURLs[:5]

# Visit every book page and record its title and raw category slug.
# Note: this issues one HTTP request per book, so it can take a while.
names = []
categories = []
main_class = re.compile("product_main")
category_href = re.compile("../category/books/")
for url in booksURLs:
    soup = getAndParseURL(url)

    # product name: text of the <h1> inside the first div whose class matches "product_main"
    product_box = soup.find("div", class_=main_class)
    names.append(product_box.h1.text)

    # product category: 4th "/"-separated segment of the first href matching the category path
    category_link = soup.find("a", href=category_href)
    href_parts = category_link.get("href").split("/")
    categories.append(href_parts[3])


# Trim and normalize the categories and names.

# Categories: turn hyphens into spaces, then drop every character that is not
# an ASCII letter, a hyphen or a space.
trim_categories = [
    re.sub(r'[^A-Za-z- ]', '', slug.replace("-", " "))
    for slug in categories
]

# Names: lower-cased only.
trim_names = [title.lower() for title in names]


# Assemble both columns into a pandas DataFrame (default integer index).
columns = {'name': trim_names, "product_category": trim_categories}
scraped_dataNew = pd.DataFrame(columns)


# check if a certain book is in a certain category
def in_stock(title, topic, data=None):
    """Tell whether a book with this title is listed under this category.

    The scraped names are stored lower-cased, so string arguments are
    lower-cased before comparing; a capitalised title or category therefore
    matches the same row as its lower-case spelling.

    Parameters
    ----------
    title : str
        Book title to look for.
    topic : str
        Category the book is expected to belong to.
    data : pandas.DataFrame, optional
        Table with ``name`` and ``product_category`` columns holding
        lower-case values. Defaults to the module-level ``scraped_dataNew``.

    Returns
    -------
    bool
        ``True`` if at least one row has both the given name and category.
    """
    catalogue = scraped_dataNew if data is None else data

    def _lowered(value):
        # non-string arguments are compared unchanged (they simply will not match)
        return value.lower() if isinstance(value, str) else value

    same_name = catalogue['name'] == _lowered(title)
    same_category = catalogue['product_category'] == _lowered(topic)
    # cast so callers always get a plain Python bool
    return bool((same_name & same_category).any())
  


In [2]:
scraped_dataNew

Unnamed: 0,name,product_category
0,a light in the attic,poetry
1,tipping the velvet,historical fiction
2,soumission,fiction
3,sharp objects,mystery
4,sapiens: a brief history of humankind,history
...,...,...
995,alice in wonderland (alice's adventures in won...,classics
996,"ajin: demi-human, volume 1 (ajin: demi-human #1)",sequential art
997,a spy's devotion (the regency spies of london #1),historical fiction
998,1st to die (women's murder club #1),mystery


In [3]:
in_stock('sapiens: a brief history of humankind', 'history')

True