In [1]:
## Import our libraries

from bs4 import BeautifulSoup  ## the BeautifulSoup library for scraping from the bs4 package
import requests ## Establish website connection using the requests library
import pandas as pd
import numpy as np
import re ## RegEx for pattern matching

In [2]:
## Starting with the Main page site

# Define the URL of the main page of the site to scrape
main_site = 'https://books.toscrape.com/index.html'

# Send a GET request to fetch the main page and store the server's response
main_site_resp = requests.get(main_site)

# Parse the HTML content of the main page using BeautifulSoup for easy manipulation
main_site_soup = BeautifulSoup(main_site_resp.text, 'html.parser')

In [11]:
## Extracting Section Links

def sections(soup):
    # Find all 'a' tags within the 'ul' tag with class 'nav nav-list'
    links = soup.find('ul', class_='nav nav-list').find_all('a')

    section_links = []

    # Iterate over each 'a' tag found
    for i in links:
        # Get the value of the 'href' attribute and append to section_links
        section_links.append(i.get('href'))

    # Modify each link in section_links to replace a specific substring
    for i in range(len(section_links)):
        section_links[i] = section_links[i].replace('catalogue/category/books/', 'https://books.toscrape.com/catalogue/category/books/')

    # Return the modified list of section links
    return section_links


In [12]:
# Putting relevant section links into a variable

sl = sections(main_site_soup)[1:]
#sl

['https://books.toscrape.com/catalogue/category/books/travel_2/index.html',
 'https://books.toscrape.com/catalogue/category/books/mystery_3/index.html',
 'https://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html',
 'https://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html',
 'https://books.toscrape.com/catalogue/category/books/classics_6/index.html',
 'https://books.toscrape.com/catalogue/category/books/philosophy_7/index.html',
 'https://books.toscrape.com/catalogue/category/books/romance_8/index.html',
 'https://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html',
 'https://books.toscrape.com/catalogue/category/books/fiction_10/index.html',
 'https://books.toscrape.com/catalogue/category/books/childrens_11/index.html',
 'https://books.toscrape.com/catalogue/category/books/religion_12/index.html',
 'https://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html',
 'https://books.toscrape.com/catalogue

In [15]:
## Extracting All Page Links within a Section

def all_page_links(section_links):
    all_links = []  # Initialize an empty list to store all extracted links
    for link in section_links:  # Iterate through each link in the input section_links
        next_page_links = [link]  # Start with the current link as the first element in next_page_links list
        while True:  # Infinite loop to fetch all subsequent pages until no 'next' link is found
            response = requests.get(link)  # Send a GET request to fetch the webpage content
            soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content using BeautifulSoup

            next_page_tag = soup.find('li', class_='next')  # Find the next page link tag

            if next_page_tag is None:  # If no 'next' link is found, break out of the loop
                break

            next_page = next_page_tag.a.get('href')  # Extract the href attribute from the 'next' link tag
            link = link.rsplit('/', 1)[0] + '/' + next_page  # Construct the full URL for the next page
            next_page_links.append(link)  # Add the constructed link to next_page_links list

        all_links.extend(next_page_links)  # Extend all_links with the links from next_page_links
    return all_links  # Return the list of all extracted page links


In [19]:
# Putting relevant page links into a variable

all_pages = all_page_links(sl)

In [17]:
## Extracting Book Links from All Pages

def book_links(all_pages):
    all_books = []  # Initialize an empty list to store all book links from all pages

    # Iterate over each page URL in the provided list of all_pages
    for link in all_pages:
        # Send a GET request to the page URL to fetch its content
        book_names_resp = requests.get(link)
        
        # Parse the HTML content of the page using BeautifulSoup
        book_names_soup = BeautifulSoup(book_names_resp.text, 'html.parser')

        # Find the <ol> element with class 'row' that contains the book names
        book_links = book_names_soup.find('ol', class_='row').find_all('h3')

        books = []  # Initialize an empty list to store individual book links

        # Iterate over each <h3> element (each representing a book) found in book_links
        for i in book_links:
            # Find the <a> tag within the <h3> and retrieve the 'href' attribute (book link)
            books.append(i.find('a').get('href'))

        # Modify each book link to use full URL instead of relative path
        for i in range(len(books)):
            books[i] = books[i].replace('../../../', 'https://books.toscrape.com/catalogue/')

        # Add the list of book links from the current page to the all_books list
        all_books.extend(books)

    return all_books  # Return the complete list of book links from all pages


In [33]:
# Putting relevant book links into a variable

all_books = book_links(all_pages)

In [43]:
## Extracting Book Information into a dataframe

def book_info_r(all_books):
  # Initialize an empty DataFrame to store all book information
  all_book_info = pd.DataFrame()

  # Iterate through each book link in the list of all_books
  for link in all_books:
    # Send a GET request to the book's page and create a BeautifulSoup object
    book_page_resp = requests.get(link)
    book_page_soup = BeautifulSoup(book_page_resp.text, 'html.parser')

    # Extract keys (column headers) and values (data) from the book's table
    keys_raw = book_page_soup.find('table').find_all('th')
    value_raw = book_page_soup.find('table').find_all('td')

    # Extract the book's name and category
    name = [book_page_soup.find('div', class_="col-sm-6 product_main").find('h1').text]
    category = [i.text for i in book_page_soup.find('ul', class_="breadcrumb").find_all('a')][-1]

    # Clean up keys and values (remove extra whitespace)
    keys = [i.text.strip() for i in keys_raw]
    values = [i.text.strip() for i in value_raw]

    # Create a DataFrame for the current book's information
    book_info = pd.DataFrame([values], columns=[keys])
    
    # Add columns for book's name and category to the DataFrame
    book_info['Name'] = name
    book_info['Category'] = category

    # Concatenate the current book's DataFrame with all_book_info
    all_book_info = pd.concat([all_book_info, book_info], ignore_index=True)

  # Return the complete DataFrame containing information for all books
  return all_book_info


In [236]:
# Putting relevant book information into a variable

book_info = book_info_r(all_books)
book_info

Unnamed: 0,UPC,Product Type,Price (excl. tax),Price (incl. tax),Tax,Availability,Number of reviews,Name,Category
0,a22124811bfa8350,Books,Â£45.17,Â£45.17,Â£0.00,In stock (19 available),0,It's Only the Himalayas,Travel
1,ce60436f52c5ee68,Books,Â£49.43,Â£49.43,Â£0.00,In stock (15 available),0,Full Moon over Noahâs Ark: An Odyssey to Mou...,Travel
2,f9705c362f070608,Books,Â£48.87,Â£48.87,Â£0.00,In stock (14 available),0,See America: A Celebration of Our National Par...,Travel
3,1809259a5a5f1d8d,Books,Â£36.94,Â£36.94,Â£0.00,In stock (8 available),0,Vagabonding: An Uncommon Guide to the Art of L...,Travel
4,a94350ee74deaa07,Books,Â£37.33,Â£37.33,Â£0.00,In stock (7 available),0,Under the Tuscan Sun,Travel
...,...,...,...,...,...,...,...,...,...
995,2b5054a4192e9b06,Books,Â£52.65,Â£52.65,Â£0.00,In stock (14 available),0,Why the Right Went Wrong: Conservatism--From G...,Politics
996,3968e3fbf4695d7c,Books,Â£56.86,Â£56.86,Â£0.00,In stock (12 available),0,Equal Is Unfair: America's Misguided Fight Aga...,Politics
997,bb8245f52c7cce8f,Books,Â£36.58,Â£36.58,Â£0.00,In stock (15 available),0,Amid the Chaos,Cultural
998,88c21fcd38e2486e,Books,Â£19.19,Â£19.19,Â£0.00,In stock (15 available),0,Dark Notes,Erotica


In [237]:
# Functions to clean the dataframe

# Function to extract numbers with decimals from a column
def num_clean(column):
    return re.findall('\d+\.\d+', str(column))

# Function to clean and restructure the dataframe
def cleaning(df):

  # Clean and convert 'Price (excl. tax)', 'Price (incl. tax)', and 'Tax' columns to float
  df['Price (excl. tax)'] = df['Price (excl. tax)'].apply(num_clean).astype(float)
  df['Price (incl. tax)'] = df['Price (incl. tax)'].apply(num_clean).astype(float)
  df['Tax'] = df['Tax'].apply(num_clean).astype(float)
  
  # Convert 'Availability' column to string type
  df['Availability'] = df['Availability'].astype(str)
  
  # Convert 'Number of reviews' column to float type
  df['Number of reviews'] = df['Number of reviews'].astype(float)

  # Reorder columns in the dataframe
  df = df[['UPC', 'Name', 'Category', 'Price (excl. tax)', 'Price (incl. tax)', 'Tax', 'Availability', 'Number of reviews']]

  return df


In [238]:
# Cleaning the dataframe

book_info_c = cleaning(book_info)

In [239]:
book_info_c.head()

Unnamed: 0,UPC,Name,Category,Price (excl. tax),Price (incl. tax),Tax,Availability,Number of reviews
0,a22124811bfa8350,It's Only the Himalayas,Travel,45.17,45.17,0.0,In stock (19 available),0.0
1,ce60436f52c5ee68,Full Moon over Noahâs Ark: An Odyssey to Mou...,Travel,49.43,49.43,0.0,In stock (15 available),0.0
2,f9705c362f070608,See America: A Celebration of Our National Par...,Travel,48.87,48.87,0.0,In stock (14 available),0.0
3,1809259a5a5f1d8d,Vagabonding: An Uncommon Guide to the Art of L...,Travel,36.94,36.94,0.0,In stock (8 available),0.0
4,a94350ee74deaa07,Under the Tuscan Sun,Travel,37.33,37.33,0.0,In stock (7 available),0.0


In [240]:
# Checking Data types of the columns in the dataframe

book_info_c.dtypes

UPC                   object
Name                  object
Category              object
Price (excl. tax)    float64
Price (incl. tax)    float64
Tax                  float64
Availability          object
Number of reviews    float64
dtype: object