In [1]:
# import pandas for data storage
import pandas as pd

# import requests to fetch the webpage
import requests

# import BeautifulSoup for parsing HTML
from bs4 import BeautifulSoup

# import numpy for handling missing data
import numpy as np

# import os to work with files and folder on the computer
import os

In [2]:
# create a header to tell the website that the request is coming from a real web browser
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
}

# send a request to the website to get the HTML content of page 1
response = requests.get("https://books.toscrape.com/catalogue/page-1.html", headers=headers)

# display the HTML content returned by the website
response.text



In [3]:
# convert the HTML text from the website into a BeautifulSoup object so that we can easily search and extract data from it
soup = BeautifulSoup(response.text)

In [4]:
# create an empty list to store the extracted data
data = []

In [5]:
# loop through page numbers from 1 to 50
for i in range(1, 51):

    # download the HTML content of each page
    response = requests.get(
        f"https://books.toscrape.com/catalogue/page-{i}.html",
        headers=headers
    )

    # parse the page HTML using BeautifulSoup
    soup = BeautifulSoup(response.text)

    # print the current page number
    print("Page", i)

    # select all book blocks on the page
    books_articles = soup.select("article.product_pod")

    # loop through each book block
    for book_article in books_articles:

        # get the link of the individual book page
        link = book_article.select_one(".image_container a").get("href")

        # download the book detail page
        response = requests.get(
            f"https://books.toscrape.com/catalogue/{link}",
            headers=headers
        )

        # parse the book page HTML
        soup_book = BeautifulSoup(response.text)

        # get the book thumbnail image link
        thumbnail = soup_book.select_one(".thumbnail img").get("src")

        # convert relative image URL to full URL
        thumbnail = os.path.join(
            "https://books.toscrape.com/",
            thumbnail[6:]
        )

        # get the book title
        title = soup_book.select_one(".product_main h1").text

        # get the book description
        description = soup_book.select_one("#product_description + p").text

        # get the book UPC code
        upc = soup_book.select_one(".table td").text

        # store all book details in the list as a dictionary
        data.append({
            "thumbnail": thumbnail,
            "title": title,
            "description": description,
            "upc": upc
        })

        # print the book title with indentation
        print("\t", title)

Page 1
	 A Light in the Attic
	 Tipping the Velvet
	 Soumission
	 Sharp Objects
	 Sapiens: A Brief History of Humankind
	 The Requiem Red
	 The Dirty Little Secrets of Getting Your Dream Job
	 The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
	 The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
	 The Black Maria
	 Starving Hearts (Triangular Trade Trilogy, #1)
	 Shakespeare's Sonnets
	 Set Me Free
	 Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
	 Rip it Up and Start Again
	 Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
	 Olio
	 Mesaerion: The Best Science Fiction Stories 1800-1849
	 Libertarianism for Beginners
	 It's Only the Himalayas
Page 2
	 In Her Wake
	 How Music Works
	 Foolproof Preserving: A Guide to Small Batch Jams, Jellies, Pickles, Condiments, and More: A Foolproof Guide to Making Small Batch Jams, Jellies, Pickles, Condiments, and More
	 Chase M

AttributeError: 'NoneType' object has no attribute 'text'

In [6]:
# convert the list of scraped data into a pandas DataFrame (table format)
df = pd.DataFrame(data)

In [7]:
# display the first 5 rows of the DataFrame to quickly check the data
df.head()

Unnamed: 0,thumbnail,title,description,upc
0,https://books.toscrape.com/media/cache/fe/72/f...,A Light in the Attic,It's hard to imagine a world without A Light i...,a897fe39b1053632
1,https://books.toscrape.com/media/cache/08/e9/0...,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",90fa61229261140a
2,https://books.toscrape.com/media/cache/ee/cf/e...,Soumission,"Dans une France assez proche de la nÃ´tre, un ...",6957f44c3847a760
3,https://books.toscrape.com/media/cache/c0/59/c...,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",e00eb4fd7b871a48
4,https://books.toscrape.com/media/cache/ce/5f/c...,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,4165285e1663650f


In [8]:
# display summary information about the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   thumbnail    160 non-null    object
 1   title        160 non-null    object
 2   description  160 non-null    object
 3   upc          160 non-null    object
dtypes: object(4)
memory usage: 5.1+ KB


In [9]:
# save the DataFrame as a CSV file named "books.csv"
df.to_csv("books.csv")