# TASK 1(a): WEB SCRAPING 

Requirements:
Identify target website & inspect structure
Use requests + BeautifulSoup
Extract structured data
Handle pagination

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
# Base URL of the site to be scraped 
BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html"

In [3]:
# List to store extracted book data
books = []

In [4]:
# Scraping the first page

url = "http://books.toscrape.com/catalogue/page-1.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all("article", class_="product_pod")

for item in items:
    title = item.h3.a["title"]
    price = item.find("p", class_="price_color").text
    availability = item.find("p", class_="instock availability").text.strip()
    rating = item.p["class"][1]
    
    books.append({
        "title": title,
        "price": price,
        "availability": availability,
        "rating": rating
    })

In [5]:
# Pagination: Looping through to find and scrape all book items (page 1 to page 50)

items = soup.find_all("article", class_="product_pod")

for page in range(2, 51):
    url = f"http://books.toscrape.com/catalogue/page-{page}.html"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all("article", class_="product_pod")
    if not items:
        break
    
    for item in items:
        title = item.h3.a["title"]
        price = item.find("p", class_="price_color").text
        availability = item.find("p", class_="instock availability").text.strip()
        rating = item.p["class"][1]
        
        books.append({
            "title": title,
            "price": price,
            "availability": availability,
            "rating": rating
        })

In [6]:
# Previewing first few results
print(f"Total books collected: {len(books)}")
print(books[:5])

Total books collected: 1000
[{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock', 'rating': 'Three'}, {'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'availability': 'In stock', 'rating': 'One'}, {'title': 'Soumission', 'price': 'Â£50.10', 'availability': 'In stock', 'rating': 'One'}, {'title': 'Sharp Objects', 'price': 'Â£47.82', 'availability': 'In stock', 'rating': 'Four'}, {'title': 'Sapiens: A Brief History of Humankind', 'price': 'Â£54.23', 'availability': 'In stock', 'rating': 'Five'}]


In [7]:
# Converting to and saving as DataFrame
df = pd.DataFrame(books) 
df.to_csv("books_data.csv", index=False) 
print(df.shape)

(1000, 4)


In [8]:
df.head()

Unnamed: 0,title,price,availability,rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


In [9]:
df.shape

(1000, 4)