# Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time

# Get URL and send GET request

In [2]:
# Fetch the site's landing page. `url` and `response` are reused by the cells below.
url = "http://books.toscrape.com/"
# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(url, timeout=10)
if response.status_code == 200:
    print("Request successful")
else:
    print("Request failed")

Request successful


# Parse HTML Content

In [3]:
# Parse the raw bytes rather than response.text, as the later cells do, so that
# BeautifulSoup works out the character encoding from the document itself.
soup = BeautifulSoup(response.content, "html.parser")
# Preview only the start of the document instead of dumping the whole page.
print(soup.prettify()[:1000])

# Extracting Page 1 Content

### Since the complete information for each book is only available on the book's own page, we first need to follow each book's link and then extract the information

In [None]:
# Find every book on page 1. Each book's link is the <a> inside an <h3> element.

books = soup.find_all("h3")

start_time = time.time()
books_extracted = 0

# Visit each book's detail page and pull out its fields.

for book in books:
    book_url = book.find("a")["href"]
    # The timeout stops a stalled connection from hanging the cell, and
    # raise_for_status() reports an HTTP error here instead of letting it show
    # up later as an AttributeError when the expected tags are missing.
    book_response = requests.get(url + book_url, timeout=10)
    book_response.raise_for_status()
    book_soup = BeautifulSoup(book_response.content, "html.parser")

    title = book_soup.find("h1").text
    # The third breadcrumb link (index 2) is read as the category.
    category = book_soup.find("ul", class_="breadcrumb").find_all("a")[2].text.strip()
    # The rating is the second CSS class on the star-rating <p> (e.g. "Three").
    rating = book_soup.find("p", class_="star-rating")["class"][1]
    price = book_soup.find("p", class_="price_color").text.strip()
    availability = book_soup.find("p", class_="availability").text.strip()

    books_extracted += 1

    print(f'Title: {title}')
    print(f'category: {category}')
    print(f'rating: {rating}')
    print(f'price: {price}')
    print(f'availability: {availability}')
    print("***********")

# Measure the elapsed time once, after the loop, and report it together with
# the number of books that were processed.
end_time = time.time()
total_time = (end_time - start_time) / 60
print(f'{books_extracted} books extracted in {total_time:.2f} minutes')

# Extracting all pages

In [None]:
# Scrape every catalogue page and collect one row per book in `books_data`.

CATALOGUE_URL = 'http://books.toscrape.com/catalogue/'
TOTAL_PAGES = 50       # pages page-1.html through page-50.html are requested
REQUEST_TIMEOUT = 10   # seconds to wait on each request before giving up


def _extract_book_fields(book_soup):
    """Return [title, category, rating, price, availability] from a parsed book page."""
    title = book_soup.find("h1").text
    # The third breadcrumb link (index 2) is read as the category.
    category = book_soup.find("ul", class_="breadcrumb").find_all("a")[2].text.strip()
    # The rating is the second CSS class on the star-rating <p>.
    rating = book_soup.find("p", class_="star-rating")["class"][1]
    price = book_soup.find("p", class_="price_color").text.strip()
    availability = book_soup.find("p", class_="availability").text.strip()
    return [title, category, rating, price, availability]


books_data = []

# Start the clock once, before the first page, so the reported figure is the
# total for the whole crawl instead of restarting on every page.
start_time = time.time()
books_extracted = 0

# One Session is shared by all requests so connections to the host can be reused.
with requests.Session() as session:
    for page_num in range(1, TOTAL_PAGES + 1):
        # Page-level names are kept distinct from `url`/`response`/`soup`/`books`
        # so the cells above still work if they are re-run after this one.
        page_url = f'{CATALOGUE_URL}page-{page_num}.html'
        page_response = session.get(page_url, timeout=REQUEST_TIMEOUT)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.content, 'html.parser')

        for book in page_soup.find_all("h3"):
            # Links on a catalogue page are joined onto the catalogue base URL.
            book_url = book.find("a")["href"]
            book_response = session.get(CATALOGUE_URL + book_url, timeout=REQUEST_TIMEOUT)
            book_response.raise_for_status()
            book_soup = BeautifulSoup(book_response.content, "html.parser")

            books_data.append(_extract_book_fields(book_soup))
            books_extracted += 1

        # Report progress once per page, using the real running count, instead
        # of printing the whole accumulated list after every single book.
        end_time = time.time()
        total_time = (end_time - start_time) / 60
        print("************")
        print(f'Total time taken: {total_time} minutes')
        print("************")
        print(f'{books_extracted} books extracted so far...')
        
# Adding information to the list

# Exporting Data

In [8]:
# Build a DataFrame from the scraped rows, one labelled column per field.
df = pd.DataFrame(
    data=books_data,
    columns=["Title", "Category", "Rating", "Price", "Availability"],
)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,Title,Category,Rating,Price,Availability
0,A Light in the Attic,Poetry,Three,£51.77,In stock (22 available)
1,Tipping the Velvet,Historical Fiction,One,£53.74,In stock (20 available)
2,Soumission,Fiction,One,£50.10,In stock (20 available)
3,Sharp Objects,Mystery,Four,£47.82,In stock (20 available)
4,Sapiens: A Brief History of Humankind,History,Five,£54.23,In stock (20 available)
5,The Requiem Red,Young Adult,One,£22.65,In stock (19 available)
6,The Dirty Little Secrets of Getting Your Dream...,Business,Four,£33.34,In stock (19 available)
7,The Coming Woman: A Novel Based on the Life of...,Default,Three,£17.93,In stock (19 available)
8,The Boys in the Boat: Nine Americans and Their...,Default,Four,£22.60,In stock (19 available)
9,The Black Maria,Poetry,One,£52.15,In stock (19 available)


# Creating a csv file

In [9]:
# Write the DataFrame to a CSV file (relative path), leaving out the row index.
df.to_csv(path_or_buf="books_scraped.csv", index=False)
print("Data saved into csv file")

Data saved into csv file
