In [1]:
import csv
import requests
from bs4 import BeautifulSoup

In [2]:
url = "http://books.toscrape.com/catalogue/category/books_1/index.html"
response = requests.get(url)
response

<Response [200]>

In [3]:
soup = BeautifulSoup(response.text, "html.parser")

In [4]:
# Find all <a> tags within the specified <div>
links = soup.find('div', class_='side_categories').find_all('a')

# Extract the href attributes from the <a> tags
links_list = [link.get('href') for link in links]
links_list = links_list[1:-1]

links_list[:2]

['../books/travel_2/index.html', '../books/mystery_3/index.html']

In [5]:
books_categories_links_list = []

for link in links_list:
    # Replace the initial part of the URL
    new_link = link.replace('../', 'http://books.toscrape.com/catalogue/category/')
    # Append the modified link to the new list
    books_categories_links_list.append(new_link)
    
books_categories_links_list[:2]

['http://books.toscrape.com/catalogue/category/books/travel_2/index.html',
 'http://books.toscrape.com/catalogue/category/books/mystery_3/index.html']

In [6]:
books_names_lst = []

for url in books_categories_links_list:
    books_names_lst.append(url.split('http://books.toscrape.com/catalogue/category/books/')[1].split('_')[0])
    
print(books_names_lst)

['travel', 'mystery', 'historical-fiction', 'sequential-art', 'classics', 'philosophy', 'romance', 'womens-fiction', 'fiction', 'childrens', 'religion', 'nonfiction', 'music', 'default', 'science-fiction', 'sports-and-games', 'add-a-comment', 'fantasy', 'new-adult', 'young-adult', 'science', 'poetry', 'paranormal', 'art', 'psychology', 'autobiography', 'parenting', 'adult-fiction', 'humor', 'horror', 'history', 'food-and-drink', 'christian-fiction', 'business', 'biography', 'thriller', 'contemporary', 'spirituality', 'academic', 'self-help', 'historical', 'christian', 'suspense', 'short-stories', 'novels', 'health', 'politics', 'cultural', 'erotica']


In [7]:
def get_books_titles(soup):
    books_titles = soup.find_all('h3')
    books_titles_lst = []

    for book_title in books_titles:
        book_title = book_title.find('a').attrs['title']
        books_titles_lst.append(book_title)
    
    return books_titles_lst

In [8]:
def get_books_prices(soup):
    prices = soup.find_all('p', attrs={"class": "price_color"})
    price_lst = []

    for price in prices:
        price = price.get_text()
        price = float(price.replace('Â£', ''))
        price_lst.append(price)
    
    return price_lst

In [9]:
def get_books_ratings(soup):
    books_rating = soup.find_all('p', attrs={"class": 'star-rating'})
    books_rating_lst = []
    books_rating_dict = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

    for book_rating in books_rating:
        book_rating = book_rating.attrs['class'][1]
        book_rating = books_rating_dict[book_rating]
        books_rating_lst.append(book_rating)
    
    return books_rating_lst

In [10]:
csv_file = r"G:\books scraping\categories\Books Scraping.csv"

# Write the CSV header only once before the loop
with open(csv_file, 'w', encoding="utf-8", newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['Title', 'Price', 'Rate', 'Category'])
    writer.writeheader()

# Iterate over each URL and name simultaneously
for url, name in zip(books_categories_links_list, books_names_lst):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    books_titles_lst = get_books_titles(soup)
    price_lst = get_books_prices(soup)
    books_rating_lst = get_books_ratings(soup)
    
    # Append data to the CSV file inside the loop
    with open(csv_file, 'a', encoding="utf-8", newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['Title', 'Price', 'Rate', 'Category'])

        for book_title, price, rate in zip(books_titles_lst, price_lst, books_rating_lst):
            writer.writerow({"Title": book_title, 'Price': price, 'Rate': rate, 'Category': name})