In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
url = 'https://books.toscrape.com/catalogue/category/books/travel_2/index.html'
response = requests.get(url)
response.status_code

200

In [22]:
soup = BeautifulSoup(response.content, 'lxml')

In [28]:
names = soup.find_all('h3')
names_lst = [name.a['title'] for name in names]
names_lst

["It's Only the Himalayas",
 'Full Moon over Noah’s Ark: An Odyssey to Mount Ararat and Beyond',
 'See America: A Celebration of Our National Parks & Treasured Sites',
 'Vagabonding: An Uncommon Guide to the Art of Long-Term World Travel',
 'Under the Tuscan Sun',
 'A Summer In Europe',
 'The Great Railway Bazaar',
 'A Year in Provence (Provence #1)',
 'The Road to Little Dribbling: Adventures of an American in Britain (Notes From a Small Island #2)',
 'Neither Here nor There: Travels in Europe',
 '1,000 Places to See Before You Die']

In [38]:
prices = soup.find_all('p', 'price_color')
prices_lst = [price.text.strip('£') for price in prices]
prices_lst

['45.17',
 '49.43',
 '48.87',
 '36.94',
 '37.33',
 '44.34',
 '30.54',
 '56.88',
 '23.21',
 '38.95',
 '26.08']

In [42]:
books_dict = {
    'Book Name': names_lst,
    'Price': prices_lst
}

books_df = pd.DataFrame(books_dict)
books_df

Unnamed: 0,Book Name,Price
0,It's Only the Himalayas,45.17
1,Full Moon over Noah’s Ark: An Odyssey to Mount...,49.43
2,See America: A Celebration of Our National Par...,48.87
3,Vagabonding: An Uncommon Guide to the Art of L...,36.94
4,Under the Tuscan Sun,37.33
5,A Summer In Europe,44.34
6,The Great Railway Bazaar,30.54
7,A Year in Provence (Provence #1),56.88
8,The Road to Little Dribbling: Adventures of an...,23.21
9,Neither Here nor There: Travels in Europe,38.95


In [66]:
num_of_books = int(soup.find_all('strong')[1].text)
num_of_pages = num_of_books // 20 + num_of_books % 20
num_of_pages

11

In [88]:
ratings = soup.find_all('p', 'star-rating')
ratings_lst =  [rating['class'][1] for rating in ratings]
ratings_lst

['Two',
 'Four',
 'Three',
 'Two',
 'Three',
 'Two',
 'One',
 'Four',
 'One',
 'Three',
 'Five']

In [93]:
category = 'sequential-art_5'
for page in range(1, 5):
  page_url = f'https://books.toscrape.com/catalogue/category/books/{category}/page-{page}.html'
  response = requests.get(page_url)
  soup = BeautifulSoup(response.content, 'lxml')

  names = soup.find_all('h3')
  names_lst = [name.a['title'] for name in names]

  prices = soup.find_all('p', 'price_color')
  prices_lst = [price.text.strip('£') for price in prices]

  ratings = soup.find_all('p', 'star-rating')
  ratings_lst =  [rating['class'][1] for rating in ratings]
  rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
  ratings_lst =  [rating_map[rating] for rating in ratings_lst]

  books_dict = {
      'Book Name': names_lst,
      'Price': prices_lst,
      'Rating': ratings_lst
  }

  books_df = pd.DataFrame(books_dict)

  if page == 1:
    all_books_df = books_df
  else:
    all_books_df = pd.concat([all_books_df, books_df], ignore_index=True)

In [96]:
all_books_df.head()

Unnamed: 0,Book Name,Price,Rating
0,Scott Pilgrim's Precious Little Life (Scott Pi...,52.29,5
1,Tsubasa: WoRLD CHRoNiCLE 2 (Tsubasa WoRLD CHRo...,16.28,1
2,This One Summer,19.49,4
3,The Nameless City (The Nameless City #1),38.16,4
4,"Saga, Volume 5 (Saga (Collected Editions) #5)",51.04,2


In [97]:
def scrape_books(category):
  url = 'https://books.toscrape.com/catalogue/category/books/travel_2/index.html'
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'lxml')

  num_of_books = int(soup.find_all('strong')[1].text)
  num_of_pages = num_of_books // 20 + num_of_books % 20

  all_books_df = pd.DataFrame()

  for page in range(1, 5):
    page_url = f'https://books.toscrape.com/catalogue/category/books/{category}/page-{page}.html'
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'lxml')

    names = soup.find_all('h3')
    names_lst = [name.a['title'] for name in names]

    prices = soup.find_all('p', 'price_color')
    prices_lst = [price.text.strip('£') for price in prices]

    ratings = soup.find_all('p', 'star-rating')
    ratings_lst =  [rating['class'][1] for rating in ratings]
    rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    ratings_lst =  [rating_map[rating] for rating in ratings_lst]

    books_dict = {
        'Book Name': names_lst,
        'Price': prices_lst,
        'Rating': ratings_lst
    }

    books_df = pd.DataFrame(books_dict)

    all_books_df = pd.concat([all_books_df, books_df], ignore_index=True)

  return all_books_df

In [98]:
scrape_books('sequential-art_5')

Unnamed: 0,Book Name,Price,Rating
0,Scott Pilgrim's Precious Little Life (Scott Pi...,52.29,5
1,Tsubasa: WoRLD CHRoNiCLE 2 (Tsubasa WoRLD CHRo...,16.28,1
2,This One Summer,19.49,4
3,The Nameless City (The Nameless City #1),38.16,4
4,"Saga, Volume 5 (Saga (Collected Editions) #5)",51.04,2
...,...,...,...
70,"Hawkeye, Vol. 1: My Life as a Weapon (Hawkeye #1)",45.24,3
71,"Giant Days, Vol. 1 (Giant Days #1-4)",56.76,4
72,"Fruits Basket, Vol. 1 (Fruits Basket #1)",40.28,5
73,"Bleach, Vol. 1: Strawberry and the Soul Reaper...",34.65,5


In [99]:
url = 'https://books.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

In [122]:
categories = soup.find('ul', class_='nav nav-list').find('ul').find_all('li')
categories_cleaned = [category.a.text.strip() for category in categories]
categories_simplified = [category.lower().replace(' ', '-') + '_' + str(i+2) for i, category in enumerate(categories_cleaned)]
{k: v for k, v in zip(categories_cleaned, categories_simplified)}

{'Travel': 'travel_2',
 'Mystery': 'mystery_3',
 'Historical Fiction': 'historical-fiction_4',
 'Sequential Art': 'sequential-art_5',
 'Classics': 'classics_6',
 'Philosophy': 'philosophy_7',
 'Romance': 'romance_8',
 'Womens Fiction': 'womens-fiction_9',
 'Fiction': 'fiction_10',
 'Childrens': 'childrens_11',
 'Religion': 'religion_12',
 'Nonfiction': 'nonfiction_13',
 'Music': 'music_14',
 'Default': 'default_15',
 'Science Fiction': 'science-fiction_16',
 'Sports and Games': 'sports-and-games_17',
 'Add a comment': 'add-a-comment_18',
 'Fantasy': 'fantasy_19',
 'New Adult': 'new-adult_20',
 'Young Adult': 'young-adult_21',
 'Science': 'science_22',
 'Poetry': 'poetry_23',
 'Paranormal': 'paranormal_24',
 'Art': 'art_25',
 'Psychology': 'psychology_26',
 'Autobiography': 'autobiography_27',
 'Parenting': 'parenting_28',
 'Adult Fiction': 'adult-fiction_29',
 'Humor': 'humor_30',
 'Horror': 'horror_31',
 'History': 'history_32',
 'Food and Drink': 'food-and-drink_33',
 'Christian Fict

In [125]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_categories():
    base_url = 'https://books.toscrape.com/'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'lxml')
    categories = soup.find('ul', class_='nav nav-list').find('ul').find_all('li')
    categories_cleaned = [category.a.text.strip() for category in categories]
    categories_simplified = [category.lower().replace(' ', '-') + '_' + str(i+2) for i, category in enumerate(categories_cleaned)]
    return {k: v for k, v in zip(categories_cleaned, categories_simplified)}

def get_category_url(category):
    base_url = 'https://books.toscrape.com/catalogue/category/books/'
    category_key = get_categories().get(category)
    if category_key:
        return f'{base_url}{category_key}/index.html'
    else:
        raise ValueError(f'Category "{category}" not found in the dictionary.')

def scrape_books(category):
    try:
        url = get_category_url(category)
    except ValueError as e:
        print(e)
        return None
    else:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'lxml')

        num_of_books = int(soup.find_all('strong')[1].text)
        num_of_pages = num_of_books // 20 + num_of_books % 20

        all_books_df = pd.DataFrame()

        for page in range(1, num_of_pages + 1):
            if page == 1:
                page_url = url
            else:
                page_url = f'{url[:-10]}page-{page}.html'

            response = requests.get(page_url)
            soup = BeautifulSoup(response.content, 'lxml')

            names = soup.find_all('h3')
            names_lst = [name.a['title'] for name in names]

            prices = soup.find_all('p', 'price_color')
            prices_lst = [price.text.strip('£') for price in prices]

            ratings = soup.find_all('p', 'star-rating')
            ratings_lst =  [rating['class'][1] for rating in ratings]
            rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
            ratings_lst =  [rating_map[rating] for rating in ratings_lst]

            books_dict = {
                'Book Name': names_lst,
                'Price': prices_lst,
                'Rating': ratings_lst
            }

            books_df = pd.DataFrame(books_dict)

            all_books_df = pd.concat([all_books_df, books_df], ignore_index=True)

        return all_books_df

In [128]:
scrape_books('Default')

Unnamed: 0,Book Name,Price,Rating
0,The Coming Woman: A Novel Based on the Life of...,17.93,3.0
1,The Boys in the Boat: Nine Americans and Their...,22.60,4.0
2,"Starving Hearts (Triangular Trade Trilogy, #1)",13.99,2.0
3,America's Cradle of Quarterbacks: Western Penn...,22.50,3.0
4,Aladdin and His Wonderful Lamp,53.13,3.0
...,...,...,...
147,Shatter Me (Shatter Me #1),42.40,1.0
148,Paradise Lost (Paradise #1),24.96,1.0
149,On the Road (Duluoz Legend),32.36,3.0
150,Jane Eyre,38.43,5.0
