In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from urllib.parse import urljoin
from datetime import datetime
from os import name


In [2]:
homepage_url = "http://books.toscrape.com/"
homepage_response = requests.get(homepage_url)
homepage_response.encoding = 'utf-8' #per ruajtjen e cmimit
homepage_soup = BeautifulSoup(homepage_response.text, 'html.parser')

In [3]:
# getting a category list with the category name and the corresponding link

def get_categories():
  response = requests.get(homepage_url)
  response.encoding = 'utf-8'
  soup = BeautifulSoup(response.text, 'html.parser')
  categories_tags = soup.select_one('div.side_categories > ul > li > ul').find_all('li', recursive = False)
  categories = {}
  for tag in categories_tags:
    name = tag.text.strip()
    link = urljoin(homepage_url, tag.find('a')['href'])
    categories[name] = link
  return categories

In [18]:
books={}

#get all books from all pages of a specific category
def scrape_category(category_name, start_url):
  current_url = start_url

  while current_url:
    response = requests.get(current_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    for article in soup.find_all('article', class_='product_pod'):
      title = article.h3.a['title']
      #if book is already scraped, then we add the other category
      if title in books:
        books[title]['Categories'].add(category_name)
      else:
        price = article.find('p', class_='price_color').text
        clean_price = float(re.findall(r'[0-9.]+', price)[0])
        #adds all books of the current page
        books[title] = {
            'Title': title,
            'Price (GBP)': clean_price,
            'Categories': {category_name}
        }
# also checking for pagination, if there's other pages, we scrape those too
    next_button = soup.find('li', class_='next')
    if next_button:
      relative_link = next_button.find('a')['href']
      current_url = urljoin(current_url, relative_link)
    else:
      current_url = None

  return books

all_categories = get_categories()
for category_name, link in all_categories.items():
  print(f"Scraping category: {category_name}")
  scrape_category(category_name, link)

Scraping category: Travel
Scraping category: Mystery
Scraping category: Historical Fiction
Scraping category: Sequential Art
Scraping category: Classics
Scraping category: Philosophy
Scraping category: Romance
Scraping category: Womens Fiction
Scraping category: Fiction
Scraping category: Childrens
Scraping category: Religion
Scraping category: Nonfiction
Scraping category: Music
Scraping category: Default
Scraping category: Science Fiction
Scraping category: Sports and Games
Scraping category: Add a comment
Scraping category: Fantasy
Scraping category: New Adult
Scraping category: Young Adult
Scraping category: Science
Scraping category: Poetry
Scraping category: Paranormal
Scraping category: Art
Scraping category: Psychology
Scraping category: Autobiography
Scraping category: Parenting
Scraping category: Adult Fiction
Scraping category: Humor
Scraping category: Horror
Scraping category: History
Scraping category: Food and Drink
Scraping category: Christian Fiction
Scraping category: 

In [19]:
def get_exchange_rate(api_key, base_currency, target_currency):
  url = f"https://v6.exchangerate-api.com/v6/{api_key}/latest/{base_currency}"
  response = requests.get(url)
  data = response.json()

  if data["result"] == "success":
    return data["conversion_rates"][target_currency]
  else:
    print("Error fetching API data!")
    return None

API_KEY = ""

rate = get_exchange_rate(API_KEY, "GBP", "EUR")

print("Exchange Rate GBP->EUR: ", rate)


Exchange Rate GBP->EUR:  1.151


In [21]:
final_list = list(books.values())
for item in final_list:
  item['Categories'] = ', '.join(sorted(item['Categories']))

df = pd.DataFrame(final_list)

#shtohet kolona per ID
df.insert(0, 'ID', range(1, 1 + len(df)))

#shtohet kolona per cmimin ne EUR
df['Price (EUR)'] = (df['Price (GBP)'] * rate).round(2)

#shtohet kolona per daten e kembimit
df['Exchange Date'] = datetime.now().strftime("%Y-%m-%d")

#krijohet file .csv
df.to_csv('books_data.csv', index=False, encoding='utf-8')
print(df)

      ID                                              Title  Price (GBP)  \
0      1                            It's Only the Himalayas        45.17   
1      2  Full Moon over Noahâ€™s Ark: An Odyssey to Mount...        49.43   
2      3  See America: A Celebration of Our National Par...        48.87   
3      4  Vagabonding: An Uncommon Guide to the Art of L...        36.94   
4      5                               Under the Tuscan Sun        37.33   
..   ...                                                ...          ...   
994  995  Why the Right Went Wrong: Conservatism--From G...        52.65   
995  996  Equal Is Unfair: America's Misguided Fight Aga...        56.86   
996  997                                     Amid the Chaos        36.58   
997  998                                         Dark Notes        19.19   
998  999  The Long Shadow of Small Ghosts: Murder and Me...        10.97   

                Categories  Price (EUR) Exchange Date  
0    Test Category, Travel   

In [20]:
# Manual test: Add 'Test Category' to the first book found
first_book_title = list(books.keys())[0]
books[first_book_title]['Categories'].add('Test Category')