# ============== WEB SCRAPING BOOKS.TOSCRAPE.COM ================

In [1]:

import random
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [None]:
# Accumulates one dict per scraped book
all_books = []

# URL template for the paginated catalogue listing
base_url = "http://books.toscrape.com/catalogue/page-{}.html"

# Upper bound (seconds) on any single HTTP request; without a timeout a
# stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 10

# Maps the word found in the "star-rating" CSS classes to its numeric value.
# Built once here instead of once per book.
rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}


def _fetch_soup(session, url):
    """Fetch *url* and return its parsed HTML, or None on any failure.

    A network error (timeout, refused connection, ...) or a non-200 status
    yields None so the caller can skip the page instead of crashing.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Erreur réseau pour {url} : {exc}")
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'html.parser')


def _parse_price(text):
    """Convert a price label such as '£51.77' to a float.

    Only digits and the decimal point are kept, so the result does not
    depend on how many characters the currency prefix occupies.
    """
    return float(''.join(ch for ch in text if ch in '0123456789.'))


def _parse_category(detail_soup):
    """Return the third breadcrumb entry of a detail page, or None if absent."""
    breadcrumb = detail_soup.find('ul', class_='breadcrumb')
    if breadcrumb is None:
        return None
    crumbs = breadcrumb.find_all('li')
    if len(crumbs) < 3:
        return None
    return crumbs[2].text.strip()


def _parse_book(session, page_url, book):
    """Build the record for one <article class="product_pod"> element.

    Fetches the book's detail page to read its category; the category is
    None when that page cannot be fetched or has no usable breadcrumb.
    """
    title = book.h3.a['title']
    price = _parse_price(book.find('p', class_='price_color').text)

    # Rating (One, Two, Three, Four, Five): look for the rating word among
    # all classes of the tag rather than relying on its position.
    rating_tag = book.find('p', class_='star-rating')
    rating_classes = rating_tag.get('class', []) if rating_tag is not None else []
    rating = next((rating_map[c] for c in rating_classes if c in rating_map), 0)

    # Availability: match on the "availability" class alone so a tag that is
    # not also "instock" gives in_stock=False instead of an AttributeError.
    availability = book.find('p', class_='availability')
    in_stock = availability is not None and "In stock" in availability.text

    # Detail link, resolved relative to the listing page it was found on
    book_url = urljoin(page_url, book.h3.a['href'])

    # Category is only available on the detail page
    detail_soup = _fetch_soup(session, book_url)
    category = _parse_category(detail_soup) if detail_soup is not None else None
    if category is None:
        print("Catégorie introuvable pour", book_url)

    return {
        'Title': title,
        'Price (£)': price,
        'Rating (1-5)': rating,
        'In Stock': in_stock,
        'Category': category,
        'URL': book_url,
    }


# One session for the whole run so the TCP connection is reused across requests
with requests.Session() as session:
    # The catalogue has 50 pages
    for page in range(1, 51):
        print(f"Scraping page {page}/50...")
        url = base_url.format(page)

        soup = _fetch_soup(session, url)
        if soup is None:
            print("Erreur de connexion à la page", page)
            continue

        # Every book listed on this page
        for book in soup.find_all('article', class_='product_pod'):
            all_books.append(_parse_book(session, url, book))

        # Politeness: short random pause so the server is not overloaded
        time.sleep(random.uniform(1, 2))

# Convert the collected records to a DataFrame
df = pd.DataFrame(all_books)
print(f"Scraping terminé ! {len(df)} livres collectés.")

# Make sure the output directory exists before writing: otherwise the
# first write raises OSError and the results of the whole scrape are lost.
output_dir = Path('../data')
output_dir.mkdir(parents=True, exist_ok=True)

# Save the same records as CSV and as a JSON array of objects
df.to_csv(output_dir / 'books_toscrape_complete.csv', index=False)
df.to_json(output_dir / 'books_toscrape_complete.json', orient='records', indent=4)

print("Fichiers sauvegardés : books_toscrape_complete.csv et .json")

Scraping page 1/50...
Scraping page 2/50...
Scraping page 3/50...
Scraping page 4/50...
Scraping page 5/50...
Scraping page 6/50...
Scraping page 7/50...
Scraping page 8/50...
Scraping page 9/50...
Scraping page 10/50...
Scraping page 11/50...
Scraping page 12/50...
Scraping page 13/50...
Scraping page 14/50...
Scraping page 15/50...
Scraping page 16/50...
Scraping page 17/50...
Scraping page 18/50...
Scraping page 19/50...
Scraping page 20/50...
Scraping page 21/50...
Scraping page 22/50...
Scraping page 23/50...
Scraping page 24/50...
Scraping page 25/50...
Scraping page 26/50...
Scraping page 27/50...
Scraping page 28/50...
Scraping page 29/50...
Scraping page 30/50...
Scraping page 31/50...
Scraping page 32/50...
Scraping page 33/50...
Scraping page 34/50...
Scraping page 35/50...
Scraping page 36/50...
Scraping page 37/50...
Scraping page 38/50...
Scraping page 39/50...
Scraping page 40/50...
Scraping page 41/50...
Scraping page 42/50...
Scraping page 43/50...
Scraping page 44/50.