# Démo - scraping

## Imports

In [1]:
from bs4 import BeautifulSoup  # parser de HTML/XML
import requests  # pour la query HTTP
import pandas as pd # analyse de données, sauvegarde au format excel/CSV/JSON/Parquet etc. (on pourrait n'utiliser que la lib csv)

In [2]:
URL = "https://books.toscrape.com/"  # stockée dans une constante

# TO DO: visiter le site ci-dessus, regarder son code

## Récupérer les noms des catégories (colonne de gauche)

In [3]:
def get_categories() -> None:  # type hint
    """Capture les noms de catégories de la colonne de gauche"""
    r = requests.get(URL)
    if (rsc := r.status_code) != 200:
        print(f"Erreur, code {rsc}")
    else:
        html = r.content
        soup = BeautifulSoup(html, 'html.parser')
        categories = []
        for elem in soup.find_all('a'):
            if elem.get('href', '').startswith('catalogue/category/books/'):
                category = elem.text.strip()
                categories.append(category)
        print(f"{len(categories)} categories found:\n {categories}")

In [4]:
get_categories.__doc__

'Capture les noms de catégories de la colonne de gauche'

In [5]:
get_categories()

50 categories found:
 ['Travel', 'Mystery', 'Historical Fiction', 'Sequential Art', 'Classics', 'Philosophy', 'Romance', 'Womens Fiction', 'Fiction', 'Childrens', 'Religion', 'Nonfiction', 'Music', 'Default', 'Science Fiction', 'Sports and Games', 'Add a comment', 'Fantasy', 'New Adult', 'Young Adult', 'Science', 'Poetry', 'Paranormal', 'Art', 'Psychology', 'Autobiography', 'Parenting', 'Adult Fiction', 'Humor', 'Horror', 'History', 'Food and Drink', 'Christian Fiction', 'Business', 'Biography', 'Thriller', 'Contemporary', 'Spirituality', 'Academic', 'Self Help', 'Historical', 'Christian', 'Suspense', 'Short Stories', 'Novels', 'Health', 'Politics', 'Cultural', 'Erotica', 'Crime']


## Récupérer les titres des livres

In [20]:
def get_titles() -> None:
    """Capture les titres de livres"""
    r = requests.get(URL)
    if (rsc := r.status_code) != 200:
        print(f"Erreur, code {rsc}")
    else:
        html = r.content
        soup = BeautifulSoup(html, 'html.parser')
        titles = []
        for elem in soup.find_all('h3'):
            try:
                if elem.a.get('href', '').startswith('catalogue/'):
                    titles.append(elem.a.get('title','').strip())
            except AttributeError:
                continue
        print(f"{len(titles)} titles found:\n {titles}")

In [21]:
get_titles()

20 titles found:
 ['A Light in the Attic', 'Tipping the Velvet', 'Soumission', 'Sharp Objects', 'Sapiens: A Brief History of Humankind', 'The Requiem Red', 'The Dirty Little Secrets of Getting Your Dream Job', 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'The Black Maria', 'Starving Hearts (Triangular Trade Trilogy, #1)', "Shakespeare's Sonnets", 'Set Me Free', "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", 'Rip it Up and Start Again', 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991', 'Olio', 'Mesaerion: The Best Science Fiction Stories 1800-1849', 'Libertarianism for Beginners', "It's Only the Himalayas"]


## Récupérer les titres des livres **des 3 premières pages**

In [25]:
from pprint import pprint

def get_titles_multipage(page_start: int, page_end: int) -> None:
    """Capture les titres de livres de la page `page_start` à la page `page_end`"""

    # on remarque un pattern d'URL: https://books.toscrape.com/catalogue/page-2.html
    URL = 'https://books.toscrape.com/catalogue/page-{}.html'
    titles = {nb: [] for nb in range(page_start, page_end + 1)}  # or use a defaultdict(list)

    for page_nb in range(page_start, page_end + 1):
      url = URL.format(page_nb)
      print(f"Scraping {url}...")
      r = requests.get(url)
      if (rsc := r.status_code) != 200:
          print(f"Erreur, code {rsc}")
      else:
          html = r.content
          soup = BeautifulSoup(html, 'html.parser')

          for elem in soup.find_all('h3'):
              try:
                  if elem.a.get('href'):  # this time there is no "/catalogue"
                      titles[page_nb].append(elem.a.get('title','').strip())
              except AttributeError:
                  continue
    pprint(titles)

In [None]:
get_titles_multipage(1, 3)

## Quel est le coût total des 4 premiers livres?

In [30]:
def get_total_price(n_books : int = 4) -> None:
    """Calcule la somme des prix des `n_books` premiers livres"""
    r = requests.get(URL)
    if (rsc := r.status_code) != 200:
        print(f"Erreur, code {rsc}")
    else:
        html = r.content
        soup = BeautifulSoup(html, 'html.parser')
        prices = []
        for elem in soup.find_all('div', class_='product_price')[:n_books]:
            try:
                prices.append(elem.p.text.strip()[1:])  # drop £ sign
            except AttributeError:
                continue
        prices = [float(val) for val in prices]  # or list(map(float, prices))
        print(f"{prices=}, sum : {sum(prices)}")

In [31]:
get_total_price()

prices=[51.77, 53.74, 50.1, 47.82], sum : 203.43


## Stocker en local l'image de couverture de "Sapiens"

In [33]:
def store_bookcover() -> None:
    r = requests.get(URL)
    if (rsc := r.status_code) != 200:
        print(f"Erreur, code {rsc}")
    else:
        html = r.content
        soup = BeautifulSoup(html, 'html.parser')
        for elem in soup.find_all('div', class_="image_container"):
          if "sapiens" in elem.a.get('href', '').lower():
              img_url = "".join(['https://books.toscrape.com/', elem.a.img.get('src', '')])
              print(f"{img_url=}")
              img_req = requests.get(img_url)
              with open('sapiens.jpg', 'wb') as f:
                  f.write(img_req.content)
              break
        else:
          print('No bookcover from Sapiens found')

In [34]:
store_bookcover()

img_url='https://books.toscrape.com/media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg'
