# `Wine Scraping` 🍷
***

![Image](https://img.passeportsante.net/1200x675/2021-05-03/i102200-vin.webp)

In [288]:
from bs4 import BeautifulSoup as BS
from pathlib import Path
import polars as pl
import re
from dataclasses import dataclass
from serde import serialize
from serde.json import to_json
from rich import print

In [2]:
# Absolute path of the current working directory; the pages folder below is built from it.
racine = Path(".").resolve()
print(racine)

D:\Cours Mecen (M2)\Machine Learning\Wine Scraping


In [3]:
# Folder from which the saved *.html pages are read.
test_pages = racine / "test_pages"
print(test_pages)

D:\Cours Mecen (M2)\Machine Learning\Wine Scraping\test_pages


In [5]:
# Read every saved page as UTF-8 text. The paths are sorted because glob()
# yields entries in arbitrary order and later cells index into this list
# (htmls[0]), which would otherwise not be reproducible across machines.
htmls = [
    chemin.read_text(encoding="utf8")
    for chemin in sorted(test_pages.glob("*.html"))
]

In [490]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree can differ between machines.
soupe = BS(htmls[0], "html.parser")

In [468]:
# All <title> tags of the page, as raw tag objects.
type_test = soupe.find_all(name="title")

In [470]:
# Built straight from the soup (not from the previous value of `type_test`)
# so that re-running this cell gives the same result instead of failing on strings.
type_test = [x.text for x in soupe.find_all(name="title")]

In [471]:
type_test

['🍇 Achat VIN ROUGE : découvrez notre sélection des meilleurs vins rouges']

In [472]:
# Part of the title before the first ":"; the slice starting at "VIN" is applied in extract_type.
type_test[0].split(":")[0].strip()

'🍇 Achat VIN ROUGE'

In [514]:
def extract_type(text: str) -> str:
    """
    Extract the wine `type` (e.g. "VIN ROUGE") from a page title.

    The title is cut at the first ":"; the part before it is stripped and
    everything from the first occurrence of "VIN" onwards is returned.
    When "VIN" does not occur, the whole stripped prefix is returned.
    """
    prefix = text.split(":")[0].strip()
    # The index is looked up on the stripped prefix so that it refers to the
    # very string being sliced; -1 ("not found") must not be used as a slice start.
    start = prefix.find("VIN")
    return prefix[start:] if start != -1 else prefix

# NOM / CAPACITÉ / ANNÉE

In [233]:
# One <div class="vue-product-name"> per match; its text is what extract_name / extract_capacity parse.
result_items = soupe.find_all(name="div", attrs={"class": "vue-product-name"})

In [228]:
def extract_name(text: str) -> str:
    """
    Extract the `name` of the wine from unstructured text.

    The name is everything that precedes the first non-breaking space (U+00A0).
    When the text holds no such separator it is returned unchanged.
    """
    # Split on the separator itself rather than on a "_" placeholder, so that
    # names containing a real underscore are not truncated.
    return text.split("\xa0", 1)[0]

In [230]:
# Despite its name, this list holds the extracted wine names (one per product-name div).
result_title = [extract_name(x.text) for x in result_items]

In [241]:
def extract_capacity(text: str) -> float:
    """
    Extract the `capacity` (in litres) of the wine from unstructured text.

    The capacity is read from what follows the first non-breaking space
    (U+00A0); when there is no such separator the whole text is used.
    The litre marker "L" is removed and a decimal comma is accepted.

    Raises:
        ValueError: if what remains cannot be converted to a float.
    """
    # Partition on the separator itself rather than on a "_" placeholder, so
    # that an underscore inside the wine name is not mistaken for the separator.
    _, separator, tail = text.partition("\xa0")
    raw = tail if separator else text
    cleaned = (
        raw.replace("L", "")   # drop the litre marker
        .replace(",", ".")     # accept "0,75" as well as "0.75"
        .strip()               # remove surrounding whitespace
    )
    return float(cleaned)

In [242]:
# One capacity per product-name div, parsed from the same text as the name.
capacity = [extract_capacity(x.text) for x in result_items]

In [243]:
def extract_year(text:str) -> int | None:
    """
    Extract the `year` of the wine from unstructured text using a regular expression pattern.
    """
    match = re.search(r'\b\d{4}\b', text)
    if match:
        return int(match.group())
    else:
        return None

In [244]:
# The year is searched in the extracted names, i.e. the text before the non-breaking space.
years = [extract_year(item) for item in result_title]

# PRIX

In [198]:
# Every <div class="vue-product-prices"> of the page, as raw tag objects.
result_find_price = soupe.find_all(name="div", attrs={"class": "vue-product-prices"})

In [199]:
# Text content of each price block; this is the input of the price/promo extractors.
result_price = [x.text for x in result_find_price]

In [None]:
result_price[0]

In [140]:
def extract_promo(text):
    """
    Return the first discount label found in `text`, e.g. "-20%".

    The label is a minus sign, one or more digits and a percent sign.
    Returns None when the text holds no such label.
    """
    found = re.search(r'-\d+%', text)
    if found is None:
        return None
    return found.group()

In [290]:
def extract_prix_vente(text):
    """
    Return the first comma-decimal amount found in `text` as a float.

    Only amounts written as digits, a comma, digits (e.g. "29,90") are
    recognised; an amount without a decimal part is not matched.
    Returns None when nothing matches.
    """
    found = re.search(r'\d+,\d+', text)
    if not found:
        return None
    montant = found.group().replace(",", ".")
    return float(montant)

In [291]:
def extract_prix_apres_promo(text):
    """
    Return the comma-decimal amount that immediately follows a "€" sign.

    The amount is returned as a string with the comma replaced by a dot
    (e.g. "23.92"), not as a float. Returns None when no amount directly
    follows a "€".
    """
    found = re.search(r'€(\d+,\d+)', text)
    if not found:
        return None
    return found.group(1).replace(",", ".")

In [None]:
result_price

In [None]:
[extract_prix_vente(item) for item in result_price]

# Attention aux prix à l'unité et aux promos groupées.

# c bon ou pas mon reuf ? (autrement appelé **AVIS**)
Quand est-ce qu'on se descend une teille ?

In [474]:
# Every <div class="vue-avis-block"> of the page, as raw tag objects.
result_find_avis = soupe.find_all(name="div", attrs={"class": "vue-avis-block"})

In [475]:
# Text content of each review block; this is the input of extract_note / extract_nb_avis.
result_avis = [x.text for x in result_find_avis]

In [None]:
result_avis

In [526]:
def extract_note(text: str) -> float | None:
    match = re.search(r'([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*))(?:[Ee]([+-]?\d+))?/20', text[:10])
    if match:
        return float(match.group(1))
    else:
        return None

In [527]:
def extract_nb_avis(text: str) -> int:
    """
    Return the number of ratings announced in `text` as "<digits> notes".

    Only the plural wording "notes" is matched. Returns 0 when the pattern
    is absent.
    """
    found = re.search(r'(\d+) notes', text)
    return int(found.group(1)) if found else 0

In [None]:
[extract_note(item) for item in result_avis]

# La soupe ça fait grandir ?

In [520]:
def extract_result(soupe: BS) -> dict:
    """
    Collect the raw text fragments of one parsed results page.

    Returns a dict of four lists of strings:
      - "result_title": text of every <div class="vue-product-name">
      - "result_price": text of every <div class="vue-product-prices">
      - "result_avis":  text of every <div class="vue-avis-block">
      - "result_type":  text of the first <title> tag, repeated once per
                        product name ("" when the page has no <title>)

    NOTE(review): the lists are meant to be matched by position; if a product
    lacks a price or review block the lists have different lengths and the
    pairing shifts. Confirm against the real pages.
    """
    def _textes(classe: str) -> list[str]:
        # Text of every <div> carrying the given class, in document order.
        return [x.text for x in soupe.find_all(name="div", attrs={"class": classe})]

    result_title = _textes("vue-product-name")
    result_price = _textes("vue-product-prices")
    result_avis = _textes("vue-avis-block")

    # Use the first <title> only: find_all may return none or several tags,
    # and repeating the whole list would leave result_type with a length
    # different from result_title.
    titres_page = soupe.find_all(name="title")
    type_page = titres_page[0].text if titres_page else ""
    result_type = [type_page] * len(result_title)

    return {
        "result_title": result_title,
        "result_price": result_price,
        "result_avis": result_avis,
        "result_type": result_type
    }

# 💥 FUSIONNNNNNNNNNNNNNNN 💥

In [516]:
@serialize
@dataclass
class Vin:
    """
    This dataclass represents all characteristics associated with a Wine.

    The annotations mirror what the extract_* helpers return, including the
    fields that are None when the corresponding pattern is absent from the page.
    """
    # Wine name (text before the non-breaking space of the product name).
    name: str
    # Bottle capacity parsed from the product name.
    capacity: float
    # 4-digit year found in the product name, None when there is none.
    year: int | None
    # First comma-decimal amount of the price block as a float, None when absent.
    price: float | None
    # Discount label such as "-20%", None when absent.
    promo: str | None
    # Amount following the "€" sign, kept as a dot-decimal string, None when absent.
    prix_promo: str | None
    # Rating out of 20, None when absent.
    note: float | None
    # Number of ratings, 0 when not stated.
    nb_avis: int
    # Wine type taken from the page title.
    type: str

In [523]:
def extract_vin(resultset_1, resultset_2, resultset_3 ,resultset_4) -> Vin:
    """
    Build a `Vin` from the four raw text fragments describing one product.

    Args:
        resultset_1: product-name text, parsed for name, capacity and year.
        resultset_2: price-block text, parsed for price, promo and promo price.
        resultset_3: review-block text, parsed for rating and number of ratings.
        resultset_4: page title text, parsed for the wine type.
    """
    return Vin(
        name=extract_name(resultset_1),
        capacity=extract_capacity(resultset_1),
        year=extract_year(resultset_1),
        price=extract_prix_vente(resultset_2),
        promo=extract_promo(resultset_2),
        prix_promo=extract_prix_apres_promo(resultset_2),
        note=extract_note(resultset_3),
        nb_avis=extract_nb_avis(resultset_3),
        type=extract_type(resultset_4),
    )

In [518]:
# extract_result() walks the whole page, so it is called once and its four
# lists are then matched by position to build one Vin per product.
resultats = extract_result(soupe)
vins = [
    extract_vin(titre, prix, avis, type_vin)
    for titre, prix, avis, type_vin in zip(
        resultats["result_title"],
        resultats["result_price"],
        resultats["result_avis"],
        resultats["result_type"],
    )
]
