In [108]:
import time

import pandas as pd
import requests

from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# BeautifulSoup Tutorial

In [17]:
html = """<h1>Hi, this is a title</h1>
<h2>This is a subtitle</h2>

<p>This is a paragraph</p>"""

In [18]:
example_soup = BeautifulSoup(html)

# bs4.BeautifulSoup.find() allows us to retrieve the text
example_soup.find("h1").text

'Hi, this is a title'

# tqdm tutorial

In [100]:
for el in tqdm(range(10)):
  pass

  0%|          | 0/10 [00:00<?, ?it/s]

# Scrape Wines

## Carrefour

In [5]:
my_headers = {
    # stating that I prefer text and images
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    # language I prefer
    "Accept-Language": 	"en-GB,en;q=0.5",
    # from which site we are coming from
    "Referer": "https://www.google.com/",
    # User-Agent specifies the browser making the http request
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0"
    }

In [91]:
def scrape_carrefour_wines():
  """
  Scrape carrefour.fr and return a dict of wine names and their prices.
  """
  carrefour_url = "https://www.carrefour.fr/s?q=vin"
  response = requests.get(carrefour_url, headers=my_headers)
  soup = BeautifulSoup(response.text, "lxml")
  wines_info = dict()

  wines = soup.find_all("div", {"class": "ds-product-card--vertical-infos"})
  for wine in wines:
    wine_name = wine.find("h2", {"class": "ds-title"}).text.strip()
    price_per_liter = wine.find_all("div", {"class": "ds-body-text"})[1].text.strip()
    price_per_liter = float(price_per_liter.split(" € / Litre")[0])
    wines_info[wine_name] = price_per_liter

  return wines_info

In [92]:
carrefour_wines = scrape_carrefour_wines()

In [93]:
carrefour_wines

{"Vin AOP rouge  de bordeaux LA CAVE D'AUGUSTIN FLORENT": 8.13,
 'Vin Rouge  Beaujolais AOP Juliénas Domaine de la Combe Darroux "Reflets de France"': 9.13,
 'Vin Rouge  Bordeaux AOP Blaye Côtes de Bordeaux Château Labrousse "Reflets de France"': 8.6,
 'Vin Rouge  Bordeaux AOP Blaye Côtes de Bordeaux Château Le Grand Moulin': 7.73,
 'Vin Rouge  Bordeaux AOP Bordeaux Carrefour Sélection "Cabernet Franc et Malbec"': 4.73,
 'Vin Rouge  Bordeaux AOP Bordeaux Château Hautes Terres "Reflets De France"': 5.73,
 "Vin Rouge  Bordeaux AOP Bordeaux La Cave d'Augustin Florent": 4.4,
 'Vin Rouge  Bordeaux AOP Bordeaux La Petite Chapelle De Bordeaux': 3.69,
 'Vin Rouge  Bordeaux AOP Bordeaux Supérieur Château Picon "Reflets de France"': 6.93,
 "Vin Rouge  Bordeaux AOP Bordeaux Supérieur La Cave d'Augustin Florent": 5.32,
 "Vin Rouge  Bordeaux AOP Haut Médoc La Cave d'Augustin Florent": 7.53,
 'Vin Rouge  Bordeaux AOP Lussac Saint-Emilion L de Lussac': 8.53,
 'Vin Rouge  Bordeaux AOP Médoc Château Bo

## Vivino

In [95]:
def scrape_vivino(wine_name):
  """
  Scrape vivino.com for info about wine and return a dict with keys:
    Vivino Name
    Rating
    # Reviews
  """
  search_url = "https://www.vivino.com/search/wines?q="
  url = search_url + wine_name.lower().replace("  ", "+").replace(" ", "+")
  response = requests.get(url, headers=my_headers)
  soup = BeautifulSoup(response.text)
  wine_name = soup.find("span", {"class": "header-smaller"}).text.strip()
  rating = soup.find("div", {"class": "text-inline-block"}).text.strip()
  n_reviews = int(soup.find("p", {"class": "text-micro"}).text.strip().split(" ratings")[0])

  return {
      "Vivino Name": wine_name,
      "Rating": rating,
      "# Reviews": n_reviews
  }

In [96]:
scrape_vivino(wine_name)

{'# Reviews': 5809,
 'Rating': '3.5',
 'Vivino Name': 'Spier Vintage Selection Cabernet Sauvignon - Merlot - Cabernet Franc - Petit Verdot - Malbec'}

# Copy only the first 5 wines

In [104]:
first_5 = dict()

count = 0
for key in carrefour_wines:
  first_5[key] = carrefour_wines[key]
  count += 1
  if count == 5:
    break

In [105]:
first_5

{'Vin Rouge  Bordeaux AOP Lussac Saint-Emilion L de Lussac': 8.53,
 'Vin Rouge  Vallée Du Rhône AOP Crozes Hermitage Les 3 Lys': 11.0,
 'Vin rouge AOP Côtes du Rhône BIO MONTALCOUR': 5.19,
 'Vin rouge Bordeaux LE GRAND ECUYER': 5.53,
 'Vin rouge Brouilly  CAVE AUGUSTIN FLORENT': 7.53}

In [106]:
final = dict()
for wine in tqdm(first_5):
  wine_info = scrape_vivino(wine)
  wine_info["Price per liter"] = first_5[wine]
  final[wine] = wine_info

  0%|          | 0/5 [00:00<?, ?it/s]

In [107]:
final

{'Vin Rouge  Bordeaux AOP Lussac Saint-Emilion L de Lussac': {'# Reviews': 124,
  'Price per liter': 8.53,
  'Rating': '—',
  'Vivino Name': 'Chateau Langlaise Grand Vin De Bordeaux Lussac Saint Emilion'},
 'Vin Rouge  Vallée Du Rhône AOP Crozes Hermitage Les 3 Lys': {'# Reviews': 30,
  'Price per liter': 11.0,
  'Rating': '3.5',
  'Vivino Name': 'Les Monts Taris Crozes Hermitage'},
 'Vin rouge AOP Côtes du Rhône BIO MONTALCOUR': {'# Reviews': 899,
  'Price per liter': 5.19,
  'Rating': '3.3',
  'Vivino Name': 'Cellier des Dauphins Vin Bio Côtes-du-Rhône'},
 'Vin rouge Bordeaux LE GRAND ECUYER': {'# Reviews': 309,
  'Price per liter': 5.53,
  'Rating': '3.3',
  'Vivino Name': 'Bertrand Ravache Le Grand Ecuyer Bordeaux Supérieur'},
 'Vin rouge Brouilly  CAVE AUGUSTIN FLORENT': {'# Reviews': 1689,
  'Price per liter': 7.53,
  'Rating': '3.3',
  'Vivino Name': "La Cave d'Augustin Florent Vin d'Alsace Pinot Noir"}}

In [109]:
wine_table = pd.DataFrame.from_dict(final, orient='index')

In [110]:
wine_table

Unnamed: 0,Vivino Name,Rating,# Reviews,Price per liter
Vin Rouge Bordeaux AOP Lussac Saint-Emilion L de Lussac,Chateau Langlaise Grand Vin De Bordeaux Lussac...,—,124,8.53
Vin Rouge Vallée Du Rhône AOP Crozes Hermitage Les 3 Lys,Les Monts Taris Crozes Hermitage,3.5,30,11.0
Vin rouge Brouilly CAVE AUGUSTIN FLORENT,La Cave d'Augustin Florent Vin d'Alsace Pinot ...,3.3,1689,7.53
Vin rouge Bordeaux LE GRAND ECUYER,Bertrand Ravache Le Grand Ecuyer Bordeaux Supé...,3.3,309,5.53
Vin rouge AOP Côtes du Rhône BIO MONTALCOUR,Cellier des Dauphins Vin Bio Côtes-du-Rhône,3.3,899,5.19


In [111]:
file_name = "carrefour_wines.xlsx"
wine_table.to_excel(file_name)

try:
  from google.colab import files
  files.download(file_name)
except ModuleNotFoundError:
  print("You are not on Colab, so no need to download")
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>