# Carrefour Wine

Always put imports at the top

In [1]:
import time   # always put imports from standard lib at the top

import pandas as pd
import requests

from bs4 import BeautifulSoup

### You can either get Carrefour data from https://www.carrefour.fr/r/boissons/cave-a-vins by hand...
This will give error if you don't save CarrefourSource.html in the same folder, so I have converted next cell to raw

### Or you can do it with requests

In [2]:
# url for first page of wines sold on carrefour.fr
carrefour_url = "https://www.carrefour.fr/r/boissons/cave-a-vins"

resp = requests.get(carrefour_url,
                    # headers to pretend we are using Firefox rather than being a bot
                    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"})
html = resp.text

In [3]:
soup = BeautifulSoup(html)

In [4]:
# By inspecting Carrefour.fr, we have seen that all wines' names were encapsulated in a h2 tag
wine_names = soup.find_all("h2")

In [5]:
# We need only the text
wine_names_text = [wine_name.text.strip() for wine_name in wine_names]

In [6]:
# Let's reduce to 4 wines for now, or Vivino.com will block us for too many requests
wine_names_text = wine_names_text[:4]
print(wine_names_text)

['Vin Rouge Bordeaux Lussac Saint Emilion Merlot - Cabernet Franc  - Cabernet Sauvignon L DE LUSSAC', "Vin Rouge Sud Ouest Bergerac Merlot - Cabernet Sauvignon - Cabernet Franc LA CAVE D'AUGUSTIN FLORENT", "Vin Rouge Vallée du Rhône Côtes du Rhône Grenache - Syrah LA CAVE D'AUGUSTIN FLORENT", 'Vin Rouge Vins étrangers Chili Carmenere CONCHA Y TORO CASILLERO DEL DIABLO CARMENERE']


In [7]:
# def vivino_scrape(wine_name):
def vivino_scrape(wine_name: str):
    """
    Return dictionary about wine_name scraped from vivino.com.
    :return: {"VivinoName": str,
              "VivinoPrice": str,
              "Rating": str,
              "# Reviews": str}
    """
    # We have seen that Vivino replaces spaces in our wine name with "+", so we do the same
    url = "https://www.vivino.com/search/wines?q=" + wine_name.replace(" ", "+")
    
    # User-Agent to pretend be Firefox and not Python => pretend to be a real user rather than a bot
    # Accept-Language to only get English (necessary for .replace("ratings", ""))
    vivino_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0",
                      "Accept-Language": "en-US,en;q=0.5"}
    
    # without headers, Vivino will see that you are a bot and will block you =(
    response = requests.get(url, headers=vivino_headers)
    html = response.text
    
    soup = BeautifulSoup(html)
    
    # using "Inspect element" from the browser, we identified the right tags for each element
    rating = soup.find("div", {"class": "text-inline-block light average__number"}).text.strip()
    name = soup.find("span", {"class": "bold"}).text.strip()
    price = soup.find("span", {"class": "wine-price-value"}).text.strip()
    number_reviews = soup.find("p", {"class": "text-micro"}).text.strip().replace("ratings", "")
    
    return {"VivinoName": name, "VivinoPrice": price, "Rating": rating, "# Reviews": number_reviews}

In [8]:
# text between """""" (so-calles "docstring") is not necessary makes function easier to understand
# docstrings are also returned by the help function
help(vivino_scrape)

Help on function vivino_scrape in module __main__:

vivino_scrape(wine_name: str)
    Return dictionary about wine_name scraped from vivino.com.
    :return: {"VivinoName": str,
              "VivinoPrice": str,
              "Rating": str,
              "# Reviews": str}



In [9]:
data = dict()

for w in wine_names_text:
    print("Scraping data from Vivino:", w)
    data[w] = vivino_scrape(w)
    
    # stop everything for 1.5 seconds
    # prevent overloading website and to be blocked
    time.sleep(1.5)

Scraping data from Vivino: Vin Rouge Bordeaux Lussac Saint Emilion Merlot - Cabernet Franc  - Cabernet Sauvignon L DE LUSSAC
Scraping data from Vivino: Vin Rouge Sud Ouest Bergerac Merlot - Cabernet Sauvignon - Cabernet Franc LA CAVE D'AUGUSTIN FLORENT
Scraping data from Vivino: Vin Rouge Vallée du Rhône Côtes du Rhône Grenache - Syrah LA CAVE D'AUGUSTIN FLORENT
Scraping data from Vivino: Vin Rouge Vins étrangers Chili Carmenere CONCHA Y TORO CASILLERO DEL DIABLO CARMENERE


In [10]:
print(data)

{'Vin Rouge Bordeaux Lussac Saint Emilion Merlot - Cabernet Franc  - Cabernet Sauvignon L DE LUSSAC': {'VivinoName': 'Passeport Saint-Emilion Merlot - Cabernet Franc', 'VivinoPrice': '—', 'Rating': '3,5', '# Reviews': '3419 '}, "Vin Rouge Sud Ouest Bergerac Merlot - Cabernet Sauvignon - Cabernet Franc LA CAVE D'AUGUSTIN FLORENT": {'VivinoName': "La Cave d'Augustin Florent Merlot - Cabernet Sauvignon", 'VivinoPrice': '—', 'Rating': '3,3', '# Reviews': '84 '}, "Vin Rouge Vallée du Rhône Côtes du Rhône Grenache - Syrah LA CAVE D'AUGUSTIN FLORENT": {'VivinoName': "La Cave d'Augustin Florent Les Pierrasques Côtes du Rhône", 'VivinoPrice': '—', 'Rating': '3,2', '# Reviews': '2946 '}, 'Vin Rouge Vins étrangers Chili Carmenere CONCHA Y TORO CASILLERO DEL DIABLO CARMENERE': {'VivinoName': 'Casillero del Diablo Carmenere (Reserva)', 'VivinoPrice': '—', 'Rating': '3,6', '# Reviews': '66273 '}}


## Convert data to Excel file
We are going to use pandas

In [11]:
# create a df from a dictionary
df = pd.DataFrame.from_dict(data, 
                            orient='index')

### Dataframes look nicer than dictionaries

In [12]:
df

Unnamed: 0,VivinoName,VivinoPrice,Rating,# Reviews
Vin Rouge Bordeaux Lussac Saint Emilion Merlot - Cabernet Franc - Cabernet Sauvignon L DE LUSSAC,Passeport Saint-Emilion Merlot - Cabernet Franc,—,35,3419
Vin Rouge Sud Ouest Bergerac Merlot - Cabernet Sauvignon - Cabernet Franc LA CAVE D'AUGUSTIN FLORENT,La Cave d'Augustin Florent Merlot - Cabernet S...,—,33,84
Vin Rouge Vallée du Rhône Côtes du Rhône Grenache - Syrah LA CAVE D'AUGUSTIN FLORENT,La Cave d'Augustin Florent Les Pierrasques Côt...,—,32,2946
Vin Rouge Vins étrangers Chili Carmenere CONCHA Y TORO CASILLERO DEL DIABLO CARMENERE,Casillero del Diablo Carmenere (Reserva),—,36,66273


### If you don't like that CarrefourName is in the index, we can move it to a column...

In [13]:
df = df.reset_index()
df

Unnamed: 0,index,VivinoName,VivinoPrice,Rating,# Reviews
0,Vin Rouge Bordeaux Lussac Saint Emilion Merlot...,Passeport Saint-Emilion Merlot - Cabernet Franc,—,35,3419
1,Vin Rouge Sud Ouest Bergerac Merlot - Cabernet...,La Cave d'Augustin Florent Merlot - Cabernet S...,—,33,84
2,Vin Rouge Vallée du Rhône Côtes du Rhône Grena...,La Cave d'Augustin Florent Les Pierrasques Côt...,—,32,2946
3,Vin Rouge Vins étrangers Chili Carmenere CONCH...,Casillero del Diablo Carmenere (Reserva),—,36,66273


### and give it the proper name

In [14]:
df = df.rename(columns={"index": "CarrefourPrice"})
df

Unnamed: 0,CarrefourPrice,VivinoName,VivinoPrice,Rating,# Reviews
0,Vin Rouge Bordeaux Lussac Saint Emilion Merlot...,Passeport Saint-Emilion Merlot - Cabernet Franc,—,35,3419
1,Vin Rouge Sud Ouest Bergerac Merlot - Cabernet...,La Cave d'Augustin Florent Merlot - Cabernet S...,—,33,84
2,Vin Rouge Vallée du Rhône Côtes du Rhône Grena...,La Cave d'Augustin Florent Les Pierrasques Côt...,—,32,2946
3,Vin Rouge Vins étrangers Chili Carmenere CONCH...,Casillero del Diablo Carmenere (Reserva),—,36,66273


### Save to Excel without the index (0, 1, 2, 3...)
You might need to install openpyxl if you get an error
If the file already exists, you must close it before running the next cell

In [15]:
df.to_excel("CarrefourVivino.xlsx", index=False)