# Scrape

### 1. Setup

The setup serves for:
- Importing libraries.
- Speficying utility functions.
- Setting up the chrome webdriver.
- Defining scraping constants.

In [199]:
# imports

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
import ast
import numpy as np
import re

In [153]:
# utils

def sleep():
    time.sleep(1 + random.random())

In [274]:
# set up ChromeDriver

options = Options()
# options.add_argument('--headless')  # save GUI rendering time
# options.add_argument('--disable-gpu')  # prevents errors in headless mode

service = Service('/usr/local/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

# open base page
BASE_URL = 'https://www.migros.ch/en'
driver.get(BASE_URL)
time.sleep(5)

In [155]:
# constants

# go to subcategory
SUBCAT_XPATH = '/html/body/app-root/div[1]/lsp-shop/div/div/div/div/ng-component/main/div/ng-component/mo-category-level-1-children-categories/div/ul/li[{}]/div/div/a'

# go to page
TEMPLATE_URL = '?page={}'

# hand picked from base url
CATEGORIES = [
    'fruits-vegetables',
    'bread-pastries-breakfast',
    'pasta-condiments-canned-food',
    'snacks-sweets',
    'frozen-food',
    'drinks-coffee-tea',
    'wine-beer-spirits'
]

### 2. Scrape links

Considerations and steps:
- Since all products have pages have the same template,
it is convenient separating the link retrieval and the product specifics scraping.
- In this first step we navigate to category, sub-category, and scrape all products links from the subcategories subpages.
- All links are saved a dataframe, and then permanently saved to csv. 
- The same dataframe will be used to store the product specifics in the next step.

In [None]:
# list of dictionaries 
data = []

# big giant massive loop
for cat in CATEGORIES:
    url = BASE_URL + '/category/' + cat
    driver.get(url) 

    sleep()

    # dynamically find subcategories (li[n] elements)
    subcategory_urls = []
    li_index = 1

    while True:
        sleep()
        subcat_xpath = SUBCAT_XPATH.format(li_index)
        try:
            subcategory = driver.find_element(By.XPATH, SUBCAT_XPATH)
            subcategory_url = subcategory.get_attribute('href')
            subcategory_urls.append(subcategory_url)

            print(f"Found subcategory link: {subcategory_url}")
            li_index += 1

        except NoSuchElementException:
            print("No more subcategory elements found.")
            break

    print(f"Subcategories found: {subcategory_urls}")

    for subcat_url in subcategory_urls:
        driver.get(subcat_url) 
        sleep()

        # extract products and go to the next page
        scrape_url = subcat_url + TEMPLATE_URL
        print(f"Scraping url: {scrape_url}")
        product_urls = []
        page = 1
        while True:
            sleep()
            url = scrape_url.format(page)
            driver.get(url)
            print(f"Scraping url: {url}")
            
            try:
                product_list = driver.find_elements(By.XPATH, '/html/body/app-root/div[1]/lsp-shop/div/div/div/div/ng-component/main/div/ng-component/div/mo-items-display/div/ul/li/article/div/div[1]/a[1]')
                if not product_list:
                    print(f"No more products found on page {page}. Exiting.")
                    break
                
                for product in product_list:
                    try:
                        product_url = product.get_attribute('href')
                        product_urls.append(product_url)
                    except Exception as e:
                        print(f"Error processing product: {e}")
                        continue
                
                print(f"Finished scraping page {page}.")
                page += 1
            
            except NoSuchElementException:
                print(f"Error on page {page}, stopping.")
                break

        # store
        current_data = {
            'category': cat,
            'sub_category': subcat_url.split('/')[-1],
            'url': product_urls
        }
        data.append(current_data)

        # report 
        print(f"Finished scraping subcategory {subcat_url}")
        print(f"Data: {current_data}")

In [None]:
# convert to dataframe

df = pd.DataFrame(data)
df = df.explode('url')
df.to_csv('../data/product_categorization_and_urls.csv', index=False)

### 3. Scrape product information

In [156]:
# adapt df 
df = pd.read_csv('../data/product_categorization_and_urls.csv')

# start from where we left off
start_index = 0  

In [157]:
# setup loop 

product_data = []
counter = 0
total = len(df)

In [None]:
# big giant massive loop
for index, row in df.iloc[start_index:].iterrows():
    product_url = row['url']
    driver.get(product_url)
    sleep()  # Wait for the page to load

    try:
        # Extract the product name
        product_name = driver.find_element(By.CSS_SELECTOR, 'div.core-product-title h1').text

        # Extract the product price
        try:
            product_price = driver.find_element(By.CSS_SELECTOR, 'span.actual').text
        except Exception:
            product_price = "N/A"
            print(f"Product price not found for {product_name}")

        # Extract the product weight
        try:
            product_weight = driver.find_element(By.CSS_SELECTOR, 'span.weight-priceUnit').text
        except Exception:
            product_weight = "N/A"
            print(f"Product weight not found for {product_name}")

        # Extract the price per unit (e.g., per 100g)
        try:
            price_per_unit = driver.find_element(By.CSS_SELECTOR, 'span.listMode-priceUnit').text
        except Exception:
            price_per_unit = "N/A"
            print(f"Price per unit not found for {product_name}")

    
        ########
        # Initialize dictionary to store nutritional information
        nutrition_info = {}
        try:
            # Extract the full HTML of the table containing nutritional information
            table_html = driver.find_element(By.XPATH, '/html/body/app-root/div[1]/lsp-shop/div/div/div/div/mo-product-detail-container/main/div/mo-product-detail-page/div[1]/div[3]/mo-product-detail-information/mo-tab-layout/div/div/div[2]/mo-product-detail-nutrients-information/table').get_attribute('outerHTML')

            # Use BeautifulSoup to parse the HTML
            soup = BeautifulSoup(table_html, 'html.parser')

            # Find all rows in the table
            rows = soup.find('tbody').find_all('tr')

            # Loop through the rows and extract data
            for row in rows:
                try:
                    # Extract the nutrient name from the first column (td[1])
                    nutrient_name = row.find_all('td')[0].get_text(strip=True)

                    # Extract the nutrient value from the second column (td[2])
                    nutrient_value = row.find_all('td')[1].get_text(strip=True)

                    # Store the nutrient name and value in the dictionary
                    nutrition_info[nutrient_name] = nutrient_value

                except Exception as e:
                    print(f"Error processing row: {e}")
                    continue

        except Exception as e:
            print(f"Error extracting table.")
        ########

        # Store the scraped data in the list
        current_data = {
            'url': product_url,
            'name': product_name,
            'weight': product_weight,
            'price': product_price,
            'price_per_unit': price_per_unit,
            'nutritional_values': nutrition_info
        }
        product_data.append(current_data)

        counter += 1

        print(f"{counter}/{total}: {current_data}")

    except Exception as e:
        print(f"Error processing product at {product_url}: {str(e)}")
        continue

In [None]:
df = pd.DataFrame(product_data)
df.to_csv('../data/product_specifics.csv', index=False)

df.head()

In [7]:
# close the driver
driver.quit()

### 4. Check datasets and merge

In [342]:
df1 = pd.read_csv('../data/product_categorization_and_urls.csv')
df2 = pd.read_csv('../data/product_specifics.csv')

# something went wrong with three links
print(df1.shape)
print(df2.shape)

(9602, 3)
(9602, 6)


In [343]:
# check scraping worked for all links

missing_links = set(df1['url']) - set(df2['url'])
print(missing_links)

set()


In [344]:
# scrape the missing links

missing_links_df = df1[df1['url'].isin(missing_links)]
missing_links_df

Unnamed: 0,category,sub_category,url


In [345]:
# check for duplicates
print(df1['url'].duplicated().sum())
print(df2['url'].duplicated().sum())

# some urls are both in fruit and vegetables for example

57
57


In [346]:
# remove duplicates from df2, since the information is the same
df2 = df2.drop_duplicates(subset=['url'])

In [347]:
# take care of nutritional_values column
df2['nutritional_values'][0]

"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g', 'of which saturates': '0 g', 'Carbohydrate': '6 g', 'of which sugars': '6 g', 'Fibre': '5 g', 'Protein': '0.6 g', 'Salt': '< 0.01 g'}"

In [348]:
# convert the strings to dictionaries
df2['nutritional_values'] = df2['nutritional_values'].apply(ast.literal_eval)

# separate columns
nutrition_df = pd.json_normalize(df2['nutritional_values'])

# merge new columns into df
df2 = df2.join(nutrition_df)

df2.head()

Unnamed: 0,url,name,weight,price,price_per_unit,nutritional_values,Energy,Fat,of which saturates,Carbohydrate,...,Selen,Caffeine,Copper,Silicon,Bicarbonate (or hydrogen carbonate),nitrate,sulfate,Chloride,Sulphate 4+,Minerals
0,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,250g,7.9,3.16/100g,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',...",174 kJ (42 kcal),0.6 g,0 g,6 g,...,,,,,,,,,,
1,https://www.migros.ch/en/product/264601302100,Dates,200g,5.95,2.98/100g,{},,,,,...,,,,,,,,,,
2,https://www.migros.ch/en/product/264500313200,Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,2 g,1.8 g,...,,,,,,,,,,
3,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,1 kg,2.95,0.30/100g,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g...",403 kJ (95 kcal),< 0.5 g,0.1 g,21 g,...,,,,,,,,,,
4,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,2 g,1.8 g,...,,,,,,,,,,


In [349]:
# replace {} with NaN in nutritional_values
df2['nutritional_values'] = df2['nutritional_values'].apply(lambda x: np.nan if x == {} else x)
df2.head()

Unnamed: 0,url,name,weight,price,price_per_unit,nutritional_values,Energy,Fat,of which saturates,Carbohydrate,...,Selen,Caffeine,Copper,Silicon,Bicarbonate (or hydrogen carbonate),nitrate,sulfate,Chloride,Sulphate 4+,Minerals
0,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,250g,7.9,3.16/100g,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',...",174 kJ (42 kcal),0.6 g,0 g,6 g,...,,,,,,,,,,
1,https://www.migros.ch/en/product/264601302100,Dates,200g,5.95,2.98/100g,,,,,,...,,,,,,,,,,
2,https://www.migros.ch/en/product/264500313200,Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,2 g,1.8 g,...,,,,,,,,,,
3,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,1 kg,2.95,0.30/100g,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g...",403 kJ (95 kcal),< 0.5 g,0.1 g,21 g,...,,,,,,,,,,
4,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,2 g,1.8 g,...,,,,,,,,,,


In [350]:
# sanity check
print(len(df2.columns))
print(len(df2.columns.unique()))
print(len(df2))

70
70
9545


In [351]:
# merge the two dataframes
df = pd.merge(df1, df2, on='url', how='inner')
len(df)

9602

In [352]:
# drop duplicates
df_no_duplicates = df.drop_duplicates(subset='url')
print("Duplicates: ", len(df) - len(df_no_duplicates))

Duplicates:  57


In [353]:
# duplicates come from the fact that some products are in multiple categories
# keep them 

In [354]:
df['weight'].unique()

array(['250g', '200g', nan, '1 kg', '500g', '3 Stück', '1kg', '1 Stück',
       '750g', '300g', '50g', '3kg', '100g', '2kg', '240g', '400g',
       '100 g', '1 Kilo', '1.5kg', '700g', '2000g', '1500g', '1 Stk.',
       '850g', '600g', '2.5kg', '350g', '150g', '440g', '330g', '160g',
       '1000g', '40g', '2 Stück', '475g', '75g', '800g', '125g', '480g',
       '5kg', '500ml', '210g', '130g', '1000ml', '2 x 500ml', '60ml',
       '120g', '80g', '270ml', '180g', '230g', '1l', '220g', '320g',
       '140g', '2 x 130g', '260g', '375g', '30g', '10g', '35g', '20g',
       '25g', '380g', '460g', '370g', '540g', '4dl', '470g', '410g',
       '2.5dl', '3200g', '3100g', '360g', '650g', '45g', '105g', '135g',
       '270g', '60g', '90g', '110g', '70g', '85g', '65g', '93g', '114g',
       '215g', '68g', '2 x 146g', '95g', '3 x 80g', '82g', '2x100g',
       '46g', '285g', '138g', '58g', '53g', '115g', '280g', '170g', '71g',
       '2 x 100g', '4 x 80g', '420g', '450g', '55g', '275g', '83g',
      

In [355]:
"""
Cases to cover:
'1300 Tabl.'
'nan'
'1 piece'
"Crème d'Or C"
'330ML'
'6x50cl'
'1dl'
'3 Stk.'
'3 Stück'
'3 x 90g'
'2x80g'
'6 x 12.5g'
'18 Balls'
'170G'
'1 Kilo'
"""
# Function to parse the weight column
def parse_weight(weight):
    if pd.isna(weight):
        return pd.Series([np.nan, np.nan, np.nan], index=['quantity', 'weight_unit', 'weight_per_unit'])
    
    weight = weight.strip().lower()
    
    # Match patterns like '<number> x <number><unit>'
    match = re.match(r'^(\d+)\s*x\s*(\d+(\.\d+)?)([a-z]+)$', weight)
    if match:
        quantity = int(match.group(1))
        weight_per_unit = float(match.group(2))
        weight_unit = match.group(4)
        if weight_unit in ['kg', 'kilo']:
            weight_unit = 'g'
            weight_per_unit *= 1000
        elif weight_unit == 'l':
            weight_unit = 'ml'
            weight_per_unit *= 1000
        elif weight_unit == 'dl':
            weight_unit = 'ml'
            weight_per_unit *= 100
        elif weight_unit == 'cl':
            weight_unit = 'ml'
            weight_per_unit *= 10
        return pd.Series([quantity, weight_unit, weight_per_unit], index=['quantity', 'weight_unit', 'weight_per_unit'])
    
    # Match patterns like '<number> pieces/stk/balls'
    match = re.match(r'^(\d+)\s*(stück|stk|pieces|balls)$', weight)
    if match:
        quantity = int(match.group(1))
        unit = match.group(2)
        if unit in ['stück', 'stk', 'pieces']:
            weight_unit = 'pieces'
        elif unit == 'balls':
            weight_unit = 'balls'
        return pd.Series([quantity, weight_unit, np.nan], index=['quantity', 'weight_unit', 'weight_per_unit'])
    
    # Match patterns like '<number> <unit>' (e.g., '1 kg', '330ml')
    match = re.match(r'^(\d+(\.\d+)?)\s*([a-z]+)$', weight)
    if match:
        quantity = 1
        value = float(match.group(1))
        unit = match.group(3)
        if unit in ['kg', 'kilo']:
            weight_unit = 'g'
            weight_per_unit = value * 1000
        elif unit == 'g':
            weight_unit = 'g'
            weight_per_unit = value
        elif unit in ['ml', 'l']:
            weight_unit = 'ml'
            weight_per_unit = value * 1000 if unit == 'l' else value
        elif unit == 'cl':
            weight_unit = 'ml'
            weight_per_unit = value * 10
        elif unit == 'dl':
            weight_unit = 'ml'
            weight_per_unit = value * 100
        elif unit == 'piece':
            weight_unit = 'piece'
            weight_per_unit = np.nan
        elif unit == 'tabl':
            weight_unit = 'tabl'
            weight_per_unit = np.nan
        return pd.Series([quantity, weight_unit, weight_per_unit], index=['quantity', 'weight_unit', 'weight_per_unit'])
    
    # Default case for non-numeric information
    return pd.Series([np.nan, np.nan, np.nan], index=['quantity', 'weight_unit', 'weight_per_unit'])

# Apply the function to the DataFrame
df[['quantity', 'weight_unit', 'weight_per_unit']] = df['weight'].apply(parse_weight)

In [356]:
df.head()

Unnamed: 0,category,sub_category,url,name,weight,price,price_per_unit,nutritional_values,Energy,Fat,...,Silicon,Bicarbonate (or hydrogen carbonate),nitrate,sulfate,Chloride,Sulphate 4+,Minerals,quantity,weight_unit,weight_per_unit
0,fruits-vegetables,fruits,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,250g,7.9,3.16/100g,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',...",174 kJ (42 kcal),0.6 g,...,,,,,,,,1.0,g,250.0
1,fruits-vegetables,fruits,https://www.migros.ch/en/product/264601302100,Dates,200g,5.95,2.98/100g,,,,...,,,,,,,,1.0,g,200.0
2,fruits-vegetables,fruits,https://www.migros.ch/en/product/264500313200,Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,...,,,,,,,,,,
3,fruits-vegetables,fruits,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,1 kg,2.95,0.30/100g,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g...",403 kJ (95 kcal),< 0.5 g,...,,,,,,,,1.0,g,1000.0
4,fruits-vegetables,fruits,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,...,,,,,,,,,,


In [357]:
df["quantity"].unique()

array([ 1., nan,  3.,  2.,  4.,  6.,  5., 10.,  8., 50., 20.,  7., 12.,
        9., 24., 18.])

In [358]:
df["weight_unit"].unique()

array(['g', nan, 'pieces', 'ml', 'piece', 'balls'], dtype=object)

In [359]:
df["weight_per_unit"].unique()

array([2.500e+02, 2.000e+02,       nan, 1.000e+03, 5.000e+02, 7.500e+02,
       3.000e+02, 5.000e+01, 3.000e+03, 1.000e+02, 2.000e+03, 2.400e+02,
       4.000e+02, 1.500e+03, 7.000e+02, 8.500e+02, 6.000e+02, 2.500e+03,
       3.500e+02, 1.500e+02, 4.400e+02, 3.300e+02, 1.600e+02, 4.000e+01,
       4.750e+02, 7.500e+01, 8.000e+02, 1.250e+02, 4.800e+02, 5.000e+03,
       2.100e+02, 1.300e+02, 6.000e+01, 1.200e+02, 8.000e+01, 2.700e+02,
       1.800e+02, 2.300e+02, 2.200e+02, 3.200e+02, 1.400e+02, 2.600e+02,
       3.750e+02, 3.000e+01, 1.000e+01, 3.500e+01, 2.000e+01, 2.500e+01,
       3.800e+02, 4.600e+02, 3.700e+02, 5.400e+02, 4.700e+02, 4.100e+02,
       3.200e+03, 3.100e+03, 3.600e+02, 6.500e+02, 4.500e+01, 1.050e+02,
       1.350e+02, 9.000e+01, 1.100e+02, 7.000e+01, 8.500e+01, 6.500e+01,
       9.300e+01, 1.140e+02, 2.150e+02, 6.800e+01, 1.460e+02, 9.500e+01,
       8.200e+01, 4.600e+01, 2.850e+02, 1.380e+02, 5.800e+01, 5.300e+01,
       1.150e+02, 2.800e+02, 1.700e+02, 7.100e+01, 

In [360]:
# remove the weight column
df = df.drop(columns=['weight'])

In [361]:
# parse price column to numeric
print(df['price'].unique())

# # remove dash '-' at end of string if present
df['price'] = df['price'].str.replace('–', '')
print(df['price'].unique())

# parse price column to numeric now that all values are numeric
df['price'] = df['price'].astype(float)
df['price'].unique()

['7.90' '5.95' '2.80' '2.95' '6.30' '2.50' '3.95' '8.90' '3.70' '3.80'
 '4.50' '3.30' '3.50' '0.95' '5.50' '4.80' '4.95' '3.75' '6.95' '2.20'
 '3.20' '4.70' '2.70' '3.10' '7.30' '5.70' '4.30' '2.60' '5.60' '5.40'
 '2.90' '4.20' '1.40' '9.50' '7.50' '6.50' '5.80' '6.70' '6.90' '5.30'
 '1.85' '5.90' '5.85' '1.–' '1.50' '0.60' '5.–' '1.90' '3.–' '1.80' '9.60'
 '0.90' '2.30' '3.60' '0.50' '4.40' '6.40' '3.90' '1.20' '4.90' '4.65'
 '2.75' '1.30' '0.55' '9.90' '6.25' '11.30' '2.55' '9.95' '2.–' '3.40'
 '8.95' '2.40' '2.85' '4.55' '1.65' '1.10' '5.20' '8.50' '8.80' '7.80'
 '9.40' '4.60' '5.65' '7.95' '2.25' '2.15' '11.95' '1.60' '4.10' '4.15'
 '1.95' '3.65' '2.65' '6.60' '2.10' '7.40' '0.70' '3.85' '6.20' '5.75'
 '3.55' '3.05' '3.35' '4.45' '1.70' '11.40' nan '12.95' '11.50' '14.30'
 '11.20' '0.75' '5.10' '4.–' '1.25' '0.85' '6.–' '0.80' '15.–' '3.25'
 '4.05' '6.10' '4.25' '3.15' '1.15' '15.90' '3.45' '1.75' '1.35' '6.80'
 '14.90' '18.60' '9.20' '8.20' '12.90' '32.–' '13.50' '30.–' '8.40'
 '1

array([  7.9 ,   5.95,   2.8 ,   2.95,   6.3 ,   2.5 ,   3.95,   8.9 ,
         3.7 ,   3.8 ,   4.5 ,   3.3 ,   3.5 ,   0.95,   5.5 ,   4.8 ,
         4.95,   3.75,   6.95,   2.2 ,   3.2 ,   4.7 ,   2.7 ,   3.1 ,
         7.3 ,   5.7 ,   4.3 ,   2.6 ,   5.6 ,   5.4 ,   2.9 ,   4.2 ,
         1.4 ,   9.5 ,   7.5 ,   6.5 ,   5.8 ,   6.7 ,   6.9 ,   5.3 ,
         1.85,   5.9 ,   5.85,   1.  ,   1.5 ,   0.6 ,   5.  ,   1.9 ,
         3.  ,   1.8 ,   9.6 ,   0.9 ,   2.3 ,   3.6 ,   0.5 ,   4.4 ,
         6.4 ,   3.9 ,   1.2 ,   4.9 ,   4.65,   2.75,   1.3 ,   0.55,
         9.9 ,   6.25,  11.3 ,   2.55,   9.95,   2.  ,   3.4 ,   8.95,
         2.4 ,   2.85,   4.55,   1.65,   1.1 ,   5.2 ,   8.5 ,   8.8 ,
         7.8 ,   9.4 ,   4.6 ,   5.65,   7.95,   2.25,   2.15,  11.95,
         1.6 ,   4.1 ,   4.15,   1.95,   3.65,   2.65,   6.6 ,   2.1 ,
         7.4 ,   0.7 ,   3.85,   6.2 ,   5.75,   3.55,   3.05,   3.35,
         4.45,   1.7 ,  11.4 ,    nan,  12.95,  11.5 ,  14.3 ,  11.2 ,
      

In [362]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9602 entries, 0 to 9601
Data columns (total 74 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   category                                    9602 non-null   object 
 1   sub_category                                9602 non-null   object 
 2   url                                         9602 non-null   object 
 3   name                                        9602 non-null   object 
 4   price                                       9569 non-null   float64
 5   price_per_unit                              9508 non-null   object 
 6   nutritional_values                          7826 non-null   object 
 7   Energy                                      7690 non-null   object 
 8   Fat                                         1981 non-null   object 
 9   of which saturates                          1819 non-null   object 
 10  Carbohydrate

In [363]:
# print all of them to see what we are working with
for up in df['price_per_unit'].unique():
    print(up)

# split on "/"
df[['price_per_unit', 'price_unit']] = df['price_per_unit'].str.split('/', expand=True)
print(df['price_unit'].unique())
print(df['price_per_unit'].unique())

3.16/100g
2.98/100g
2.80/Piece
0.30/100g
0.63/100g
0.50/100g
1.58/100g
0.83/piece
1.78/100g
0.25/100g
5.95/piece
0.74/100g
3.95/piece
0.76/100g
0.89/100g
0.90/100g
0.22/100g
1.–/100g
5.95/Piece
0.70/100g
0.95/piece
0.55/100g
2.80/piece
0.37/100g
0.96/100g
1.65/100g
0.75/100g
1.48/100g
1.80/100g
0.35/100g
4.40/100g
0.66/100g
2.78/100g
0.15/100g
3.20/piece
0.94/100g
0.54/100g
0.31/100g
7.30/100g
2.20/100g
0.29/100g
4.30/piece
2.60/piece
2.24/100g
0.36/100g
1.21/100g
0.84/100g
1.40/100g
0.95/100g
3.–/100g
2.60/100g
1.16/100g
2.38/100g
1.68/100g
2.30/100g
2.12/100g
1.92/100g
1.85/100g
0.99/100g
0.59/100g
1.–/Piece
1.50/100g
2.80/100g
0.60/piece
3.20/Piece
0.38/100g
0.60/100g
1.83/100g
3.60/100g
1.–/piece
2.60/Piece
0.60/Piece
3.80/piece
1.18/100g
0.90/piece
0.87/piece
1.50/Piece
1.50/piece
0.46/100g
3.60/piece
0.42/100g
4.50/Piece
0.40/100g
0.50/Piece
3.95/Piece
5.60/Piece
2.20/piece
3.10/piece
0.33/100g
4.40/Piece
5.80/100g
0.69/100g
0.64/100g
0.39/100g
1.20/piece
1.20/Piece
4.40/piece
0.

In [364]:
df['price_per_unit'].unique()

array(['3.16', '2.98', '2.80', '0.30', '0.63', '0.50', '1.58', '0.83',
       '1.78', '0.25', '5.95', '0.74', '3.95', '0.76', '0.89', '0.90',
       '0.22', '1.–', '0.70', '0.95', '0.55', '0.37', '0.96', '1.65',
       '0.75', '1.48', '1.80', '0.35', '4.40', '0.66', '2.78', '0.15',
       '3.20', '0.94', '0.54', '0.31', '7.30', '2.20', '0.29', '4.30',
       '2.60', '2.24', '0.36', '1.21', '0.84', '1.40', '3.–', '1.16',
       '2.38', '1.68', '2.30', '2.12', '1.92', '1.85', '0.99', '0.59',
       '1.50', '0.60', '0.38', '1.83', '3.60', '3.80', '1.18', '0.87',
       '0.46', '0.42', '4.50', '0.40', '5.60', '3.10', '0.33', '5.80',
       '0.69', '0.64', '0.39', '1.20', '0.49', '0.47', '0.71', '2.95',
       '0.20', '0.32', '0.43', '0.28', '1.49', '0.13', '1.07', '0.45',
       '1.25', '2.50', '4.95', '1.30', '0.23', '0.16', '2.55', '9.95',
       '2.–', '0.88', '0.79', '1.23', '0.34', '1.19', '0.57', '1.37',
       '1.17', '2.17', '1.90', '1.02', '3.03', '0.48', '0.56', '1.10',
       '0

In [365]:
# remove dash '-' at end of string if present
df['price_per_unit'] = df['price_per_unit'].str.replace('–', '0')
print(df['price_per_unit'].unique())

['3.16' '2.98' '2.80' '0.30' '0.63' '0.50' '1.58' '0.83' '1.78' '0.25'
 '5.95' '0.74' '3.95' '0.76' '0.89' '0.90' '0.22' '1.0' '0.70' '0.95'
 '0.55' '0.37' '0.96' '1.65' '0.75' '1.48' '1.80' '0.35' '4.40' '0.66'
 '2.78' '0.15' '3.20' '0.94' '0.54' '0.31' '7.30' '2.20' '0.29' '4.30'
 '2.60' '2.24' '0.36' '1.21' '0.84' '1.40' '3.0' '1.16' '2.38' '1.68'
 '2.30' '2.12' '1.92' '1.85' '0.99' '0.59' '1.50' '0.60' '0.38' '1.83'
 '3.60' '3.80' '1.18' '0.87' '0.46' '0.42' '4.50' '0.40' '5.60' '3.10'
 '0.33' '5.80' '0.69' '0.64' '0.39' '1.20' '0.49' '0.47' '0.71' '2.95'
 '0.20' '0.32' '0.43' '0.28' '1.49' '0.13' '1.07' '0.45' '1.25' '2.50'
 '4.95' '1.30' '0.23' '0.16' '2.55' '9.95' '2.0' '0.88' '0.79' '1.23'
 '0.34' '1.19' '0.57' '1.37' '1.17' '2.17' '1.90' '1.02' '3.03' '0.48'
 '0.56' '1.10' '0.98' '0.85' '1.88' '1.69' '0.78' '0.52' '4.60' '1.81'
 '0.44' '0.80' '0.53' '2.23' '2.25' nan '3.47' '2.40' '3.75' '5.30' '1.08'
 '14.75' '5.98' '2.83' '3.25' '1.60' '1.96' '0.41' '2.45' '0.26' '0.12'
 '1.

In [366]:
# parse to float
df['price_per_unit'] = df['price_per_unit'].astype(float)
print(df['price_per_unit'].unique())

# sort by price_per_unit and print first 5
df.sort_values(by='price_per_unit', ascending=False).head(20)

[3.1600e+00 2.9800e+00 2.8000e+00 3.0000e-01 6.3000e-01 5.0000e-01
 1.5800e+00 8.3000e-01 1.7800e+00 2.5000e-01 5.9500e+00 7.4000e-01
 3.9500e+00 7.6000e-01 8.9000e-01 9.0000e-01 2.2000e-01 1.0000e+00
 7.0000e-01 9.5000e-01 5.5000e-01 3.7000e-01 9.6000e-01 1.6500e+00
 7.5000e-01 1.4800e+00 1.8000e+00 3.5000e-01 4.4000e+00 6.6000e-01
 2.7800e+00 1.5000e-01 3.2000e+00 9.4000e-01 5.4000e-01 3.1000e-01
 7.3000e+00 2.2000e+00 2.9000e-01 4.3000e+00 2.6000e+00 2.2400e+00
 3.6000e-01 1.2100e+00 8.4000e-01 1.4000e+00 3.0000e+00 1.1600e+00
 2.3800e+00 1.6800e+00 2.3000e+00 2.1200e+00 1.9200e+00 1.8500e+00
 9.9000e-01 5.9000e-01 1.5000e+00 6.0000e-01 3.8000e-01 1.8300e+00
 3.6000e+00 3.8000e+00 1.1800e+00 8.7000e-01 4.6000e-01 4.2000e-01
 4.5000e+00 4.0000e-01 5.6000e+00 3.1000e+00 3.3000e-01 5.8000e+00
 6.9000e-01 6.4000e-01 3.9000e-01 1.2000e+00 4.9000e-01 4.7000e-01
 7.1000e-01 2.9500e+00 2.0000e-01 3.2000e-01 4.3000e-01 2.8000e-01
 1.4900e+00 1.3000e-01 1.0700e+00 4.5000e-01 1.2500e+00 2.5000

Unnamed: 0,category,sub_category,url,name,price,price_per_unit,nutritional_values,Energy,Fat,of which saturates,...,Bicarbonate (or hydrogen carbonate),nitrate,sulfate,Chloride,Sulphate 4+,Minerals,quantity,weight_unit,weight_per_unit,price_unit
3758,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/103389300000,"Migros Bio · hemp oil, cold pressed",8.95,3580.0,"{'Energy': '3404 kJ (828 kcal)', 'Fathigh': '9...",,,,...,,,,,,,1.0,ml,250.0,100g
4034,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/103389400000,Da Emilio · Virgin olive oil · al tartufo bianco,7.2,2880.0,"{'Energy': '3404 kJ (828 kcal)', 'Fathigh': '9...",,,,...,,,,,,,1.0,ml,250.0,100g
4138,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/106206800000,Bio Safran Leprotto,5.4,1800.0,"{'Energy': '1487 kJ (354 kcal)', 'Fatmedium': ...",1307 kJ (312 kcal),4.78 g,0.7 g,...,,,,,,,2.0,g,0.15,100g
4222,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/106206400000,Sélection · filaments of saffron,5.5,1571.4,,~ 771 kJ (~ 186 kcal),,,...,,,,,,,1.0,g,0.35,100g
3637,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/106211800000,Migros Bio · Saffron · ground,4.3,860.0,,~ 1004 kJ (~ 242 kcal),,,...,,,,,,,4.0,g,0.12,100g
3561,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/106204400000,M-Classic · saffron · 4 sachets of 125mg,3.3,660.0,"{'Energy': '1413 kJ (336 kcal)', 'Fat': '4.4 g...",2695 kJ (655 kcal),,,...,,,,,,,4.0,g,0.12,100g
4202,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/106232600000,Swiss Alpine Herbs Bio Alpenblüten,6.9,230.0,,~ 420 kJ (~ 101 kcal),~ 6 g,~ 1.9 g,...,,,,,,,1.0,g,3.0,100g
4171,pasta-condiments-canned-food,spices-sauces,https://www.migros.ch/en/product/106294900000,GSB Bio Trio für Käse,9.9,230.0,,~ 907 kJ (~ 217 kcal),,,...,,,,,,,1.0,g,120.0,100g
2430,bread-pastries-breakfast,baking-ingredients,https://www.migros.ch/en/product/102592000000,Alnatura · bourbon vanilla · ground,5.9,118.0,,1700 kJ (400 kcal),,,...,,,,,,,1.0,g,5.0,100g
2429,bread-pastries-breakfast,baking-ingredients,https://www.migros.ch/en/product/102795100000,Alnatura · Gousses de vanille,6.3,105.0,,2547 kJ (616 kcal),53 g,4 g,...,,,,,,,1.0,piece,,100g


In [367]:
# the anomalies are already present from the scraping, although the website displays the price per unit correctly

In [368]:




# Change cols with "piece" to "Piece"
df['price_unit'] = df['price_unit'].str.replace('piece', 'Piece')
print(df['price_unit'].unique())

['100g' 'Piece' nan '100ml']


In [369]:
print(df['price_per_unit'].unique())

[3.1600e+00 2.9800e+00 2.8000e+00 3.0000e-01 6.3000e-01 5.0000e-01
 1.5800e+00 8.3000e-01 1.7800e+00 2.5000e-01 5.9500e+00 7.4000e-01
 3.9500e+00 7.6000e-01 8.9000e-01 9.0000e-01 2.2000e-01 1.0000e+00
 7.0000e-01 9.5000e-01 5.5000e-01 3.7000e-01 9.6000e-01 1.6500e+00
 7.5000e-01 1.4800e+00 1.8000e+00 3.5000e-01 4.4000e+00 6.6000e-01
 2.7800e+00 1.5000e-01 3.2000e+00 9.4000e-01 5.4000e-01 3.1000e-01
 7.3000e+00 2.2000e+00 2.9000e-01 4.3000e+00 2.6000e+00 2.2400e+00
 3.6000e-01 1.2100e+00 8.4000e-01 1.4000e+00 3.0000e+00 1.1600e+00
 2.3800e+00 1.6800e+00 2.3000e+00 2.1200e+00 1.9200e+00 1.8500e+00
 9.9000e-01 5.9000e-01 1.5000e+00 6.0000e-01 3.8000e-01 1.8300e+00
 3.6000e+00 3.8000e+00 1.1800e+00 8.7000e-01 4.6000e-01 4.2000e-01
 4.5000e+00 4.0000e-01 5.6000e+00 3.1000e+00 3.3000e-01 5.8000e+00
 6.9000e-01 6.4000e-01 3.9000e-01 1.2000e+00 4.9000e-01 4.7000e-01
 7.1000e-01 2.9500e+00 2.0000e-01 3.2000e-01 4.3000e-01 2.8000e-01
 1.4900e+00 1.3000e-01 1.0700e+00 4.5000e-01 1.2500e+00 2.5000

In [370]:
df.head()

Unnamed: 0,category,sub_category,url,name,price,price_per_unit,nutritional_values,Energy,Fat,of which saturates,...,Bicarbonate (or hydrogen carbonate),nitrate,sulfate,Chloride,Sulphate 4+,Minerals,quantity,weight_unit,weight_per_unit,price_unit
0,fruits-vegetables,fruits,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,7.9,3.16,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',...",174 kJ (42 kcal),0.6 g,0 g,...,,,,,,,1.0,g,250.0,100g
1,fruits-vegetables,fruits,https://www.migros.ch/en/product/264601302100,Dates,5.95,2.98,,,,,...,,,,,,,1.0,g,200.0,100g
2,fruits-vegetables,fruits,https://www.migros.ch/en/product/264500313200,Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,2 g,...,,,,,,,,,,Piece
3,fruits-vegetables,fruits,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,2.95,0.3,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g...",403 kJ (95 kcal),< 0.5 g,0.1 g,...,,,,,,,1.0,g,1000.0,100g
4,fruits-vegetables,fruits,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",643 kJ (156 kcal),14 g,2 g,...,,,,,,,,,,Piece


In [371]:
# Energy column

# Function to split the Energy column into energy_kJ and energy_kcal
def split_energy(energy):
    if pd.isna(energy) or energy.strip() == '':
        return np.nan, np.nan
    try:
        # Extract the kJ value (before the first space)
        energy_kJ = energy.split(' ')[0]
        # Extract the kcal value (inside the parentheses)
        energy_kcal = energy.split('(')[1].replace('kcal)', '').strip()
        return float(energy_kJ), float(energy_kcal)
    except (IndexError, ValueError):
        return np.nan, np.nan

# Apply the function to split the Energy column
df[['energy_kJ', 'energy_kcal']] = df['Energy'].apply(lambda x: pd.Series(split_energy(x)))

# Drop the original Energy column
df.drop(columns=['Energy'], inplace=True)

# Display the result
df.head()


Unnamed: 0,category,sub_category,url,name,price,price_per_unit,nutritional_values,Fat,of which saturates,Carbohydrate,...,sulfate,Chloride,Sulphate 4+,Minerals,quantity,weight_unit,weight_per_unit,price_unit,energy_kJ,energy_kcal
0,fruits-vegetables,fruits,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,7.9,3.16,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',...",0.6 g,0 g,6 g,...,,,,,1.0,g,250.0,100g,174.0,42.0
1,fruits-vegetables,fruits,https://www.migros.ch/en/product/264601302100,Dates,5.95,2.98,,,,,...,,,,,1.0,g,200.0,100g,,
2,fruits-vegetables,fruits,https://www.migros.ch/en/product/264500313200,Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",14 g,2 g,1.8 g,...,,,,,,,,Piece,643.0,156.0
3,fruits-vegetables,fruits,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,2.95,0.3,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g...",< 0.5 g,0.1 g,21 g,...,,,,,1.0,g,1000.0,100g,403.0,95.0
4,fruits-vegetables,fruits,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",14 g,2 g,1.8 g,...,,,,,,,,Piece,643.0,156.0


In [372]:
cols_to_parse = ['Fat', 'of which saturates',
       'Carbohydrate', 'of which sugars', 'Fibre', 'Protein', 'Salt', 'Fatlow',
       'of which saturateslow', 'of which sugarsmedium', 'Saltlow',
       'of which sugarshigh', 'Saltmedium', 'of which sugarslow', 'Salthigh',
       'of which mono-unsaturates', 'Fathigh', 'of which saturatesmedium',
       'Fatmedium', 'of which saturateshigh', 'Sodium', 'Energy value in kJ',
       'Energy value in kcal', 'Vitamin B1 (thiamin)',
       'Vitamin B2 (riboflavin)', 'Niacin', 'Vitamin B6',
       'folic acid / folate', 'Iron', 'Magnesium',
       'of which polyvalents alcohols', 'Phosphorus', 'Mangan', 'Vitamin B12',
       'Vitamin D', 'Pantothenic acid', 'Calcium', 'folic acid', 'Vitamin E',
       'Vitamin K', 'Vitamin C', 'Biotin', 'Omega-3 fatty acids', 'Potassium',
       'Zinc', 'Alpha linolenic acid (omega-3 fatty acids)', 'Water',
       'Omega-6 fatty acids', 'Iodine', 'Fluoride', 'of which starch',
       'Vitamin A', 'Cholesterin', 'Selen', 'Caffeine', 'Copper', 'Silicon',
       'Bicarbonate (or hydrogen carbonate)', 'nitrate', 'sulfate', 'Chloride',
       'Sulphate 4+', 'Minerals']

In [373]:
# Function to parse the weight values
def parse_value(value):
    if pd.isna(value) or value.strip() == '':
        return np.nan
    try:
        # Remove leading '~', '<' and other characters that aren't part of the numeric value
        value = value.replace('~', '').replace('<', '').strip()
        # Extract the numeric part
        numeric_value = ''.join([char for char in value if char.isdigit() or char == '.' or char == ','])
        # Convert to float (handle European comma notation)
        return float(numeric_value.replace(',', '.'))
    except (ValueError, TypeError):
        return np.nan

# Apply the function to parse the numeric part of the values
for col in cols_to_parse:
    df[col] = df[col].apply(parse_value)
    df.rename(columns={col: col + ' (g)'}, inplace=True)

# Display the result
df.head()

Unnamed: 0,category,sub_category,url,name,price,price_per_unit,nutritional_values,Fat (g),of which saturates (g),Carbohydrate (g),...,sulfate (g),Chloride (g),Sulphate 4+ (g),Minerals (g),quantity,weight_unit,weight_per_unit,price_unit,energy_kJ,energy_kcal
0,fruits-vegetables,fruits,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,7.9,3.16,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',...",0.6,0.0,6.0,...,,,,,1.0,g,250.0,100g,174.0,42.0
1,fruits-vegetables,fruits,https://www.migros.ch/en/product/264601302100,Dates,5.95,2.98,,,,,...,,,,,1.0,g,200.0,100g,,
2,fruits-vegetables,fruits,https://www.migros.ch/en/product/264500313200,Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",14.0,2.0,1.8,...,,,,,,,,Piece,643.0,156.0
3,fruits-vegetables,fruits,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,2.95,0.3,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g...",0.5,0.1,21.0,...,,,,,1.0,g,1000.0,100g,403.0,95.0
4,fruits-vegetables,fruits,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",14.0,2.0,1.8,...,,,,,,,,Piece,643.0,156.0


In [374]:
# convert cols to strings
df['category'] = df['category'].astype(str)
df['sub_category'] = df['sub_category'].astype(str)
df['url'] = df['url'].astype(str)
df['name'] = df['name'].astype(str)
df['price_unit'] = df['price_unit'].astype(str)
df['weight_unit'] = df['weight_unit'].astype(str)

In [375]:

# Display the result
df.head()

Unnamed: 0,category,sub_category,url,name,price,price_per_unit,nutritional_values,Fat (g),of which saturates (g),Carbohydrate (g),...,sulfate (g),Chloride (g),Sulphate 4+ (g),Minerals (g),quantity,weight_unit,weight_per_unit,price_unit,energy_kJ,energy_kcal
0,fruits-vegetables,fruits,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,7.9,3.16,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',...",0.6,0.0,6.0,...,,,,,1.0,g,250.0,100g,174.0,42.0
1,fruits-vegetables,fruits,https://www.migros.ch/en/product/264601302100,Dates,5.95,2.98,,,,,...,,,,,1.0,g,200.0,100g,,
2,fruits-vegetables,fruits,https://www.migros.ch/en/product/264500313200,Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",14.0,2.0,1.8,...,,,,,,,,Piece,643.0,156.0
3,fruits-vegetables,fruits,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,2.95,0.3,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g...",0.5,0.1,21.0,...,,,,,1.0,g,1000.0,100g,403.0,95.0
4,fruits-vegetables,fruits,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,2.8,2.8,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',...",14.0,2.0,1.8,...,,,,,,,,Piece,643.0,156.0


In [376]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9602 entries, 0 to 9601
Data columns (total 76 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   category                                        9602 non-null   object 
 1   sub_category                                    9602 non-null   object 
 2   url                                             9602 non-null   object 
 3   name                                            9602 non-null   object 
 4   price                                           9569 non-null   float64
 5   price_per_unit                                  9508 non-null   float64
 6   nutritional_values                              7826 non-null   object 
 7   Fat (g)                                         1969 non-null   float64
 8   of which saturates (g)                          1807 non-null   float64
 9   Carbohydrate (g)                         

In [377]:
# reorder columns: name, url, category, sub_category, total_price, price_per_unit, price_unit, quantity, weight_per_unit, weight_unit, energy_kJ, energy_kcal, Fat (g), of which saturates (g), Carbohydrate (g), of which sugars (g), Fibre (g), Protein (g), Salt (g), Fatlow (g), of which saturateslow (g), of which sugarsmedium (g), Saltlow (g), of which sugarshigh (g), Saltmedium (g), of which sugarslow (g), Salthigh (g), of which mono-unsaturates (g), Fathigh (g), of which saturatesmedium (g), Fatmedium (g), of which saturateshigh (g), Sodium (g), Energy value in kJ (g), Energy value in kcal (g), Vitamin B1 (thiamin) (g), Vitamin B2 (riboflavin) (g), Niacin (g), Vitamin B6 (g), folic acid / folate (g), Iron (g), Magnesium (g), of which polyvalents alcohols (g), Phosphorus (g), Mangan (g), Vitamin B12 (g), Vitamin D (g), Pantothenic acid (g), Calcium (g), folic acid (g), Vitamin E (g), Vitamin K (g), Vitamin C (g), Biotin (g), Omega-3 fatty acids (g), Potassium (g), Zinc (g), Alpha linolenic acid (omega-3 fatty acids) (g), Water (g), Omega-6 fatty acids (g), Iodine (g), Fluoride (g), of which starch (g), Vitamin A (g), Cholesterin (g), Selen (g), Caffeine (g), Copper (g), Silicon (g), Bicarbonate (or hydrogen carbonate) (g), nitrate (g), sulfate (g), Chloride (g), Sulphate 4+ (g), Minerals (g)

df = df[['name', 'url', 'category', 'sub_category', 'price', 'price_per_unit', 'price_unit', 'quantity', 'weight_per_unit', 'weight_unit', 'energy_kJ', 'energy_kcal', 'Fat (g)', 'of which saturates (g)', 'Carbohydrate (g)', 'of which sugars (g)', 'Fibre (g)', 'Protein (g)', 'Salt (g)', 'Fatlow (g)', 'of which saturateslow (g)', 'of which sugarsmedium (g)', 'Saltlow (g)', 'of which sugarshigh (g)', 'Saltmedium (g)', 'of which sugarslow (g)', 'Salthigh (g)', 'of which mono-unsaturates (g)', 'Fathigh (g)', 'of which saturatesmedium (g)', 'Fatmedium (g)', 'of which saturateshigh (g)', 'Sodium (g)', 'Energy value in kJ (g)', 'Energy value in kcal (g)', 'Vitamin B1 (thiamin) (g)', 'Vitamin B2 (riboflavin) (g)', 'Niacin (g)', 'Vitamin B6 (g)', 'folic acid / folate (g)', 'Iron (g)', 'Magnesium (g)', 'of which polyvalents alcohols (g)', 'Phosphorus (g)', 'Mangan (g)', 'Vitamin B12 (g)', 'Vitamin D (g)', 'Pantothenic acid (g)', 'Calcium (g)', 'folic acid (g)', 'Vitamin E (g)', 'Vitamin K (g)', 'Vitamin C (g)', 'Biotin (g)', 'Omega-3 fatty acids (g)', 'Potassium (g)', 'Zinc (g)', 'Alpha linolenic acid (omega-3 fatty acids) (g)', 'Water (g)', 'Omega-6 fatty acids (g)', 'Iodine (g)', 'Fluoride (g)', 'of which starch (g)', 'Vitamin A (g)', 'Cholesterin (g)', 'Selen (g)', 'Caffeine (g)', 'Copper (g)', 'Silicon (g)', 'Bicarbonate (or hydrogen carbonate) (g)', 'nitrate (g)', 'sulfate (g)', 'Chloride (g)', 'Sulphate 4+ (g)', 'Minerals (g)']]

In [378]:
# rename all columns to lowercase
df.columns = df.columns.str.lower()
df.columns

Index(['name', 'url', 'category', 'sub_category', 'price', 'price_per_unit',
       'price_unit', 'quantity', 'weight_per_unit', 'weight_unit', 'energy_kj',
       'energy_kcal', 'fat (g)', 'of which saturates (g)', 'carbohydrate (g)',
       'of which sugars (g)', 'fibre (g)', 'protein (g)', 'salt (g)',
       'fatlow (g)', 'of which saturateslow (g)', 'of which sugarsmedium (g)',
       'saltlow (g)', 'of which sugarshigh (g)', 'saltmedium (g)',
       'of which sugarslow (g)', 'salthigh (g)',
       'of which mono-unsaturates (g)', 'fathigh (g)',
       'of which saturatesmedium (g)', 'fatmedium (g)',
       'of which saturateshigh (g)', 'sodium (g)', 'energy value in kj (g)',
       'energy value in kcal (g)', 'vitamin b1 (thiamin) (g)',
       'vitamin b2 (riboflavin) (g)', 'niacin (g)', 'vitamin b6 (g)',
       'folic acid / folate (g)', 'iron (g)', 'magnesium (g)',
       'of which polyvalents alcohols (g)', 'phosphorus (g)', 'mangan (g)',
       'vitamin b12 (g)', 'vitamin d (g

In [379]:
# save final dataframe
df.to_csv('../data/final_dataset.csv', index=False)

In [380]:
# final check
len(df)

9602