# Scrape

### 1. Setup

The setup serves for:
- Importing libraries.
- Speficying utility functions.
- Setting up the chrome webdriver.
- Defining scraping constants.

In [1]:
# imports

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd
import random

In [2]:
# utils

def sleep():
    time.sleep(0.5 + random.random())

In [3]:
# set up ChromeDriver

options = Options()
options.add_argument('--headless')  # save GUI rendering time
options.add_argument('--disable-gpu')  # prevents errors in headless mode

service = Service('/usr/local/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

# open base page
BASE_URL = 'https://www.migros.ch/en'
driver.get(BASE_URL)
time.sleep(5)

In [18]:
# constants

# go to subcategory
SUBCAT_XPATH = '/html/body/app-root/div[1]/lsp-shop/div/div/div/div/ng-component/main/div/ng-component/mo-category-level-1-children-categories/div/ul/li[{}]/div/div/a'

# go to page
TEMPLATE_URL = '?page={}'

# hand picked from base url
CATEGORIES = [
    'fruits-vegetables',
    'bread-pastries-breakfast',
    'pasta-condiments-canned-food',
    'snacks-sweets',
    'frozen-food',
    'drinks-coffee-tea',
    'wine-beer-spirits'
]

### 2. Scrape links

Considerations and steps:
- Since all products have pages have the same template,
it is convenient separating the link retrieval and the product specifics scraping.
- In this first step we navigate to category, sub-category, and scrape all products links from the subcategories subpages.
- All links are saved a dataframe, and then permanently saved to csv. 
- The same dataframe will be used to store the product specifics in the next step.

In [9]:
# list of dictionaries 
data = []

# big giant massive loop
for cat in CATEGORIES:
    url = BASE_URL + '/category/' + cat
    driver.get(url) 

    sleep()

    # dynamically find subcategories (li[n] elements)
    subcategory_urls = []
    li_index = 1

    while True:
        sleep()
        subcat_xpath = SUBCAT_XPATH.format(li_index)
        try:
            subcategory = driver.find_element(By.XPATH, SUBCAT_XPATH)
            subcategory_url = subcategory.get_attribute('href')
            subcategory_urls.append(subcategory_url)

            print(f"Found subcategory link: {subcategory_url}")
            li_index += 1

        except NoSuchElementException:
            print("No more subcategory elements found.")
            break

    print(f"Subcategories found: {subcategory_urls}")

    for subcat_url in subcategory_urls:
        driver.get(subcat_url) 
        sleep()

        # extract products and go to the next page
        scrape_url = subcat_url + TEMPLATE_URL
        print(f"Scraping url: {scrape_url}")
        product_urls = []
        page = 1
        while True:
            sleep()
            url = scrape_url.format(page)
            driver.get(url)
            print(f"Scraping url: {url}")
            
            try:
                product_list = driver.find_elements(By.XPATH, '/html/body/app-root/div[1]/lsp-shop/div/div/div/div/ng-component/main/div/ng-component/div/mo-items-display/div/ul/li/article/div/div[1]/a[1]')
                if not product_list:
                    print(f"No more products found on page {page}. Exiting.")
                    break
                
                for product in product_list:
                    try:
                        product_url = product.get_attribute('href')
                        product_urls.append(product_url)
                    except Exception as e:
                        print(f"Error processing product: {e}")
                        continue
                
                print(f"Finished scraping page {page}.")
                page += 1
            
            except NoSuchElementException:
                print(f"Error on page {page}, stopping.")
                break

        # store
        current_data = {
            'category': cat,
            'sub_category': subcat_url.split('/')[-1],
            'url': product_urls
        }
        data.append(current_data)

        # report 
        print(f"Finished scraping subcategory {subcat_url}")
        print(f"Data: {current_data}")

Found subcategory link: https://www.migros.ch/en/category/fruits-vegetables/fruits
Found subcategory link: https://www.migros.ch/en/category/fruits-vegetables/vegetables
Found subcategory link: https://www.migros.ch/en/category/fruits-vegetables/root-vegetables
Found subcategory link: https://www.migros.ch/en/category/fruits-vegetables/salad
Found subcategory link: https://www.migros.ch/en/category/fruits-vegetables/fresh-herbs-spices
Found subcategory link: https://www.migros.ch/en/category/fruits-vegetables/ready-to-use
Found subcategory link: https://www.migros.ch/en/category/fruits-vegetables/vitamin-baskets
No more subcategory elements found.
Subcategories found: ['https://www.migros.ch/en/category/fruits-vegetables/fruits', 'https://www.migros.ch/en/category/fruits-vegetables/vegetables', 'https://www.migros.ch/en/category/fruits-vegetables/root-vegetables', 'https://www.migros.ch/en/category/fruits-vegetables/salad', 'https://www.migros.ch/en/category/fruits-vegetables/fresh-her

In [None]:
# convert to dataframe

df = pd.DataFrame(data)
df = df.explode('url')
df.to_csv('products.csv', index=False)

### 3. Scrape product information

In [None]:
df = pd.read_csv('data/products.csv')

from bs4 import BeautifulSoup

# Initialize a list to store all the product data
product_data = []

counter = 0
total = len(df)

# start from where we left off
start_index = 2337  
# big giant massive loop
for index, row in df.iloc[start_index:].iterrows():
    product_url = row['url']
    driver.get(product_url)
    sleep()  # Wait for the page to load

    try:
        # Extract the product name
        product_name = driver.find_element(By.CSS_SELECTOR, 'div.core-product-title h1').text

        # Extract the product price
        try:
            product_price = driver.find_element(By.CSS_SELECTOR, 'span.actual').text
        except Exception:
            product_price = "N/A"
            print(f"Product price not found for {product_name}")

        # Extract the product weight
        try:
            product_weight = driver.find_element(By.CSS_SELECTOR, 'span.weight-priceUnit').text
        except Exception:
            product_weight = "N/A"
            print(f"Product weight not found for {product_name}")

        # Extract the price per unit (e.g., per 100g)
        try:
            price_per_unit = driver.find_element(By.CSS_SELECTOR, 'span.listMode-priceUnit').text
        except Exception:
            price_per_unit = "N/A"
            print(f"Price per unit not found for {product_name}")

    
        ########
        # Initialize dictionary to store nutritional information
        nutrition_info = {}
        try:
            # Extract the full HTML of the table containing nutritional information
            table_html = driver.find_element(By.XPATH, '/html/body/app-root/div[1]/lsp-shop/div/div/div/div/mo-product-detail-container/main/div/mo-product-detail-page/div[1]/div[3]/mo-product-detail-information/mo-tab-layout/div/div/div[2]/mo-product-detail-nutrients-information/table').get_attribute('outerHTML')

            # Use BeautifulSoup to parse the HTML
            soup = BeautifulSoup(table_html, 'html.parser')

            # Find all rows in the table
            rows = soup.find('tbody').find_all('tr')

            # Loop through the rows and extract data
            for row in rows:
                try:
                    # Extract the nutrient name from the first column (td[1])
                    nutrient_name = row.find_all('td')[0].get_text(strip=True)

                    # Extract the nutrient value from the second column (td[2])
                    nutrient_value = row.find_all('td')[1].get_text(strip=True)

                    # Store the nutrient name and value in the dictionary
                    nutrition_info[nutrient_name] = nutrient_value

                except Exception as e:
                    print(f"Error processing row: {e}")
                    continue

        except Exception as e:
            print(f"Error extracting table.")
        ########

        # Store the scraped data in the list
        current_data = {
            'Product URL': product_url,
            'Name': product_name,
            'Weight': product_weight,
            'Price': product_price,
            'Price per 100g': price_per_unit,
            'Nutritional Information': nutrition_info
        }
        product_data.append(current_data)

        counter += 1

        print(f"{counter}/{total}: {current_data}")

    except Exception as e:
        print(f"Error processing product at {product_url}: {str(e)}")
        continue

1/9602: {'Product URL': 'https://www.migros.ch/en/product/113382210000', 'Name': 'Schoggimoussetorte', 'Weight': '950g', 'Price': '26.–', 'Price per 100g': '2.74/100g', 'Nutritional Information': {'Energy': '1291 kJ (308 kcal)', 'Fathigh': '18 g', 'of which saturatesmedium': '3.2 g', 'Carbohydrate': '33 g', 'of which sugarshigh': '28 g', 'Protein': '3.7 g', 'Saltlow': '0.1 g'}}
2/9602: {'Product URL': 'https://www.migros.ch/en/product/113313210000', 'Name': 'Fruchttorte', 'Weight': '580g', 'Price': '14.90', 'Price per 100g': '2.57/100g', 'Nutritional Information': {'Energy': '860 kJ (205 kcal)', 'Fatmedium': '9 g', 'of which saturateslow': '0.5 g', 'Carbohydrate': '28 g', 'of which sugarsmedium': '18 g', 'Protein': '3.6 g', 'Saltlow': '0 g'}}
3/9602: {'Product URL': 'https://www.migros.ch/en/product/113317000200', 'Name': 'Himbeerrahmtorte', 'Weight': '580g', 'Price': '14.90', 'Price per 100g': '2.57/100g', 'Nutritional Information': {'Energy': '729 kJ (174 kcal)', 'Fatmedium': '8.6 g'

In [50]:
# idx: 2337
product_data[-1]

{'Product URL': 'https://www.migros.ch/en/product/113804210000',
 'Name': 'Creme-Cornet',
 'Weight': '140g',
 'Price': '4.40',
 'Price per 100g': '3.14/100g',
 'Nutritional Information': {'Energy': '922 kJ (220 kcal)',
  'Fatmedium': '7.4 g',
  'of which saturateslow': '1.5 g',
  'Carbohydrate': '36 g',
  'of which sugarsmedium': '18 g',
  'Protein': '3.3 g',
  'Saltmedium': '0.4 g'}}

In [51]:
df = pd.DataFrame(product_data)
df.to_csv('product_data.csv', index=False)

df.head(10)

Unnamed: 0,Product URL,Name,Weight,Price,Price per 100g,Nutritional Information
0,https://www.migros.ch/en/product/265680802400,Migros Bio · blueberries,250g,7.9,3.16/100g,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',..."
1,https://www.migros.ch/en/product/264601302100,Dates,200g,5.95,2.98/100g,{}
2,https://www.migros.ch/en/product/264500313200,Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',..."
3,https://www.migros.ch/en/product/264280101000,Bio Fairtrade · Bananas,1 kg,2.95,0.30/100g,"{'Energy': '403 kJ (95 kcal)', 'Fat': '< 0.5 g..."
4,https://www.migros.ch/en/product/264580113200,Migros Bio · Avocado,,2.8,2.80/Piece,"{'Energy': '643 kJ (156 kcal)', 'Fat': '14 g',..."
5,https://www.migros.ch/en/product/260283504100,Migros Bio · Apples · Season,,6.3,0.63/100g,"{'Energy': '232 kJ (55 kcal)', 'Fat': '< 0.5 g..."
6,https://www.migros.ch/en/product/265300902300,"Fresca · grapes · Coreless, white",500g,2.5,0.50/100g,"{'Energy': '293 kJ (69 kcal)', 'Fat': '< 0.5 g..."
7,https://www.migros.ch/en/product/264681502300,Migros Bio · dates · Organic,250g,3.95,1.58/100g,{}
8,https://www.migros.ch/en/product/264400902200,Kiwi · Gold,3 Stück,2.5,0.83/piece,"{'Energy': '229 kJ (54 kcal)', 'Fat': '0.6 g',..."
9,https://www.migros.ch/en/product/265606702300,Blueberries,500g,8.9,1.78/100g,"{'Energy': '174 kJ (42 kcal)', 'Fat': '0.6 g',..."
