# Scraping data from the "open_food_facts" website

## Import the required libraries


In [1]:
import pandas as pd
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager # sustituye al archivo
from selenium import webdriver
import time
import warnings
warnings.filterwarnings('ignore')
from selenium.webdriver.common.by import By # By es para buscar por tag, clase, id...
from fake_useragent import UserAgent
from selenium.webdriver.support.ui import WebDriverWait   # es para esperar
from selenium.webdriver.support import expected_conditions as EC  # condiciones esperadas...
from selenium.webdriver import ActionChains as AC   # acciones encadenadas, rollo doble click
from selenium.webdriver.common.keys import Keys  # manejar teclas


In [2]:
# Install the chromedriver binary through webdriver_manager and keep its path
from selenium.webdriver.chrome.service import Service

PATH = ChromeDriverManager().install()

# Start the Chrome web driver. The driver path is handed over through a Service
# object: passing it as a positional argument is deprecated in Selenium 4 and
# rejected by current releases.
driver = webdriver.Chrome(service=Service(PATH))

# Listing page of the Spanish Open Food Facts site, sorted by popularity
url = 'https://es.openfoodfacts.org/?sort_by=popularity'

# Connect to the URL
driver.get(url)

In [3]:
# Build a set of Chrome options meant to make the automated browser less conspicuous
opciones = Options()

# Experimental options that hide the usual automation markers
opciones.add_experimental_option('excludeSwitches', ['enable-automation'])  # exclude the 'enable-automation' switch
opciones.add_experimental_option('useAutomationExtension', False)  # disable the automation extension

# Run the browser in headless mode. The flag is passed as a command-line argument
# because the `Options.headless` setter is deprecated in Selenium 4 and, once it is
# removed, assigning to it silently does nothing.
opciones.add_argument('--headless')

opciones.add_argument('user-data-dir=cookies')  # user data directory used for cookies

opciones.add_argument('--incognito')  # enable incognito mode

# Random user agent string (UserAgent is already imported in the imports cell)
usuario = UserAgent().random

# NOTE(review): the driver created in the previous cell was started without these
# options, and `usuario` is only printed, never added to `opciones`. Pass
# `options=opciones` when creating the driver if these settings should take effect.

print(usuario)

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67


In [4]:
# Scrape the product links of the listing pages 1 to 300.
# Each link leads to the page of one food product. The links are collected in list1.

FIRST_PAGE = 1
LAST_PAGE = 300
PAGE_LOAD_TIMEOUT = 15  # seconds to wait for a listing page to (re)load

list1 = []

# The driver is expected to be on the first listing page when this cell starts.
for page_num in range(FIRST_PAGE, LAST_PAGE + 1):

    # Wait until the product list of the current page is present in the DOM
    parent_div = WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'ul#products_match_all'))
    )

    # Extract the href attribute of every <a> element inside the product list
    for link in parent_div.find_elements(By.TAG_NAME, 'a'):
        href = link.get_attribute('href')
        if href:  # skip anchors without a target so driver.get() never receives None
            list1.append(href)

    # The last page has been scraped: there is nothing left to navigate to
    if page_num == LAST_PAGE:
        break

    # Click the pagination link that leads to the next page
    button = driver.find_element(By.CSS_SELECTOR, f'a[href="/{page_num + 1}?sort_by=popularity"]')
    button.click()

    # Wait for the old product list to be detached, i.e. for the navigation to
    # actually happen, so the next iteration does not re-read the same page
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(EC.staleness_of(parent_div))


KeyboardInterrupt: 

In [None]:
# Number of product links collected in list1
len(list1)

In [None]:
# Scrape different types of information from each link
# Add the information of each product (one dict per product) to "data_list"

from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException

UNKNOWN = 'unknown'  # placeholder stored when a field is not present on the page
NUTRITION_KEYS = ('Energía', 'Grasas saturadas', 'Azúcares')  # rows kept from the nutrition table


def _element_text(browser, css_selector):
    """Return the text of the first element matching css_selector, or 'unknown' if there is none."""
    try:
        return browser.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return UNKNOWN


def _nested_text(browser, outer_css, inner_css):
    """Return the text of the first inner_css match inside the first outer_css match, or 'unknown'."""
    try:
        outer = browser.find_element(By.CSS_SELECTOR, outer_css)
        return outer.find_element(By.CSS_SELECTOR, inner_css).text
    except NoSuchElementException:
        return UNKNOWN


def _joined_child_texts(browser, container_id, tag_name):
    """Return the texts of all <tag_name> elements inside the element with id container_id, joined by ', '.

    Returns 'unknown' when the container itself is missing. The placeholder is
    returned as-is (not joined), so it is never split into single characters.
    """
    try:
        container = browser.find_element(By.ID, container_id)
    except NoSuchElementException:
        return UNKNOWN
    return ', '.join(child.text for child in container.find_elements(By.TAG_NAME, tag_name))


def _nutri_score(browser):
    """Return the Nutri-Score letter taken from the second CSS class of the Nutri-Score panel link.

    The second class token has its 'grade_' prefix removed and is upper-cased.
    Returns 'unknown' when the link is missing or has fewer than two classes.
    """
    try:
        link = browser.find_element(By.CSS_SELECTOR, 'a[href="#panel_nutriscore_content"]')
        return link.get_attribute('class').split()[1].replace('grade_', '').upper()
    except (NoSuchElementException, IndexError, AttributeError):
        # IndexError: fewer than two class tokens; AttributeError: no class attribute at all
        return UNKNOWN


def _nutrition_values(page_source):
    """Parse the 'Información nutricional' table and return {key: value} for the rows in NUTRITION_KEYS.

    Returns an empty dict when the table is absent; rows without the expected
    <td>/<span> structure are skipped instead of aborting the whole table.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    table = soup.find('table', attrs={'aria-label': 'Información nutricional'})
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        return {}

    values = {}
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        key_span = cells[0].find('span')
        value_span = cells[1].find('span')
        if key_span is None or value_span is None:
            continue
        key = key_span.text.strip()
        if key in NUTRITION_KEYS:
            values[key] = value_span.text.strip()
    return values


data_list = []

for product_url in list1:
    # Open the product page
    driver.get(product_url)

    # NOTE: packaging (span#field_packaging_value) was disabled in the original
    # notebook and is still not scraped.
    current_data = {
        'Product Name': _element_text(driver, 'h2.title-1'),
        'Brands': _joined_child_texts(driver, 'field_brands_value', 'a'),
        'NOVA 4': _nested_text(driver, 'a[href="#panel_nova_content"]', 'h4'),
        'Nutri-Score': _nutri_score(driver),
        'Countries': _joined_child_texts(driver, 'field_countries_value', 'a'),
        'Additives': _joined_child_texts(driver, 'panel_additives_content', 'h4'),
        'Eco-Score': _nested_text(driver, 'a[href="#panel_ecoscore"]', 'h4.attr_title'),
    }

    # Add whichever nutrition values were found, in a fixed key order so the
    # resulting columns do not depend on the row order of each page's table
    nutrition = _nutrition_values(driver.page_source)
    for key in NUTRITION_KEYS:
        if key in nutrition:
            current_data[key] = nutrition[key]

    data_list.append(current_data)

In [None]:
# Build a DataFrame with one row per scraped product record

df = pd.DataFrame(data=data_list)

In [None]:
# Export the DataFrame as a .csv file (currently disabled; the path below is an absolute, machine-specific location)

# df.to_csv("/Users/david/Desktop/IronHack/Projects/food_advisor/raw/n1_300.csv")

In [None]:
# Print a summary of the DataFrame (columns, non-null counts and dtypes)
df.info()