# Scraping data from the "additives" website

## Import the required libraries

In [1]:
import pandas as pd
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager # replaces the driver file (presumably a manually downloaded chromedriver)
from selenium import webdriver
import time
import warnings
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')
from selenium.webdriver.common.by import By # By is used to locate elements by tag, class, id...
from fake_useragent import UserAgent
from selenium.webdriver.support.ui import WebDriverWait   # used for waiting
from selenium.webdriver.support import expected_conditions as EC  # expected conditions...
from selenium.webdriver import ActionChains as AC   # chained actions, e.g. double click
from selenium.webdriver.common.keys import Keys  # keyboard key handling


In [2]:
# Install and configure the Chrome web driver
# (the value returned by install() is stored in PATH and handed to webdriver.Chrome below)
PATH = ChromeDriverManager().install()

# Initialize the Chrome web driver
# NOTE(review): only PATH is passed here. The `options` object built in the next
# cell is created after this driver and is never given to it — confirm whether
# the browser is meant to run with those options.
driver = webdriver.Chrome(PATH)


In [3]:
# Build the Chrome options for the scraping session.
# NOTE(review): the driver was already created in the previous cell with
# webdriver.Chrome(PATH) and never receives this object, so in the visible
# code none of the settings below reach the running browser — confirm intent.
options = Options()

# Modify experimental options
options.add_experimental_option('excludeSwitches', ['enable-automation'])  # Excluding the 'enable-automation' switch
options.add_experimental_option('useAutomationExtension', False)           # Disabling the use of automation extension

options.headless = True    # Running the browser in headless mode

options.add_argument('user-data-dir=cookies')    # User data directory used for cookies
options.add_argument('--incognito')              # Enabling incognito mode

# UserAgent is already imported in the imports cell at the top of the notebook,
# so it is not imported a second time here.
# NOTE(review): `user` is only printed; it is not added to `options` in the visible code.
user = UserAgent().random  # Random user-agent string

print(user)

Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36


In [4]:
# Access the website for web scraping

driver.get("https://www.aditivos-alimentarios.com/")

# Wait explicitly (up to 10 s) until the additive table contains at least one
# link, instead of sleeping a fixed 3 s: a fixed sleep is too short on a slow
# connection (the following cells would then scrape nothing without any error)
# and wasted time on a fast one. Raises TimeoutException if the table never appears.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "tbody.notranslate a"))
)

In [5]:
# Find every element whose class name is "notranslate";
# the text of each one is read in the next cell

rows = driver.find_elements(By.CLASS_NAME, "notranslate")

In [6]:
# Collect the visible text of each located element into a list

scraped_list = [row.text for row in rows]

scraped_list

["E100 Curcuminas BAJA\nE100i Curcumina BAJA\nE100ii Cúrcuma BAJA\nE101 Riboflavinas BAJA\nE101i Riboflavina BAJA\nE101ii Fosfato de Riboflavina BAJA\nE101iii Riboflavina de Bacillus Subtilis BAJA\nE101iv Riboflavina de Ashbya Gossypii BAJA\nE102 Tartracina ALTA\nE103 Crisoína ALTA\nE104 Amarillo de Quinoleína MEDIA\nE105 Amarillo Sólido ALTA\nE107 Amarillo 2G ALTA\nE110 Amarillo Ocaso FCF ALTA\nE111 Naranja GGN ALTA\nE120 Ácido Carmínico ALTA\nE120i Rojo Carmín ALTA\nE120ii Extracto de Cochinilla ALTA\nE121 Rojo Cítrico 2 ALTA\nE122 Azorrubina ALTA\nE123 Amaranto ALTA\nE124 Ponceau 4R ALTA\nE125 Ponceau SX ALTA\nE126 Ponceau 6R ALTA\nE127 Eritrosina ALTA\nE128 Rojo 2G ALTA\nE129 Rojo Allura AC ALTA\nE130 Azul de Antraquinona ALTA\nE131 Azul Patente V ALTA\nE132 Indigotina MEDIA\nE133 Azul Brillante FCF ALTA\nE134 Extracto de Espirulina En revisión\nE140 Clorofilas y Clorofilinas BAJA\nE140i Clorofila BAJA\nE140ii Clorofilina BAJA\nE141 Complejos Cúpricos de Clorofilas y Clorofilinas B

In [7]:
# Wrap the scraped texts in a Series and break each text into its lines

list_series = pd.Series(scraped_list).str.split("\n")
list_series

0    [E100 Curcuminas BAJA, E100i Curcumina BAJA, E...
dtype: object

In [8]:
# One row per scraped line: explode the lists and turn the result into a
# single-column DataFrame (the column keeps the default label 0)

df = list_series.explode().to_frame()
df.head()

Unnamed: 0,0
0,E100 Curcuminas BAJA
0,E100i Curcumina BAJA
0,E100ii Cúrcuma BAJA
0,E101 Riboflavinas BAJA
0,E101i Riboflavina BAJA


In [9]:
# Check the row count, index and dtype of the exploded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 751 entries, 0 to 0
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       751 non-null    object
dtypes: object(1)
memory usage: 11.7+ KB


In [10]:
# Give the single column (default label 0) an explicit name

df = df.rename(columns={0: 'new_name'})

In [11]:
# Preparation of the DataFrame

import re

# Capture the run of capital letters A-Z that ends each line
pattern = r'([A-Z]+)$'

# Work on a copy so `df` stays untouched, and store the captured text as the
# toxicity column; lines that do not end in capital letters get NaN
df_test = df.copy()
df_test['toxicity'] = df_test['new_name'].str.extract(pattern)
df_test

Unnamed: 0,new_name,toxicity
0,E100 Curcuminas BAJA,BAJA
0,E100i Curcumina BAJA,BAJA
0,E100ii Cúrcuma BAJA,BAJA
0,E101 Riboflavinas BAJA,BAJA
0,E101i Riboflavina BAJA,BAJA
...,...,...
0,E1518 Triacetato de Glicerilo BAJA,BAJA
0,E1519 Alcohol Bencílico MEDIA,MEDIA
0,E1520 Propilenglicol BAJA,BAJA
0,E1521 Polietilenglicol ALTA,ALTA


In [12]:
# Second scraping pass: read the text of every link inside the additive table

# Locate the table body, then every <a> element it contains
tbody_element = driver.find_element(By.XPATH, '//tbody[@class="notranslate"]')
anchor_elements = tbody_element.find_elements(By.TAG_NAME, "a")

# Keep only the visible text of each link
text_values = [anchor.text for anchor in anchor_elements]

print(text_values)

['E100', 'Curcuminas', 'E100i', 'Curcumina', 'E100ii', 'Cúrcuma', 'E101', 'Riboflavinas', 'E101i', 'Riboflavina', 'E101ii', 'Fosfato de Riboflavina', 'E101iii', 'Riboflavina de Bacillus Subtilis', 'E101iv', 'Riboflavina de Ashbya Gossypii', 'E102', 'Tartracina', 'E103', 'Crisoína', 'E104', 'Amarillo de Quinoleína', 'E105', 'Amarillo Sólido', 'E107', 'Amarillo 2G', 'E110', 'Amarillo Ocaso FCF', 'E111', 'Naranja GGN', 'E120', 'Ácido Carmínico', 'E120i', 'Rojo Carmín', 'E120ii', 'Extracto de Cochinilla', 'E121', 'Rojo Cítrico 2', 'E122', 'Azorrubina', 'E123', 'Amaranto', 'E124', 'Ponceau 4R', 'E125', 'Ponceau SX', 'E126', 'Ponceau 6R', 'E127', 'Eritrosina', 'E128', 'Rojo 2G', 'E129', 'Rojo Allura AC', 'E130', 'Azul de Antraquinona', 'E131', 'Azul Patente V', 'E132', 'Indigotina', 'E133', 'Azul Brillante FCF', 'E134', 'Extracto de Espirulina', 'E140', 'Clorofilas y Clorofilinas', 'E140i', 'Clorofila', 'E140ii', 'Clorofilina', 'E141', 'Complejos Cúpricos de Clorofilas y Clorofilinas', 'E141

In [13]:
# Put the link texts into a single-column DataFrame (column label 0)
second_scraping = pd.DataFrame(text_values)

In [14]:
# Preview the first rows of the link texts
second_scraping.head()

Unnamed: 0,0
0,E100
1,Curcuminas
2,E100i
3,Curcumina
4,E100ii


In [15]:
# Rows at odd positions (1, 3, 5, ...), renumbered from 0
odd_rows = second_scraping.iloc[1::2].reset_index(drop=True)
odd_rows

Unnamed: 0,0
0,Curcuminas
1,Curcumina
2,Cúrcuma
3,Riboflavinas
4,Riboflavina
...,...
746,Triacetato de Glicerilo
747,Alcohol Bencílico
748,Propilenglicol
749,Polietilenglicol


In [16]:
# Rows at even positions (0, 2, 4, ...), renumbered from 0
evens_rows = second_scraping.iloc[::2].reset_index(drop=True)
evens_rows

Unnamed: 0,0
0,E100
1,E100i
2,E100ii
3,E101
4,E101i
...,...
746,E1518
747,E1519
748,E1520
749,E1521


In [17]:
# Pair the even-position and odd-position rows side by side, matching on the index
additives = pd.merge(evens_rows, odd_rows, left_index=True, right_index=True)
additives.head()

Unnamed: 0,0_x,0_y
0,E100,Curcuminas
1,E100i,Curcumina
2,E100ii,Cúrcuma
3,E101,Riboflavinas
4,E101i,Riboflavina


In [18]:
# Replace the suffixed labels produced by the merge with descriptive column names
additives = additives.rename({'0_x': 'code', '0_y': 'name'}, axis='columns')
additives.head()

Unnamed: 0,code,name
0,E100,Curcuminas
1,E100i,Curcumina
2,E100ii,Cúrcuma
3,E101,Riboflavinas
4,E101i,Riboflavina


In [19]:
# Shape of the code/name frame
additives.shape

(751, 2)

In [20]:
# Shape of the frame holding the extracted toxicity values
df_test.shape


(751, 2)

In [21]:
# Fuse the 2 DataFrames
# The toxicity values are copied as a plain array, i.e. by row position rather
# than by index label, so both frames must hold the same rows in the same order.

additives['toxicity'] = df_test['toxicity'].to_numpy()

In [22]:
# Inspect non-null counts after adding the toxicity column
additives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751 entries, 0 to 750
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   code      751 non-null    object
 1   name      751 non-null    object
 2   toxicity  535 non-null    object
dtypes: object(3)
memory usage: 17.7+ KB


In [23]:
# Rows with no extracted toxicity value (NaN) receive the label 'En Revisión'
additives['toxicity']=additives['toxicity'].fillna('En Revisión')

In [24]:
# Confirm the toxicity column has no missing values left, then preview the frame
additives.info()
additives.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751 entries, 0 to 750
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   code      751 non-null    object
 1   name      751 non-null    object
 2   toxicity  751 non-null    object
dtypes: object(3)
memory usage: 17.7+ KB


Unnamed: 0,code,name,toxicity
0,E100,Curcuminas,BAJA
1,E100i,Curcumina,BAJA
2,E100ii,Cúrcuma,BAJA
3,E101,Riboflavinas,BAJA
4,E101i,Riboflavina,BAJA


In [26]:
# Summary of the frame before the string conversion
additives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751 entries, 0 to 750
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   code      751 non-null    object
 1   name      751 non-null    object
 2   toxicity  751 non-null    object
dtypes: object(3)
memory usage: 17.7+ KB


In [27]:
# Convert the values of the three columns to strings

additives = additives.astype({'name': str, 'toxicity': str, 'code': str})

additives.head()

Unnamed: 0,code,name,toxicity
0,E100,Curcuminas,BAJA
1,E100i,Curcumina,BAJA
2,E100ii,Cúrcuma,BAJA
3,E101,Riboflavinas,BAJA
4,E101i,Riboflavina,BAJA


In [28]:
# Summary of the frame after the string conversion
additives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751 entries, 0 to 750
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   code      751 non-null    object
 1   name      751 non-null    object
 2   toxicity  751 non-null    object
dtypes: object(3)
memory usage: 17.7+ KB


In [35]:
# Rename the "code" column to "additive_id"

new_column_names = {"code": "additive_id"}
additives = additives.rename(columns=new_column_names)

In [49]:
# Translate the Spanish toxicity labels to English in a single pass.
# The placeholder written by the earlier fillna step is 'En Revisión', so it has
# to be mapped explicitly; replacing only "under revision" never matched anything
# and left those rows untranslated.
toxicity_translation = {
    "BAJA": "low",
    "ALTA": "high",
    "MEDIA": "moderate",
    "En Revisión": "under review",
    "under revision": "under review",  # kept so already-translated data still normalizes
}

additives["toxicity"] = additives["toxicity"].replace(toxicity_translation)

In [50]:
# Preview up to the first 50 rows to check the translated toxicity labels
additives.head(50)

Unnamed: 0,additive_id,name,toxicity
0,E100,Curcuminas,low
1,E100i,Curcumina,low
2,E100ii,Cúrcuma,low
3,E101,Riboflavinas,low
4,E101i,Riboflavina,low
5,E101ii,Fosfato de Riboflavina,low
6,E101iii,Riboflavina de Bacillus Subtilis,low
7,E101iv,Riboflavina de Ashbya Gossypii,low
8,E102,Tartracina,high
9,E103,Crisoína,high


In [51]:
# Export the data to a CSV file

# additives.to_csv("/Users/david/Desktop/IronHack/Projects/food_advisor/cleaned/additives.csv", index=False)