# Similarity recommendation module
We apply text cleaning and Word2vec model on recipes in order to find similarities between ingredients.  
We use three datasets : 
- Open food fact
- Agribalyze
- ADEME - Base Carbon  
  
The recommandation's filtering is the following :  
- among the top 20 most similar ingredients
- only ingredients having a lower carbon emission than the initial one
- ingredient of the same category first  
- then ordered by carbon emission factor  
  

# Packages importations

In [31]:
! pip install --upgrade nltk



In [199]:
import pandas as pd
pd.set_option('max_column', None)
from tqdm import tqdm
import numpy as np
import unidecode
from ast import literal_eval

In [70]:
from pywsd.utils import lemmatize_sentence

In [71]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from gensim.models import Word2Vec, keyedvectors

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Recipe preprocessing
## Open food facts data importation

In [36]:
from google.cloud import bigquery

bqclient = bigquery.Client()

# Download query results.
query_string = """
SELECT ingredients_text, ingredients_tags, product_name
FROM `hackathon-climat-01.hackathon_bq.off_raw`
WHERE countries_en = "France"
"""

df = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)
print(df.shape)
df.head(2)

(829870, 3)


Unnamed: 0,ingredients_text,ingredients_tags,product_name
0,"ingred/eivts: sugar, chocolate,• cocoa butter,...","en:ingred,en:eivts,en:chocolate,en:cocoa-butte...",Hershey's Special Dark Mildly Sweet Chocolate ...
1,sorbitol,en:e420,Ice breakers sours


## Preprocessing

In [37]:
# Remove duplicates$
df = df.dropna(subset=['ingredients_text'])
df.head(2)

Unnamed: 0,ingredients_text,ingredients_tags,product_name
0,"ingred/eivts: sugar, chocolate,• cocoa butter,...","en:ingred,en:eivts,en:chocolate,en:cocoa-butte...",Hershey's Special Dark Mildly Sweet Chocolate ...
1,sorbitol,en:e420,Ice breakers sours


In [38]:
df.to_csv('data/test_recette.csv')

In [39]:
df1 = df.ingredients_text

# Remove capital letters
df2 = df1.apply(lambda x: x.lower())

# Remove all inside brackets
import re 
df2 = df2.apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", x))

# Remove some characters
df2 = df2.apply(lambda x: x.replace("%", "")
                .replace("_", "")
                .replace(".", "")
               .replace("*", "")
               .replace(")", ""))

# Take the punctuations out
df2 = pd.DataFrame(df2.apply(lambda x: re.split(',|et', x))).reset_index(drop=True)
df2.head(2)

Unnamed: 0,ingredients_text
0,"[ingred/eivts: sugar, chocolate, • cocoa butt..."
1,[sorbitol]


In [40]:
# Take the stopwords out 
stop_words = stopwords.words("french")
stop_words += ['kg', 'g']
df2 = df2['ingredients_text'].apply(lambda vec: list(filter(lambda a: a not in stop_words, vec)))
df2.head(2)

0    [ingred/eivts: sugar,  chocolate, • cocoa butt...
1                                           [sorbitol]
Name: ingredients_text, dtype: object

In [41]:
# Remove numbers, accents and spaces
#df3 = df2.apply(lambda vec: list(map(lambda ing: unidecode.unidecode(''.join([i for i in ing if not i.isdigit()])), vec)))
df32 = df2.apply(lambda vec: list(map(lambda ing: unidecode.unidecode(''.join([i for i in ing if not i.isdigit()]).strip()), vec)))

df32.head(2)

0    [ingred/eivts: sugar, chocolate, * cocoa butte...
1                                           [sorbitol]
Name: ingredients_text, dtype: object

## We filter the ingredient before building the model : we only keep the ones which we have the carbon emissions of. 

In [42]:
total_list = [x for xs in list(df32) for x in xs]

In [43]:
# Importation of the carbon emission dataset that is generatd at the end of this notebook
data_carbone_total = pd.read_csv('data/data_carbon_total_full_base_carb.csv')
print(data_carbone_total.shape)
data_carbone_total.head(2)

(1408, 3)


Unnamed: 0,ingredients,empreinte_carbone,categorie
0,abats de boeuf,8.042534,"['viandes, œufs, poissons', 'entrées et plats ..."
1,abats de porc,0.516662,"['aides culinaires et ingrédients divers', 'en..."


In [44]:
# Only keep ingredients that are on agribalyze's or on ADEME base carbon's datasets
df33 = df32.copy()
for recette_id in tqdm(range(len(df33))):
    new=[]
    for item in df33[recette_id]:
        if item in list(data_carbone_total.ingredients):
            new += [item]
    df33[recette_id] = new
print(df33.shape)
df33.head(3)

100%|██████████| 222420/222420 [03:56<00:00, 939.50it/s] 

(222420,)





0    []
1    []
2    []
Name: ingredients_text, dtype: object

In [46]:
# Add names of the recipes
df34 = pd.DataFrame(df33)
df34['nom_recette'] = df.product_name.reset_index(drop=True)
df34.head(3)

Unnamed: 0,ingredients_text,nom_recette
0,[],Hershey's Special Dark Mildly Sweet Chocolate ...
1,[],Ice breakers sours
2,[],Kimlan Grade-A Dark Soy Sauce
3,[miel],Croquants amandes & miel
4,[],Divella Stelline No. 74


In [47]:
df34[df34.nom_recette=='Cocktails du Monde Goyave - Mangue & Litchi']

Unnamed: 0,ingredients_text,nom_recette
12321,"[goyave, raisin]",Cocktails du Monde Goyave - Mangue & Litchi


In [48]:
# We remove recipes with less than 2 ingredients
non_empty = []
for k in tqdm(df34.ingredients_text):
    non_empty+=[len(k)>1]
df34 = pd.DataFrame(df34[non_empty])
print(df34.shape)
df34.head(3)

100%|██████████| 222420/222420 [00:00<00:00, 1589027.00it/s]

(79077, 2)





Unnamed: 0,ingredients_text,nom_recette
24,"[gelatine, sel]",
33,"[farine de mais, huile de soja, chia, lecithin...",Gauffres Chia plus
53,"[celeri, moutarde]","Poivre noir d'Indonésie, recharge spéciale moulin"
55,"[romarin, thym, basilic, origan, estragon]",Herbes de Provence FUCHS
60,"[creme, sel]",


In [51]:
# We save the dataset of recipes
df34.to_csv('data/recettes.csv', index=False, header=True)

## Lematization/stemming

In [19]:
#from nltk.stem import PorterStemmer
#from nltk.stem.snowball import SnowballStemmer

#st = PorterStemmer()
#stemmer = SnowballStemmer("french")

#df35 = df3.apply(lambda vec: list(map(lambda x: ' '.join([stemmer.stem(w) for w in x.split()]), vec)))
#df35 = df3.apply(lambda vec: list(map(lambda x: ' '.join([st.stem(w) for w in x.split()]), vec)))
#df35 = df3.apply(lambda vec: list(map(lambda x: ' '.join(lemmatize_sentence(x)), vec))) -->too long

#df35.head()

Stemming/lematizing only doesn't improve the performances.  
**To try** : first stem and then reassociate the most frequent original string to the stem. 

## Word2Vec

### Similarity model

In [160]:
model = Word2Vec(sentences=df34.ingredients_text, min_count=2)
len(model.wv)

480

In [161]:
# Saving the model
model.wv.save('model1.kvmodel')

### Test similarity and add carbon emissions

In [328]:
model_loaded = keyedvectors.KeyedVectors.load('model1.kvmodel')

ingredient = 'lait en poudre'
top_sim = pd.DataFrame(model_loaded.most_similar(ingredient, topn=20), columns=['ingredient', 'similarity'])
top_sim

Unnamed: 0,ingredient,similarity
0,lecithine de soja,0.726613
1,chocolat noir,0.706182
2,creme de lait,0.67507
3,cacao,0.660649
4,poudre de cacao,0.655465
5,croissant,0.651631
6,graisse de porc,0.642923
7,chocolat au lait,0.641383
8,chocolat blanc,0.640538
9,sauce au chocolat,0.639219


In [123]:
# We keep the reco within 40% similarity coeff of the best reco
top_sim = top_sim[top_sim.similarity>(max(top_sim.similarity)-max(top_sim.similarity)*0.4)]
top_sim

Unnamed: 0,ingredient,similarity
0,poudre de cacao,0.979488
1,chocolat blanc,0.974453
2,chocolat au lait,0.972252
3,lecithine de soja,0.916535
4,beurre de cacao,0.912039
5,pate d'amande,0.884008
6,cacao,0.880927
7,bicarbonate de soude,0.878144
8,lait concentre,0.85231
9,gaufrette,0.832883


In [124]:
# Add carbon emissions to the result
data_carbone_total_loaded = pd.read_csv('data/data_carbon_total_full_base_carb.csv')
data_carbone_total_loaded.head(2)

Unnamed: 0,ingredients,empreinte_carbone,categorie
0,abats de boeuf,8.042534,"['viandes, œufs, poissons', 'entrées et plats ..."
1,abats de porc,0.516662,"['aides culinaires et ingrédients divers', 'en..."


In [125]:
carbon_emission = []
categorie = []
for sim_ind in range(top_sim.shape[0]):
    ing = top_sim.ingredient[sim_ind]
    carbon_emission += list(data_carbone_total_loaded[data_carbone_total_loaded.ingredients==ing].empreinte_carbone.values)
    categorie += list(data_carbone_total_loaded[data_carbone_total_loaded.ingredients==ing].categorie.values)
top_sim['carbon_emission'] = carbon_emission
top_sim['categorie'] = categorie

top_sim

Unnamed: 0,ingredient,similarity,carbon_emission,categorie
0,poudre de cacao,0.979488,2.702175,"['boissons', 'lait et produits laitiers', 'pro..."
1,chocolat blanc,0.974453,193.0,
2,chocolat au lait,0.972252,6.065865,"['produits céréaliers', 'produits sucrés']"
3,lecithine de soja,0.916535,352.0,
4,beurre de cacao,0.912039,266.0,
5,pate d'amande,0.884008,387.0,
6,cacao,0.880927,140.677707,['produits sucrés']
7,bicarbonate de soude,0.878144,119.0,
8,lait concentre,0.85231,164.58653,['produits sucrés']
9,gaufrette,0.832883,444.0,


## Re-ordering of the recommandations
- same category first 
- then less emissions

In [68]:
cat = data_carbone_total_loaded[data_carbone_total_loaded.ingredients==ingredient].categorie
cat

798    ['aides culinaires et ingrédients divers', 'en...
Name: categorie, dtype: object

In [126]:
# if no category, just order by carbon emission
if pd.DataFrame(cat).categorie.isna().all():
    final_reco = top_sim.sort_values('carbon_emission')
    
else:
    # Reco with a categorie
    with_cat = top_sim[top_sim.categorie.notna()]

    # Reco with the same categorie
    with_same_cat = with_cat[np.array(with_cat.categorie.apply(lambda x: np.product(np.array([c in x for c in cat]))))==1]
    with_same_cat = with_same_cat.sort_values('carbon_emission')

    # Rest
    rest = top_sim.drop(with_same_cat.index)
    rest = rest.sort_values('carbon_emission')
    
    # Combination
    final_reco = pd.concat([with_same_cat, rest], axis=0)
final_reco

Unnamed: 0,ingredient,similarity,carbon_emission,categorie
19,cassonade,0.769728,0.07136,"['aides culinaires et ingrédients divers', 'bo..."
11,fruits confits,0.81985,0.107657,['produits céréaliers']
0,poudre de cacao,0.979488,2.702175,"['boissons', 'lait et produits laitiers', 'pro..."
2,chocolat au lait,0.972252,6.065865,"['produits céréaliers', 'produits sucrés']"
7,bicarbonate de soude,0.878144,119.0,
13,cafe,0.81002,132.402655,"['boissons', 'lait et produits laitiers']"
6,cacao,0.880927,140.677707,['produits sucrés']
15,meringue,0.794014,152.0,
12,creme patissiere,0.815276,155.0,
8,lait concentre,0.85231,164.58653,['produits sucrés']


## Final function

In [210]:
def isNaN(string):
    return string != string

In [266]:
def reco(ingredient='fraise'):  
    data_carbone_total_loaded = pd.read_csv('data/data_carbon_total_full_base_carb.csv')

    # Information on the original product
    info_origin = pd.DataFrame(data_carbone_total_loaded[data_carbone_total_loaded.ingredients==ingredient]).reset_index(drop=True)
    emission_origin = info_origin.empreinte_carbone[0]
    try:
        categorie_origin = literal_eval(info_origin.categorie[0])
    
    except:
        categorie_origin = np.nan
    
    # Loading word2vec model
    saved_model = keyedvectors.KeyedVectors.load('model1.kvmodel')
    
    # Find similarities
    top_sim = pd.DataFrame(saved_model.most_similar(ingredient, topn=20), columns=['ingredient', 'similarity'])
    
    # We keep the reco within 40% similarity coeff of the best reco
    top_sim = top_sim[top_sim.similarity>(max(top_sim.similarity)-max(top_sim.similarity)*0.4)]
    
    # Add carbon emissions
    carbon_emission = []
    categorie = []
    for sim_ind in range(top_sim.shape[0]):
        ing = top_sim.ingredient[sim_ind]
        carbon_emission += list(data_carbone_total_loaded[data_carbone_total_loaded.ingredients==ing].empreinte_carbone.values)
        categorie += list(data_carbone_total_loaded[data_carbone_total_loaded.ingredients==ing].categorie.values)
    top_sim['carbon_emission'] = carbon_emission
    top_sim['categorie'] = categorie

    # We keep reco with less emission that the original product
    top_sim = top_sim[top_sim.carbon_emission<emission_origin]
    
    # Info on the original ingredient to add
    to_add_origin = pd.DataFrame({"ingredient":[ingredient], 
                                  "similarity": [1], 
                                  "carbon_emission":[emission_origin],
                                  "categorie":[categorie_origin]})

    try:
        # Reco with a categorie
        with_cat = top_sim[top_sim.categorie.notna()]
        
        # Reco with the same categorie
        with_same_cat = with_cat[np.array(with_cat.categorie.apply(lambda x: np.sum(np.array([c in x for c in categorie_origin]))))>1]
        with_same_cat = with_same_cat.sort_values('carbon_emission')

        # Rest
        rest = top_sim.drop(with_same_cat.index)
        rest = rest.sort_values('carbon_emission')
        
        # Add info on the original ingredient
        final_reco = pd.concat([with_same_cat, rest, to_add_origin], axis=0)
        
    except:
        # If no categorie
        final_reco = top_sim.sort_values('carbon_emission')
        final_reco = pd.concat([final_reco, to_add_origin], axis=0)
        
    return final_reco

In [331]:
# Test
import numpy as np
ingredient = "boeuf"
reco(ingredient)

  ingredients  empreinte_carbone categorie
0       boeuf           0.351268       NaN


Unnamed: 0,ingredient,similarity,carbon_emission,categorie
14,champignon,0.876666,0.033373,"['viandes, œufs, poissons', 'aides culinaires ..."
8,lentilles,0.905045,0.041254,"['entrées et plats composés', 'produits céréal..."
4,saucisse de toulouse,0.911495,0.089,
16,jambon de bayonne,0.876282,0.126,
18,asperge,0.87315,0.198333,
19,chou vert,0.871111,0.203507,['entrées et plats composés']
5,foie,0.911106,0.210813,
1,riz blanc,0.918752,0.2425,
12,rutabaga,0.885348,0.246,
11,croutons,0.886232,0.256,


# Carbon emissions
## Add Agribalyze's data

In [146]:
from google.cloud import bigquery

bqclient = bigquery.Client()

# Download query results.
query_string = """
SELECT Nom_Fran__ais, Ingredients, Changement_climatique__kg_CO2_eq_kg_de_produit_, Groupe_d_aliment
FROM `hackathon-climat-01.hackathon_bq.agribalise`
"""

agri = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)
print(agri.shape)
agri.head(2)

(5671, 4)


Unnamed: 0,Nom_Fran__ais,Ingredients,Changement_climatique__kg_CO2_eq_kg_de_produit_,Groupe_d_aliment
0,"Oeuf, brouillé, avec matière grasse",Autres étapes,0.652739,"viandes, œufs, poissons"
1,Omelette au fromage,Autres étapes,0.640947,"viandes, œufs, poissons"


## Preprocessing

In [147]:
# to lower and remove accents
agri.Ingredients = agri.Ingredients.apply(lambda x: unidecode.unidecode(x.lower()))

In [148]:
agri.to_csv('data/agri.csv', header=True, index=False)

### Checking the overlapping of the datasets

In [149]:
list_ing = agri.Ingredients.unique()
list_ing = list(map(lambda x: x.lower(), list_ing))
list_ing[:5]

['autres etapes',
 'emmental',
 'oeuf de poule',
 'oignons',
 'viande de porc maigre']

In [163]:
import numpy as np
check1 = [item in list_ing for item in model.wv.index_to_key]
check2 = [item in model.wv.index_to_key for item in list_ing]

print(f"Overlapping of open food facts data over Agribalyze's {100*np.array(check2).sum()/len(check2)}%.")

Recouvrement de la data open food facts sur la data Agribalyze 73.70892018779342%.


# Add base carbone

In [164]:
abc = pd.read_csv('data/data_base_carbon.csv', )
print(abc.shape)
abc.head(2)

(1869, 67)


Unnamed: 0,Type_Ligne,Identifiant_de_l___l__ment,Structure,Type_de_l___l__ment,Statut_de_l___l__ment,Nom_base_fran__ais,Nom_base_anglais,Nom_base_espagnol,Nom_attribut_fran__ais,Nom_attribut_anglais,Nom_attribut_espagnol,Nom_fronti__re_fran__ais,Nom_fronti__re_anglais,Nom_fronti__re_espagnol,Code_de_la_cat__gorie,Tags_fran__ais,Tags_anglais,Tags_espagnol,Unit___fran__ais,Unit___anglais,Unit___espagnol,Contributeur,Autres_Contributeurs,Programme,Url_du_programme,Source,Localisation_g__ographique,Sous_localisation_g__ographique_fran__ais,Sous_localisation_g__ographique_anglais,Sous_localisation_g__ographique_espagnol,Date_de_cr__ation,Date_de_modification,P__riode_de_validit__,Incertitude,R__glementations,Transparence,Qualit__,Qualit___TeR,Qualit___GR,Qualit___TiR,Qualit___C,Qualit___P,Qualit___M,Commentaire_fran__ais,Commentaire_anglais,Commentaire_espagnol,Type_poste,Nom_poste_fran__ais,Nom_poste_anglais,Nom_poste_espagnol,Total_poste_non_d__compos__,CO2f,CH4f,CH4b,N2O,Code_gaz_suppl__mentaire_1,Valeur_gaz_suppl__mentaire_1,Code_gaz_suppl__mentaire_2,Valeur_gaz_suppl__mentaire_2,Code_gaz_suppl__mentaire_3,Valeur_gaz_suppl__mentaire_3,Code_gaz_suppl__mentaire_4,Valeur_gaz_suppl__mentaire_4,Code_gaz_suppl__mentaire_5,Valeur_gaz_suppl__mentaire_5,Autres_GES,CO2b
0,Elément,34052,élément non décomposé,Facteur d'émission,Valide générique,salade cesar au poulet (salade verte,Caesar's salad (salad,,"fromage, croûtos, sauce)","chicken, croûtons, sauce)",,,,,"Achats de biens > Produits agro-alimentaires, ...","Agribalyse,""\tSalade César au poulet (salade v...","Agribalyse,,starters and mixed dishes,Caesar's...",,kgCO2e/kg de poids net,kgCO2e/kg (net weight),,AGRIBALYSE,,AGRIBALYSE,https://app.agribalyse.fr/,https://ecolab.gitbook.io/documentation-agriba...,France continentale,,,,2021-03-04,2021-03-04,31/12/2023,,,5.0,,4.0,,,,,,,,,,,,,281.0,,,,,,,,,,,,,,,,
1,Elément,33558,élément non décomposé,Facteur d'émission,Valide générique,brioche fourree creme patissiere,Brioche,,préemballée,"filled with custard (Chinese brioche type), pr...",,,,,"Achats de biens > Produits agro-alimentaires, ...","Agribalyse,produits céréaliers,""Brioche fourré...","Agribalyse,Brioche,cereals products,pastries",,kgCO2e/kg de poids net,kgCO2e/kg (net weight),,AGRIBALYSE,,AGRIBALYSE,https://app.agribalyse.fr/,https://ecolab.gitbook.io/documentation-agriba...,France continentale,,,,2021-03-04,2021-03-04,31/12/2023,,,5.0,,4.0,,,,,,,,,,,,,383.0,,,,,,,,,,,,,,,,


## Preprocessing

In [165]:
# Changer l'unité de gCO2eq/kg à kgco2eq/kg
abc.Total_poste_non_d__compos__ = abc.Total_poste_non_d__compos__/1000

In [166]:
list_ing_abc = abc.Nom_base_fran__ais.unique()
list_ing_abc = list(map(lambda x: x.lower(), list_ing_abc))
list_ing_abc[:5]

['salade cesar au poulet (salade verte',
 'brioche fourree creme patissiere ',
 'gouter sec fourre  parfum chocolat',
 'gouter sec fourre  parfum fruits',
 'gouter sec fourre  parfum lait ou vanille']

In [167]:
import numpy as np
check1 = [item in list_ing_abc for item in list(model.wv.index_to_key)]
check2 = [item in model.wv.index_to_key for item in list_ing_abc]

print(f"Overlapping of open food facts data over Agribalise's {np.array(check1).sum()/len(check1)}.")

Recouvrement de la data open food facts sur la data Agribalise 0.8520833333333333.


## Merge agribalyze and base carbon

In [168]:
agri_concat = agri.rename(columns={'Nom_Fran__ais':'nom', 
                                 'Ingredients':'ingredients', 
                                 'Changement_climatique__kg_CO2_eq_kg_de_produit_':'empreinte_carbone',
                                  'Groupe_d_aliment' : 'categorie'})
agri_concat = agri_concat[['ingredients', 'empreinte_carbone', 'categorie']]

abc_concat = abc.rename(columns={'Nom_base_fran__ais':'ingredients', 
                                'Total_poste_non_d__compos__':'empreinte_carbone'})
abc_concat = abc_concat[['ingredients', 'empreinte_carbone']]

In [169]:
# On prend que le premier mot de la description qui souvent désigne l'ingrédient, on moyennera par la suite
abc_concat2 = abc_concat.copy()
abc_concat2.ingredients = abc_concat.ingredients.apply(lambda x: x.split(' ')[0] if x.split(' ')[0] in model.wv.index_to_key else x)

In [172]:
data_carbone_total = pd.concat([agri_concat, abc_concat], axis=0)
data_carbone_total.head(3)

Unnamed: 0,ingredients,empreinte_carbone,categorie
0,autres etapes,0.652739,"viandes, œufs, poissons"
1,autres etapes,0.640947,"viandes, œufs, poissons"
2,autres etapes,0.640947,"viandes, œufs, poissons"
3,autres etapes,0.640947,"viandes, œufs, poissons"
4,autres etapes,0.640947,"viandes, œufs, poissons"
...,...,...,...
1864,yaourt au lait de chevre,0.187000,
1865,"yaourt, lait fermente ou specialite laitiere",0.167000,
1866,"yaourt, lait fermente ou specialite laitiere",0.244000,
1867,"yaourt, lait fermente ou specialite laitiere",0.025000,


In [173]:
data_carbone_total2 = data_carbone_total.copy()
data_carbone_total2.categorie = data_carbone_total2.categorie.fillna('NaN')
data_carbone_total2 = pd.DataFrame(data_carbone_total2.groupby('ingredients')['categorie'].unique().transform(lambda x: '^'.join(x)))
data_carbone_total2.categorie = data_carbone_total2.categorie.apply(lambda x: x.split('^'))
data_carbone_total2 = data_carbone_total2.reset_index()
data_carbone_total2.head(3)

Unnamed: 0,ingredients,categorie
0,abats de boeuf,"[viandes, œufs, poissons, entrées et plats com..."
1,abats de porc,"[aides culinaires et ingrédients divers, entré..."
2,abats de poulet,"[entrées et plats composés, viandes, œufs, poi..."
3,abats de veau,[entrées et plats composés]
4,abondance,[NaN]
...,...,...
1403,yakitori,[NaN]
1404,yaourt,"[aides culinaires et ingrédients divers, NaN]"
1405,yaourt a la grecque,[NaN]
1406,yaourt au lait de chevre,[NaN]


In [179]:
data_carbone_total3 = data_carbone_total \
    .groupby(['ingredients']) \
    .agg({'empreinte_carbone': lambda x: x.mean(skipna=False)})
data_carbone_total3 = data_carbone_total3.reset_index()
data_carbone_total3.head(3)

Unnamed: 0,ingredients,empreinte_carbone
0,abats de boeuf,8.042534
1,abats de porc,0.516662
2,abats de poulet,0.150089
3,abats de veau,4.440226
4,abondance,0.627000
...,...,...
1403,yakitori,0.624000
1404,yaourt,0.243277
1405,yaourt a la grecque,0.250667
1406,yaourt au lait de chevre,0.187000


In [180]:
data_carbone_total_final = data_carbone_total3.merge(data_carbone_total2, how='inner', on='ingredients')
data_carbone_total_final.head(3)

Unnamed: 0,ingredients,empreinte_carbone,categorie
0,abats de boeuf,8.042534,"[viandes, œufs, poissons, entrées et plats com..."
1,abats de porc,0.516662,"[aides culinaires et ingrédients divers, entré..."
2,abats de poulet,0.150089,"[entrées et plats composés, viandes, œufs, poi..."
3,abats de veau,4.440226,[entrées et plats composés]
4,abondance,0.627000,[NaN]
...,...,...,...
1403,yakitori,0.624000,[NaN]
1404,yaourt,0.243277,"[aides culinaires et ingrédients divers, NaN]"
1405,yaourt a la grecque,0.250667,[NaN]
1406,yaourt au lait de chevre,0.187000,[NaN]


In [181]:
# Get rid of nan values
data_carbone_total_final.categorie = data_carbone_total_final.categorie.apply(lambda x: [cat for cat in x if cat!='NaN'])
data_carbone_total_final.categorie = data_carbone_total_final.categorie.apply(lambda x: np.nan if len(x)==0 else x)
data_carbone_total_final.head(3)

Unnamed: 0,ingredients,empreinte_carbone,categorie
0,abats de boeuf,8.042534,"[viandes, œufs, poissons, entrées et plats com..."
1,abats de porc,0.516662,"[aides culinaires et ingrédients divers, entré..."
2,abats de poulet,0.150089,"[entrées et plats composés, viandes, œufs, poi..."
3,abats de veau,4.440226,[entrées et plats composés]
4,abondance,0.627000,
...,...,...,...
1403,yakitori,0.624000,
1404,yaourt,0.243277,[aides culinaires et ingrédients divers]
1405,yaourt a la grecque,0.250667,
1406,yaourt au lait de chevre,0.187000,


In [194]:
# Save the table
data_carbone_total_final.to_csv('data/data_carbon_total_full_base_carb.csv', index=False)

## Checking of the coverage of the dataset

In [63]:
import numpy as np
check = [item in list(data_carbone_total.ingredients) for item in model.wv.index_to_key]

print(f'Recouvrement de la data de notre modèle par la data carbone: {100*np.array(check).sum()/len(check)}%.')
print(f"Nombre d'ingrédients couverts par le module de similarité et par l'émission carbone: {sum(check)}.")

Recouvrement de la data de notre modèle par la data carbone: 100.0%.
Nombre d'ingrédients couverts par le module de similarité et par l'émission carbone: 442.
