# Génération d'un fichier csv 
### Ce script doit générer un fichier csv contenant des données propres, qui seront par la suite retravaillées pour faire de la prédictions d'ingrédients ou de catégorie dans l'objectif de prédire l'écoscore des produits. 

In [1]:
import tensorflow as tf

def setup_gpu():
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        try:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            print("config augmentation allocation mémoire gpu activée")
        except RuntimeError as e:
            print(e)

setup_gpu()

config augmentation allocation mémoire gpu activée


In [2]:
import keras
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sea
import sklearn
import scipy as sc
import nltk as nltk
import statsmodels as statsmodels
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
print("Version de TensorFlow :", tf.__version__)

Version de TensorFlow : 2.10.1


In [4]:
print("Version de Keras :", keras.__version__)

Version de Keras : 2.10.0


## Panneau de configuration

In [5]:
project_path = "C:\\Users\\charl\\Documents\\workspace\\green_ia\\cch\\"
file_nbr = '01' # numéro d'identification des csv à générer 
openfoodfact_csv_version = '01'

In [6]:
# import du csv de données openfoodfacts
openfoodfacts_csv = project_path + f"data_global\\openfoodfacts_{openfoodfact_csv_version}.csv"
df_imported = pd.read_csv(openfoodfacts_csv)

  df_imported = pd.read_csv(openfoodfacts_csv)


In [35]:
# cellule à rafraichir pour éviter de charger pendant 1h !
df = df_imported

# Analyse des données 

In [36]:
# affiche le nom de toutes les colonnes du df 
column_list = list(df.columns)
print(f"nbr col: {len(column_list)}, liste: {column_list}")

nbr col: 206, liste: ['code', 'url', 'creator', 'created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime', 'last_modified_by', 'last_updated_t', 'last_updated_datetime', 'product_name', 'abbreviated_product_name', 'generic_name', 'quantity', 'packaging', 'packaging_tags', 'packaging_en', 'packaging_text', 'brands', 'brands_tags', 'categories', 'categories_tags', 'categories_en', 'origins', 'origins_tags', 'origins_en', 'manufacturing_places', 'manufacturing_places_tags', 'labels', 'labels_tags', 'labels_en', 'emb_codes', 'emb_codes_tags', 'first_packaging_code_geo', 'cities', 'cities_tags', 'purchase_places', 'stores', 'countries', 'countries_tags', 'countries_en', 'ingredients_text', 'ingredients_tags', 'ingredients_analysis_tags', 'allergens', 'allergens_en', 'traces', 'traces_tags', 'traces_en', 'serving_size', 'serving_quantity', 'no_nutrition_data', 'additives_n', 'additives', 'additives_tags', 'additives_en', 'nutriscore_score', 'nutriscore_grade', 'nova_group'

In [37]:
# afficher % de nan / colonne:
total_rows = df.shape[0]
nan_counts = df.isna().sum()
nan_percentage = (nan_counts / total_rows) * 100
nan_df = pd.DataFrame({'column_name': nan_percentage.index, 'percentage_nan': nan_percentage.values})

nan_percent_range = [(0, 10), (10, 20), (20, 30), (30, 40), (40, 50), (50, 60), (60, 70), (70, 80), (80, 90), (90, 100)]
grouped = nan_df.groupby(pd.cut(nan_df['percentage_nan'], bins=[tranche[0] for tranche in nan_percent_range + [(100,)]]))

for tranche, group in grouped:
    print(f"range {tranche}:")
    print(group['column_name'].tolist())
    print()

range (0, 10]:
['creator', 'last_modified_by', 'last_updated_t', 'last_updated_datetime', 'product_name', 'countries', 'countries_tags', 'countries_en', 'nutriscore_grade', 'pnns_groups_1', 'pnns_groups_2', 'states', 'states_tags', 'states_en', 'ecoscore_grade', 'completeness']

range (10, 20]:
['last_image_t', 'last_image_datetime', 'image_url', 'image_small_url']

range (20, 30]:
['energy-kcal_100g', 'energy_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g']

range (30, 40]:
['salt_100g', 'sodium_100g']

range (40, 50]:
['brands', 'brands_tags', 'image_nutrition_url', 'image_nutrition_small_url']

range (50, 60]:
['categories', 'categories_tags', 'categories_en', 'main_category', 'main_category_en']

range (60, 70]:
['quantity', 'ingredients_analysis_tags', 'nutriscore_score', 'food_groups', 'food_groups_tags', 'food_groups_en', 'nutrient_levels_tags', 'product_quantity', 'unique_scans_n', 'popularity_tags', 'fiber_100g', 'nutrition-score-f

  grouped = nan_df.groupby(pd.cut(nan_df['percentage_nan'], bins=[tranche[0] for tranche in nan_percent_range + [(100,)]]))


# Traitement des données 

In [38]:
# récupérer les colonnes intéressantes 
col_classi = [
    'pnns_groups_2',
    'energy-kcal_100g',
    'fat_100g',
    'saturated-fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
    'sodium_100g',
    'brands_tags',
]
df = df[col_classi] 
df.tail(3)

Unnamed: 0,pnns_groups_2,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,sodium_100g,brands_tags
3236623,unknown,,,,,,,,,mtr
3236624,Dairy desserts,24.0,28.0,13.0,70.0,49.0,2.0,3.0,1.2,
3236625,unknown,,,,,,,,,edeka


In [39]:
# renommer les colonnes 
rename_col = {
    'energy-kcal_100g': 'kcal', 
    'fat_100g': 'fat',
    'saturated-fat_100g': 'sat_fat',
    'carbohydrates_100g': 'carbohyd',
    'sugars_100g': 'sugar',
    'proteins_100g': 'prot',
    'salt_100g': 'salt',
    'sodium_100g': 'sodium',
    'brands_tags': 'brand',
    'pnns_groups_2': 'pnns2'
}
df = df.rename(columns=rename_col)
df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
3236623,unknown,,,,,,,,,mtr
3236624,Dairy desserts,24.0,28.0,13.0,70.0,49.0,2.0,3.0,1.2,
3236625,unknown,,,,,,,,,edeka


In [40]:
# PAS SI SIMPLE !!!!!
# remplacer unknow par none pour les colonnes textuelles uniquement
# laisser les nan dans les colonnes numériques 
# remplacer tous les NaN et unkown par null 
df.replace(['unknown', np.nan], ['none', 'none'], inplace=True)
df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
3236623,none,none,none,none,none,none,none,none,none,mtr
3236624,Dairy desserts,24.0,28.0,13.0,70.0,49.0,2.0,3.0,1.2,none
3236625,none,none,none,none,none,none,none,none,none,edeka


In [41]:
# mélange des lignes aléatoirement 
df = df.sample(frac=1).reset_index(drop=True)
df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
3236623,Dried fruits,77.0,0.63,0.1,12.2,0.3,3.2,0.02,0.008,carne
3236624,Cereals,443.0,19.95,8.87,57.63,5.17,8.13,3.5275,1.411,myojo-foods-co-ltd
3236625,Vegetables,33.0,0.0,0.0,4.17,3.33,0.83,0.52,0.208,none


In [42]:
# supprime les lignes où ni pnss2 et ni brand ont du contenu 
df = df[~((df['pnns2'] == 'none') & (df['brand'] == 'none'))]
df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
3236623,Dried fruits,77.0,0.63,0.1,12.2,0.3,3.2,0.02,0.008,carne
3236624,Cereals,443.0,19.95,8.87,57.63,5.17,8.13,3.5275,1.411,myojo-foods-co-ltd
3236625,Vegetables,33.0,0.0,0.0,4.17,3.33,0.83,0.52,0.208,none


In [43]:
# création colonne vaut 0 si manque aucun ingrédient, 1 si en manque 1 sur une ligne 
col_to_check = ['kcal', 'fat', 'sat_fat', 'carbohyd', 'sugar', 'prot', 'salt', 'sodium']
mask = df[col_to_check].apply(lambda x: x.eq('none')).any(axis=1)
df['ingr_miss_val'] = np.where(mask, 1, 0)

# supprime les lignes pour lesquelles pnns2 et ingr_miss_val sont vides ou brand et ingr_miss_val 
df = df[~((df['pnns2'] == 'none') & (df['ingr_miss_val'] == 1) | (df['brand'] == 'none') & (df['ingr_miss_val'] == 1))]

df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand,ingr_miss_val
3236623,Dried fruits,77.0,0.63,0.1,12.2,0.3,3.2,0.02,0.008,carne,0
3236624,Cereals,443.0,19.95,8.87,57.63,5.17,8.13,3.5275,1.411,myojo-foods-co-ltd,0
3236625,Vegetables,33.0,0.0,0.0,4.17,3.33,0.83,0.52,0.208,none,0


In [48]:
# arrondir les valeurs des colonnes numériques 
# puis normaliser ces colonnes 

#df.loc[df['ingr_miss_val'] == 0, 'sugar'] = df.loc[df['ingr_miss_val'] == 0, 'sugar'].round(0)
df.tail(30)
#df= df.drop(columns=['ingr_miss_val'])    

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand,ingr_miss_val
3236575,One-dish meals,none,4.8,3.4,7.6,7.4,2.2,1.2,0.48,mechamment-bon,1
3236578,none,301.0,20.0,3.6,20.0,3.1,9.5,1.2,0.48,wojnar-s,0
3236581,none,331.0,4.9,0.5,52.0,2.7,13.0,1.1,0.44,celnat,0
3236583,Meat,116.0,2.3,0.9,0.7,0.0,23.0,3.2,1.28,none,0
3236585,Ice cream,360.0,22.4,10.3,35.3,32.5,4.0,0.17,0.068,hacendado,0
3236589,Cereals,185.0,1.8,0.7,39.0,14.0,9.0,0.6,0.24,none,0
3236592,Biscuits and cakes,552.4,34.8,20.8,50.4,48.0,8.8,0.2,0.08,green-black-s,0
3236594,Processed meat,none,none,none,none,none,none,none,none,panzani,1
3236596,Fish and seafood,100.0,1.0,0.0,0.0,0.0,23.0,0.4,0.16,none,0
3236597,none,278.0,7.6,5.2,31.0,31.0,19.4,1.43,0.572,vantastic-foods,0


In [34]:
# normalisation des données numériques 
#col_to_norm = ['kcal', 'fat', 'sat_fat', 'carbohyd', 'sugar', 'prot', 'salt', 'sodium']
#scaler = MinMaxScaler()
#df[col_to_norm] = scaler.fit_transform(df[col_to_norm])
#df.tail(3)

In [17]:
# arrondir toutes les valeurs numériques, ne garder aucun chiffre après la virgule
#col_to_round = ['kcal', 'fat', 'sat_fat', 'carbohyd', 'sugar', 'prot', 'salt', 'sodium']
#df[col_to_round] = df[col_to_round].round()
#df[col_to_round] = df[col_to_round].astype(int)
#df.tail(30)