# Génération d'un fichier csv 
### Ce script doit générer un fichier csv contenant des données propres, qui seront par la suite retravaillées pour faire de la prédictions d'ingrédients ou de catégorie dans l'objectif de prédire l'écoscore des produits. 

In [40]:
import tensorflow as tf

def setup_gpu():
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        try:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            print("config augmentation allocation mémoire gpu activée")
        except RuntimeError as e:
            print(e)

setup_gpu()

config augmentation allocation mémoire gpu activée


In [41]:
import keras
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sea
import sklearn
import scipy as sc
import nltk as nltk
import statsmodels as statsmodels
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [42]:
print("Version de TensorFlow :", tf.__version__)

Version de TensorFlow : 2.10.1


In [43]:
print("Version de Keras :", keras.__version__)

Version de Keras : 2.10.0


## Panneau de configuration

In [44]:
project_path = "C:\\Users\\charl\\Documents\\workspace\\green_ia\\cch\\"
file_nbr = '01' # numéro d'identification des csv à générer 
openfoodfact_csv_version = '01'

In [45]:
# import du csv de données openfoodfacts
openfoodfacts_csv = project_path + f"data_global\\openfoodfacts_{openfoodfact_csv_version}.csv"
df_imported = pd.read_csv(openfoodfacts_csv)

  df_imported = pd.read_csv(openfoodfacts_csv)


In [169]:
# cellule à rafraichir pour éviter de charger pendant 1h !
df = df_imported

# Analyse des données 

In [170]:
# affiche le nom de toutes les colonnes du df 
column_list = list(df.columns)
print(f"nbr col: {len(column_list)}, liste: {column_list}")

nbr col: 206, liste: ['code', 'url', 'creator', 'created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime', 'last_modified_by', 'last_updated_t', 'last_updated_datetime', 'product_name', 'abbreviated_product_name', 'generic_name', 'quantity', 'packaging', 'packaging_tags', 'packaging_en', 'packaging_text', 'brands', 'brands_tags', 'categories', 'categories_tags', 'categories_en', 'origins', 'origins_tags', 'origins_en', 'manufacturing_places', 'manufacturing_places_tags', 'labels', 'labels_tags', 'labels_en', 'emb_codes', 'emb_codes_tags', 'first_packaging_code_geo', 'cities', 'cities_tags', 'purchase_places', 'stores', 'countries', 'countries_tags', 'countries_en', 'ingredients_text', 'ingredients_tags', 'ingredients_analysis_tags', 'allergens', 'allergens_en', 'traces', 'traces_tags', 'traces_en', 'serving_size', 'serving_quantity', 'no_nutrition_data', 'additives_n', 'additives', 'additives_tags', 'additives_en', 'nutriscore_score', 'nutriscore_grade', 'nova_group'

In [171]:
# afficher % de nan / colonne:
total_rows = df.shape[0]
nan_counts = df.isna().sum()
nan_percentage = (nan_counts / total_rows) * 100
nan_df = pd.DataFrame({'column_name': nan_percentage.index, 'percentage_nan': nan_percentage.values})

nan_percent_range = [(0, 10), (10, 20), (20, 30), (30, 40), (40, 50), (50, 60), (60, 70), (70, 80), (80, 90), (90, 100)]
grouped = nan_df.groupby(pd.cut(nan_df['percentage_nan'], bins=[tranche[0] for tranche in nan_percent_range + [(100,)]]))

for tranche, group in grouped:
    print(f"range {tranche}:")
    print(group['column_name'].tolist())
    print()

range (0, 10]:
['creator', 'last_modified_by', 'last_updated_t', 'last_updated_datetime', 'product_name', 'countries', 'countries_tags', 'countries_en', 'nutriscore_grade', 'pnns_groups_1', 'pnns_groups_2', 'states', 'states_tags', 'states_en', 'ecoscore_grade', 'completeness']

range (10, 20]:
['last_image_t', 'last_image_datetime', 'image_url', 'image_small_url']

range (20, 30]:
['energy-kcal_100g', 'energy_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g']

range (30, 40]:
['salt_100g', 'sodium_100g']

range (40, 50]:
['brands', 'brands_tags', 'image_nutrition_url', 'image_nutrition_small_url']

range (50, 60]:
['categories', 'categories_tags', 'categories_en', 'main_category', 'main_category_en']

range (60, 70]:
['quantity', 'ingredients_analysis_tags', 'nutriscore_score', 'food_groups', 'food_groups_tags', 'food_groups_en', 'nutrient_levels_tags', 'product_quantity', 'unique_scans_n', 'popularity_tags', 'fiber_100g', 'nutrition-score-f

  grouped = nan_df.groupby(pd.cut(nan_df['percentage_nan'], bins=[tranche[0] for tranche in nan_percent_range + [(100,)]]))


# Traitement des données 

In [172]:
# récupérer les colonnes intéressantes 
col_classi = [
    'pnns_groups_2',
    'energy-kcal_100g',
    'fat_100g',
    'saturated-fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
    'sodium_100g',
    'brands_tags',
]
df = df[col_classi] 
df.tail(3)

Unnamed: 0,pnns_groups_2,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,sodium_100g,brands_tags
3236623,unknown,,,,,,,,,mtr
3236624,Dairy desserts,24.0,28.0,13.0,70.0,49.0,2.0,3.0,1.2,
3236625,unknown,,,,,,,,,edeka


In [173]:
# renommer les colonnes 
rename_col = {
    'energy-kcal_100g': 'kcal', 
    'fat_100g': 'fat',
    'saturated-fat_100g': 'sat_fat',
    'carbohydrates_100g': 'carbohyd',
    'sugars_100g': 'sugar',
    'proteins_100g': 'prot',
    'salt_100g': 'salt',
    'sodium_100g': 'sodium',
    'brands_tags': 'brand',
    'pnns_groups_2': 'pnns2'
}
df = df.rename(columns=rename_col)
df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
3236623,unknown,,,,,,,,,mtr
3236624,Dairy desserts,24.0,28.0,13.0,70.0,49.0,2.0,3.0,1.2,
3236625,unknown,,,,,,,,,edeka


In [174]:
# remplacer tous les NaN et unkown par null 
df.replace(['unknown', np.nan], ['none', 'none'], inplace=True)
df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
3236623,none,none,none,none,none,none,none,none,none,mtr
3236624,Dairy desserts,24.0,28.0,13.0,70.0,49.0,2.0,3.0,1.2,none
3236625,none,none,none,none,none,none,none,none,none,edeka


In [175]:
# mélange des lignes aléatoirement 
df = df.sample(frac=1).reset_index(drop=True)
df.tail(3)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
3236623,Alcoholic beverages,none,none,none,none,none,none,0.0,0.0,none
3236624,none,33.0,0.0,0.0,8.1,7.8,0.1,0.0,0.0,none
3236625,none,127.0,11.53,1.98,8.3,0.66,0.82,0.2,0.08,none


In [183]:
# supprime les lignes où ni pnss2 et ni brand ont du contenu 
df['pnns2_valid'] = df['pnns2'].apply(lambda x: 1 if x == 'none' else 0)
df['brand_valid'] = df['brand'].apply(lambda x: 1 if x == 'none' else 0)
df = df[~((df['pnns2_valid'] == 1) & (df['brand_valid'] == 1))]
df = df.drop(columns=['pnns2_valid', 'brand_valid'])
df.head(30)

Unnamed: 0,pnns2,kcal,fat,sat_fat,carbohyd,sugar,prot,salt,sodium,brand
0,Appetizers,500.0,26.6667,10.0,63.3333,0.0,3.3333,0.208333,0.083333,none
1,Vegetables,21.0,0.0,0.0,4.13,2.48,0.83,0.455,0.182,none
2,none,none,none,none,none,none,none,none,none,pingo-doce
3,Processed meat,351.0,28.0,11.0,9.0,0.0,24.0,3.9,1.56,rapelli
4,Waters and flavored waters,none,none,none,none,none,none,none,none,"san-pellegrino,nestle"
8,none,259.0,12.94,1.76,30.59,1.18,7.06,1.4125,0.565,texas-tamale-company
9,Sweets,543.0,32.0,5.5,56.0,54.0,5.9,0.13,0.052,monoprix
11,none,none,none,none,none,none,none,none,none,aldi
13,none,none,none,none,none,none,none,none,none,schauma
16,Cereals,593.0,50.0,7.0,12.0,0.5,18.0,none,none,spar


In [180]:
#test_df['ingr_val'] = test_df[['kcal', 'fat', 'sat_fat', 'carbohyd', 'sugar', 'prot', 'salt', 'sodium']].isnull().any(axis=1).astype(int)


In [177]:
# normalisation des données numériques 
#col_to_norm = ['kcal', 'fat', 'sat_fat', 'carbohyd', 'sugar', 'prot', 'salt', 'sodium']
#scaler = MinMaxScaler()
#df[col_to_norm] = scaler.fit_transform(df[col_to_norm])
#df.tail(3)

In [178]:
# arrondir toutes les valeurs numériques, ne garder aucun chiffre après la virgule
#col_to_round = ['kcal', 'fat', 'sat_fat', 'carbohyd', 'sugar', 'prot', 'salt', 'sodium']
#df[col_to_round] = df[col_to_round].round()
#df[col_to_round] = df[col_to_round].astype(int)
#df.tail(30)