In [8]:
import tensorflow as tf

def setup_gpu():
    """Enable on-demand GPU memory allocation on every visible GPU.

    Lists the physical GPU devices known to TensorFlow and turns on memory
    growth for each of them. Does nothing when no GPU is found.

    A RuntimeError raised by TensorFlow while configuring the devices is
    printed instead of being propagated, so the notebook keeps running.
    """
    physical_devices = tf.config.list_physical_devices('GPU')
    if not physical_devices:
        return
    try:
        # Memory growth has to be configured the same way on all GPUs, so it
        # is applied to every device rather than only to the first one.
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("config augmentation allocation mémoire gpu activée")
    except RuntimeError as e:
        print(e)

setup_gpu()

In [9]:
import keras
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sea
import sklearn
import scipy as sc
import nltk as nltk
import statsmodels as statsmodels
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [10]:
# Report the TensorFlow version used by this kernel
tf_version = tf.__version__
print("Version de TensorFlow :", tf_version)

Version de TensorFlow : 2.10.1


In [11]:
# Report the Keras version used by this kernel
keras_version = keras.__version__
print("Version de Keras :", keras_version)

Version de Keras : 2.10.0


## Panneau de configuration

In [14]:
# Folder holding the project's CSV files.
# NOTE(review): machine-specific absolute path; adjust it before running elsewhere.
projectPath = "/home/carolus/Documents/school/green_ia/data/"
# Identification number used in the names of the CSV files to generate and load.
fileNbr = "00"

In [16]:
# Read the tab-separated OpenFoodFacts export into the DataFrame `df`
raw_csv_path = projectPath + fileNbr + "_OpenFoodFacts_00.csv"
df = pd.read_csv(
    raw_csv_path,
    sep='\t',
    low_memory=False,
)

In [17]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,...,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
0,225,http://world-en.openfoodfacts.org/product/0000...,nutrinet-sante,1623855208,2021-06-16T14:53:28Z,1692101569,2023-08-15T12:12:49Z,digg,1707748000.0,2024-02-12T14:25:39Z,...,,,,,,,,,,
1,207025004,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1656948610,2022-07-04T15:30:10Z,1656948613,2022-07-04T15:30:13Z,kiliweb,1707864000.0,2024-02-13T22:43:38Z,...,,,,,,,,,,
2,3429145,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1630483911,2021-09-01T08:11:51Z,1682646029,2023-04-28T01:40:29Z,isabel626,1707844000.0,2024-02-13T17:00:47Z,...,,,,,,,,,,
3,26772226,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1654250311,2022-06-03T09:58:31Z,1654270474,2022-06-03T15:34:34Z,quentinbrd,1707742000.0,2024-02-12T12:49:37Z,...,,,,,,,,,,
4,17,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1529059080,2018-06-15T10:38:00Z,1561463718,2019-06-25T11:55:18Z,kiliweb,1707490000.0,2024-02-09T14:47:36Z,...,,,,,,,,,,


In [None]:
# Load the ';'-separated OpenFoodFacts CSV; lines that cannot be parsed are skipped.
# NOTE(review): this reassigns `df`, replacing the frame read by the earlier
# tab-separated load cell.
openfoodfactsCsv = projectPath + fileNbr + "_OpenFoodFacts.csv"
df = pd.read_csv(
    openfoodfactsCsv,
    sep=';',
    on_bad_lines='skip',
)

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

# Analyse des données 

In [None]:
# Print how many columns df has, followed by all of their names
column_list = [*df.columns]
print(f"nbr col: {len(column_list)}, liste: {column_list}")

In [None]:
# Percentage of NaN values per column, printed by 10-point ranges
total_rows = df.shape[0]
nan_counts = df.isna().sum()
nan_percentage = (nan_counts / total_rows) * 100
nan_df = pd.DataFrame({'column_name': nan_percentage.index, 'percentage_nan': nan_percentage.values})

nan_percent_range = [(0, 10), (10, 20), (20, 30), (30, 40), (40, 50), (50, 60), (60, 70), (70, 80), (80, 90), (90, 100)]
# Bin edges 0, 10, ..., 100: the lower bound of every range plus the last upper bound.
bin_edges = [tranche[0] for tranche in nan_percent_range] + [nan_percent_range[-1][1]]
# pd.cut builds right-closed intervals that exclude the lowest edge by default,
# which would leave columns with exactly 0% NaN outside every bin (and thus
# never printed). include_lowest=True puts them in the first range.
nan_bins = pd.cut(nan_df['percentage_nan'], bins=bin_edges, include_lowest=True)
# observed=False is stated explicitly so the grouping over the categorical bins
# does not depend on the pandas version's default.
grouped = nan_df.groupby(nan_bins, observed=False)

for tranche, group in grouped:
    print(f"range {tranche}:")
    print(group['column_name'].tolist())
    print()

# Traitement des données 

In [None]:
# Columns of df kept for the category dataset (order defines the column order of cat_df)
col_classi = [
    # product name and pnns group labels
    'product_name', 'pnns_groups_1', 'pnns_groups_2',
    # numeric `_100g` nutrition columns
    'energy-kcal_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g',
    'sugars_100g', 'proteins_100g', 'salt_100g', 'sodium_100g',
    # brand tags and image link
    'brands_tags', 'image_url',
]
cat_df = df.loc[:, col_classi]
cat_df.tail(3)

In [None]:
# Map the original OpenFoodFacts column names to shorter names
rename_col = {
    'product_name': 'name',
    'pnns_groups_1': 'pnns1',
    'pnns_groups_2': 'pnns2',
    'energy-kcal_100g': 'kcal',
    'fat_100g': 'fat',
    'saturated-fat_100g': 'sat_fat',
    'carbohydrates_100g': 'carbohyd',
    'sugars_100g': 'sugar',
    'proteins_100g': 'prot',
    'salt_100g': 'salt',
    'sodium_100g': 'sodium',
    'brands_tags': 'brand',
    'image_url': 'img_url',
}
cat_df = cat_df.rename(columns=rename_col)
cat_df.tail(3)

In [None]:
# Drop the rows whose pnns2 value is NaN or equal to 'unknown'
cat_df = cat_df.dropna(subset=["pnns2"])
cat_df = cat_df[cat_df['pnns2'] != 'unknown']

# Shuffle the rows. The seed is fixed (same value as the random_state used by
# the train/test split cell) so that re-running the notebook yields the same
# row order, and therefore the same splits.
cat_df = cat_df.sample(frac=1, random_state=42).reset_index(drop=True)

cat_df.tail(3)

In [None]:
# Show rows that have a product name but no image URL
img_missing = cat_df['img_url'].isna()
name_present = cat_df['name'].notna()
result = cat_df[img_missing & name_present]
result.head(5)

In [None]:
# Rescale the numeric columns with a MinMaxScaler (default settings).
# NOTE(review): an earlier comment here announced removing values < 0 and > 100,
# but no such filtering is performed by this cell.
# NOTE(review): the scaler is fitted on the whole cat_df, i.e. before the
# train/test/valid split done further down.
col_to_norm = ['kcal', 'fat', 'sat_fat', 'carbohyd', 'sugar', 'prot', 'salt', 'sodium']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(cat_df[col_to_norm])
cat_df[col_to_norm] = scaled_values
cat_df.tail(3)

# Génération des DataFrames pour la prédiction des catégories

In [None]:
# Split into train / test / validation frames:
# 10% of the rows go to the test set, then 5% of the remaining rows go to the
# validation set. Both splits use a fixed random_state.
cat_train_df, cat_test_df = train_test_split(cat_df, test_size=0.1, random_state=42)
cat_train_df, cat_valid_df = train_test_split(cat_train_df, test_size=0.05, random_state=42)

# Save each split as a ';'-separated CSV without the index column.
# The paths are built from projectPath / fileNbr, the names defined in the
# configuration cell (project_path / file_nbr are not defined in the visible
# part of this notebook and raised a NameError). projectPath already ends with
# "data/", so no extra "data/" segment is added.
cat_train_df.to_csv(projectPath + f'cat_train_df_{fileNbr}.csv', index=False, sep=";")
cat_test_df.to_csv(projectPath + f'cat_test_df_{fileNbr}.csv', index=False, sep=";")
cat_valid_df.to_csv(projectPath + f'cat_valid_df_{fileNbr}.csv', index=False, sep=";")