In [None]:
import pandas as pd
import numpy as np
import re
from langdetect import detect
from googletrans import Translator

# Display settings: show up to 100 rows and never truncate column contents.
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None

# Path to the JSONL file.
file_path = '/home/carolus/Documents/school/green_ia/data/02_openfoodfacts_sample.jsonl'

# Parse the file as JSON Lines (one JSON object per line) into a DataFrame.
df = pd.read_json(path_or_buf=file_path, lines=True)

# Show the DataFrame.
print(df)


# main preprocessing:

In [None]:
# Translator instance, used to replace content written in another language with English.
translator = Translator()
def translate_to_english(text):
    """Return ``text`` lower-cased, translated to English when it is in another language.

    Parameters
    ----------
    text : Any
        Cell value to process. Only non-blank strings are language-detected.

    Returns
    -------
    Any
        The lower-cased English text for a non-blank string. ``None``, NaN,
        other non-string values and blank strings are returned unchanged, as is
        the original ``text`` when detection or translation fails.
    """
    # Missing values (None, NaN), other non-string cells and blank strings
    # cannot be language-detected; pass them through instead of relying on
    # the exception handler below (which would print one error per cell).
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        if detect(text) == 'en':
            return text.lower()
        return translator.translate(text, dest='en').text.lower()
    except Exception as e:
        # Deliberate best effort: a detection/translation failure must not
        # abort the whole preprocessing run, so report it and keep the input.
        print(f"Erreur lors du traitement: {e}")
        return text
    
# Rename the raw columns to the names used by the rest of the preprocessing.
column_renames = {
    'pnns_groups_1': 'groups',
    'ingredients_tags': 'ingredients_temp',
    'product_name': 'name',
    'ecoscore_tags': 'ecoscore_groups',
    'categories_tags': 'categories_temp',
    'ecoscore_score': 'ecoscore_note',
    'labels_tags': 'labels_temp',
    'countries': 'countries_temp',
}
df.rename(columns=column_renames, inplace=True)


# GROUPS column: turn "unknown" into None, then lower-case / translate to English.
df['groups'] = (
    df['groups']
    .replace("unknown", None, regex=False)
    .apply(translate_to_english)
)


# NAME column: turn empty strings and NaN into None, then lower-case / translate to English.
df['name'] = (
    df['name']
    .replace("", None)
    .replace({np.nan: None})
    .apply(translate_to_english)
)


# CODE column: store the product code as a whole number.
# A nullable integer dtype is used on purpose: a plain int/NaN mix is upcast
# to float64 by pandas, which would undo the integer conversion as soon as a
# single code is missing. Missing codes become <NA> and are removed below.
df['code'] = pd.to_numeric(df['code']).round().astype('Int64')


# Drop the rows where the EAN code or the product name is missing.
# .copy() makes the result an independent DataFrame, so the column assignments
# that follow do not operate on a slice (no SettingWithCopyWarning).
df = df[df['name'].notna() & df['code'].notna()].copy()

# INGREDIENTS column: replace "" and NaN with None, then flatten each list into
# a comma-separated string (anything that is not a list becomes an empty string).
df['ingredients_temp'] = (
    df['ingredients_temp']
    .replace("", None)
    .replace({np.nan: None})
    .apply(lambda tags: ', '.join(tags if isinstance(tags, list) else []))
)

# Extract the 'en:' entries (used to build a new column).
def extract_en_ingredients(ingredient_list):
    """Return the 'en:'-prefixed entries of a comma-separated string.

    Surrounding square brackets are stripped from ``ingredient_list``, the text
    is split on ``", "`` and, for every piece that starts with ``en:``, the part
    after its last colon is kept.
    """
    english_entries = []
    for piece in ingredient_list.strip('[]').split(', '):
        if piece.startswith('en:'):
            english_entries.append(piece.rsplit(':', 1)[-1])
    return english_entries
# Build 'ingredients' from the 'en:' entries, joined back into one string;
# an empty result is stored as None. The temporary column is then removed.
df['ingredients'] = (
    df['ingredients_temp']
    .apply(extract_en_ingredients)
    .apply(lambda names: ', '.join(names))
    .replace("", None)
)
df.drop(columns=['ingredients_temp'], inplace=True)


# PACKAGING column: replace empty strings with None.
df['packaging'] = df['packaging'].replace("", None)
def remove_two_letters_and_colon(s):
    """Delete two-character prefixes such as ``en:`` from a string.

    Every occurrence of exactly two word characters followed by a colon is
    removed, provided the pair starts at a word boundary and the colon is
    immediately followed by a word character. Non-string values are returned
    unchanged.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r'\b\w{2}:\b', '', s)
# Strip the two-letter prefixes, then lower-case / translate the packaging text to English.
df['packaging'] = (
    df['packaging']
    .apply(remove_two_letters_and_colon)
    .apply(translate_to_english)
)


# ECOSCORE_GROUPS column: convert list values to a comma-separated string, then
# replace "unknown", "", missing values and "not-applicable" with "z".
df['ecoscore_groups'] = (
    df['ecoscore_groups']
    .apply(lambda tags: tags if not isinstance(tags, list) else ', '.join(tags))
    .replace("unknown", "z")
    .replace("", "z")
    .fillna("z")
    .replace("not-applicable", "z")
)


# CATEGORIES column: replace "" and NaN with None, then flatten each list into
# a comma-separated string (anything that is not a list becomes an empty string).
df['categories_temp'] = (
    df['categories_temp']
    .replace("", None)
    .replace({np.nan: None})
    .apply(lambda tags: ', '.join(tags if isinstance(tags, list) else []))
)

# Extract the 'en:' entries (used to build a new column).
def extract_en_categories(ingredient_list):
    """Return the 'en:'-prefixed entries of a comma-separated string.

    Surrounding square brackets are stripped from ``ingredient_list``, the text
    is split on ``", "`` and, for every piece that starts with ``en:``, the part
    after its last colon is kept.
    """
    pieces = ingredient_list.strip('[]').split(', ')
    english_only = filter(lambda piece: piece.startswith('en:'), pieces)
    return [piece.rpartition(':')[2] for piece in english_only]
# Build 'categories' from the 'en:' entries, joined back into one string; an
# empty result is stored as None and the text is lower-cased / translated to
# English. The temporary column is then removed.
df['categories'] = (
    df['categories_temp']
    .apply(extract_en_categories)
    .apply(lambda names: ', '.join(names))
    .replace("", None)
    .apply(translate_to_english)
)
df.drop(columns=['categories_temp'], inplace=True)


# COUNTRIES column
# Lookup table from two-letter acronyms to lower-case country names.
# NOTE(review): 'vg' is mapped to 'united kingdom'; 'vg' is normally the code of
# the British Virgin Islands — confirm this grouping is intended.
country_mapping = {
    'fr': 'france',
    'us': 'united states',
    'ca': 'canada',
    'ie': 'ireland',
    'it': 'italy',
    'za': 'south africa',
    'ch': 'switzerland',
    'gb': 'united kingdom',
    'be': 'belgium',
    'no': 'norway',
    'es': 'spain',
    'jp': 'japan',
    'de': 'germany',
    've': 'venezuela',
    'au': 'australia',
    'dz': 'algeria',
    'ma': 'morocco',
    'ro': 'romania',
    'vg': 'united kingdom',
    'pf': 'french polynesia',
    'at': 'austria',
    'pr': 'puerto rico',
}


def _split_countries(value):
    """Return ``value`` as a list: lists pass through, None gives [], anything else is split on ', '."""
    if isinstance(value, list):
        return value
    if value is None:
        return []
    return value.split(', ')


# Replace "" and NaN with None, then turn every cell into a list of entries.
df['countries_temp'] = (
    df['countries_temp']
    .replace("", None)
    .replace({np.nan: None})
    .apply(_split_countries)
)

def extract_en_countries(ingredient_list):
    """Return the 'en:'-prefixed entries with that prefix removed.

    ``ingredient_list`` may be a list of entries or a string, which is first
    split on ``", "``. For every entry starting with ``en:``, everything after
    the first colon is kept.
    """
    entries = ingredient_list.split(', ') if isinstance(ingredient_list, str) else ingredient_list
    return [entry.partition(':')[2] for entry in entries if entry.startswith('en:')]

# Build the final 'countries' column from the 'en:'-prefixed entries.
# Each entry is lower-cased and looked up in country_mapping on its own before
# joining: a whole-cell Series.replace(country_mapping) only matches cells that
# hold exactly one acronym, so multi-country cells such as "fr, be" would keep
# their acronyms. Entries without a mapping are kept as-is (lower-cased).
df['countries'] = df['countries_temp'].apply(extract_en_countries)
df['countries'] = df['countries'].apply(
    lambda names: ', '.join(
        country_mapping.get(name.lower(), name.lower()) for name in names
    ) if names else None
)
df.drop(columns=['countries_temp'], inplace=True)

# Rows without any 'en:' country get the literal string 'None'.
df['countries'] = df['countries'].fillna('None')


# ECOSCORE_NOTE column: replace "unknown", "" and missing values with 999.
df['ecoscore_note'] = (
    df['ecoscore_note']
    .replace("unknown", 999)
    .replace("", 999)
    .fillna(999)
)


# LABELS column
def _split_labels(value):
    """Return ``value`` as a list: lists pass through, None gives [], anything else is split on ', '."""
    if isinstance(value, list):
        return value
    if value is None:
        return []
    return value.split(', ')


# Replace "" and NaN with None, then turn every cell into a list of entries.
df['labels_temp'] = (
    df['labels_temp']
    .replace("", None)
    .replace({np.nan: None})
    .apply(_split_labels)
)

def extract_en_labels(ingredient_list):
    """Return the 'en:'-prefixed entries with that prefix removed.

    ``ingredient_list`` may be a list of entries or a string, which is first
    split on ``", "``. For every entry starting with ``en:``, everything after
    the first colon is kept.
    """
    if isinstance(ingredient_list, str):
        ingredient_list = ingredient_list.split(', ')
    english_labels = []
    for label in ingredient_list:
        if not label.startswith('en:'):
            continue
        english_labels.append(label.split(':', 1)[1])
    return english_labels

# Build 'labels' from the 'en:' entries, joined into one string (None when
# there are none), then remove the temporary column.
df['labels'] = (
    df['labels_temp']
    .apply(extract_en_labels)
    .apply(lambda names: None if not names else ', '.join(names))
)
df.drop(columns=['labels_temp'], inplace=True)

def count_commas_plus_one(value):
    """Return the number of commas in ``value`` plus one, or 0 for a missing value."""
    return 0 if pd.isna(value) else value.count(',') + 1
# Store the per-row label count in 'labels_note', then remove the 'labels' text column.
df['labels_note'] = df['labels'].map(count_commas_plus_one)
del df['labels']


In [None]:
# Preview the first 70 rows of the DataFrame.
df.head(70)

In [None]:
# Print the distinct values found in the 'packaging' column.
valeurs_uniques = df['packaging'].unique()
print(valeurs_uniques)

In [None]:
# Show the last 50 values of the 'countries' column.
df['countries'].tail(50)

In [None]:
# Number of missing values (NaN or None) in each column.
nan_counts = df.isna().sum()

# Count the empty lists in each column.
empty_list_counts = df.map(lambda x: x == []).sum()

# Print the missing-value counts.
print("nombre de NaN ou None par colonne :")
print(nan_counts)

In [None]:
# Print the per-column count of empty lists held in empty_list_counts.
print("nombre de listes vides par colonne :")
print(empty_list_counts)

In [None]:
# Show the first 50 values of the 'labels_note' column.
df['labels_note'].head(50)

In [None]:
# Show the first 50 values of the 'ingredients' column.
df['ingredients'].head(50)