In [58]:
import pandas as pd
import numpy as np
import re
# Display settings: show up to 100 rows and never truncate the content of a column
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

# Path to the JSONL file
# NOTE(review): absolute, machine-specific path — it will not resolve on another machine
file_path = '/home/carolus/Documents/school/green_ia/data/02_openfoodfacts_sample.jsonl'

# Read the JSONL file with pandas' read_json; lines=True parses one JSON record per line
df = pd.read_json(file_path, lines=True)

# Show the DataFrame
print(df)


             pnns_groups_1  \
0                  unknown   
1           Fat and sauces   
2                  unknown   
3                  unknown   
4    Fruits and vegetables   
..                     ...   
995          Sugary snacks   
996                unknown   
997   Cereals and potatoes   
998                unknown   
999                unknown   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [59]:
# main preprocessing

# Give the source columns their working names in a single rename pass.
df.rename(
    columns={
        'pnns_groups_1': 'groups',
        'ingredients_tags': 'ingredients_temp',
        'product_name': 'name',
        'ecoscore_tags': 'ecoscore_groups',
        'categories_tags': 'categories',
        'ecoscore_score': 'ecoscore_note',
        'labels_tags': 'labels',
    },
    inplace=True,
)

# GROUPS column: the literal value "unknown" is treated as missing
df['groups'] = df['groups'].replace("unknown", None, regex=False)


# INGREDIENTS column processing
# Same four steps as a single chain:
#   1. empty string -> None
#   2. NaN -> None
#   3. anything that is not a list -> empty list
#   4. list -> comma-separated string
df['ingredients_temp'] = (
    df['ingredients_temp']
    .replace("", None)
    .replace({np.nan: None})
    .apply(lambda tags: tags if isinstance(tags, list) else [])
    .apply(lambda tags: ', '.join(tags))
)

# extract the items prefixed with 'en:' into a new column
def extract_en_ingredients(ingredient_list):
    """Return the names of the ``en:``-prefixed ingredient tags.

    Parameters
    ----------
    ingredient_list : str, list, tuple, None or NaN
        Either a ``', '``-separated string of tags (optionally wrapped in
        square brackets), or a list/tuple of tag strings. Missing values
        (``None``, ``NaN``) and any other type are treated as "no ingredients".

    Returns
    -------
    list of str
        For every tag starting with ``'en:'``, the text after its last colon,
        in the original order. Empty list when nothing matches.
    """
    if isinstance(ingredient_list, str):
        # String form: drop surrounding brackets, then split on the ', ' separator.
        ingredients = ingredient_list.strip('[]').split(', ')
    elif isinstance(ingredient_list, (list, tuple)):
        # Already a sequence of tags: use it directly, ignoring non-string items.
        ingredients = [item for item in ingredient_list if isinstance(item, str)]
    else:
        # None, NaN or any unsupported type: return an empty result instead of
        # raising AttributeError on .strip().
        return []
    return [ingredient.split(':')[-1] for ingredient in ingredients if ingredient.startswith('en:')]
# Build the final 'ingredients' column: keep the 'en:' tag names, join them into
# one comma-separated string, and turn an empty string into None.
df['ingredients'] = (
    df['ingredients_temp']
    .apply(extract_en_ingredients)
    .apply(lambda names: ', '.join(names))
    .replace("", None)
)
# The temporary column is no longer needed once 'ingredients' exists.
del df['ingredients_temp']

# PACKAGING column processing
# Empty strings are replaced by None.
df['packaging'] = df['packaging'].replace("", None)
def remove_two_letters_and_colon(s):
    """Remove every ``xx:`` prefix (two word characters plus a colon) from a string.

    A prefix is removed only when it starts at a word boundary and the colon is
    immediately followed by a word character. Non-string values are returned
    unchanged.
    """
    if not isinstance(s, str):
        return s
    prefix_pattern = re.compile(r'\b\w{2}:\b')
    return prefix_pattern.sub('', s)
# Strip the two-character "xx:" prefixes from every packaging value (non-strings pass through)
df['packaging'] = df['packaging'].apply(remove_two_letters_and_colon)


In [60]:
# Inspect the last 70 rows of the preprocessed frame
df.tail(70)

Unnamed: 0,groups,packaging,name,ecoscore_groups,categories,ecoscore_note,labels,countries,ingredients
930,,,Cheese & Caramel Mix,[unknown],,,,en:us,
931,Composite foods,,Tortelli Toscani,[unknown],"[en:meals, en:pasta-dishes, en:stuffed-pastas]",,,en:it,
932,,,Maquereaux 3 epices,[unknown],,,,en:fr,
933,Composite foods,,La gourmande Chèvre,[b],"[en:meals, en:pizzas-pies-and-quiches, en:pizzas, en:vegetarian-pizzas, en:cheese-pizzas]",69.0,[],France,"wheat-flour, cereal, flour, wheat, cereal-flour, emmental, dairy, cheese, water, goat-cheese, tomato-puree, vegetable, fruit-vegetable, tomato, black-olive, olive, extra-virgin-olive-oil, oil-and-fat, vegetable-oil-and-fat, vegetable-oil, olive-oil, virgin-olive-oil, salt, yeast, onion, root-vegetable, onion-family-vegetable, sugar, added-sugar, disaccharide, garlic, pepper, seed"
934,Sugary snacks,"Glass jar, Steel cap",Ziedu medus,[a],"[en:breakfasts, en:spreads, en:sweet-spreads, en:bee-products, en:farming-products, en:sweeteners, en:honeys]",97.0,,en:Latvia,
935,Salty snacks,"Box, Hdpe film-bag",Wholegrain crakers,[b],"[en:snacks, en:salty-snacks, en:appetizers, en:crackers]",61.0,[en:no-gluten],United Kingdom,"wholemeal-oat, cereal, oat, palm-oil, oil-and-fat, vegetable-oil-and-fat, palm-oil-and-fat, corn-starch, starch, sea-salt, salt, raising-agent, honey, added-sugar, e503ii, e503"
936,,,Tropical Punch,[unknown],,,,en:us,
937,,,Mexican Pure Vanilla Bean Paste,[unknown],,,"[en:no-gmos, en:non-gmo-project]",World,
938,Fruits and vegetables,,,[b],"[en:plant-based-foods-and-beverages, en:plant-based-foods, en:fruits-and-vegetables-based-foods, en:desserts, en:fruits-based-foods, en:compotes]",76.0,"[en:no-preservatives, en:no-colorings]",en:France,
939,Milk and dairy products,,Mozzarella,[c],"[en:dairies, en:fermented-foods, en:fermented-milk-products, en:cheeses, en:italian-cheeses, en:stretched-curd-cheeses, en:mozzarella]",55.0,,en:it,


In [61]:
# Inspect the first 10 values of the 'ingredients' column
df['ingredients'].head(10)

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                None
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                N

In [62]:
# Count the missing values (NaN or None) in each column
nan_counts = df.isna().sum()

# Count the cells equal to an empty list, per column
empty_list_counts = df.map(lambda x: x == []).sum()

# Print the missing-value counts
print("nombre de NaN ou None par colonne :")
print(nan_counts)

nombre de NaN ou None par colonne :
groups             646
packaging          885
name                32
ecoscore_groups      3
categories         513
ecoscore_note      749
labels             581
countries            5
ingredients        716
dtype: int64


In [63]:
# Print the per-column empty-list counts held in empty_list_counts
print("nombre de listes vides par colonne :")
print(empty_list_counts)

nombre de listes vides par colonne :
groups               0
packaging            0
name                 0
ecoscore_groups      0
categories          39
ecoscore_note        0
labels             136
countries            0
ingredients          0
dtype: int64


In [64]:
# Inspect the first 50 values of the 'labels' column
df['labels'].head(50)

0                                                                                                                                                                                             None
1                                                                                                                                                                                             None
2                                                                                                                                                                                             None
3                                                                                                                                                                                             None
4                                                                                                                                                                                             None
5                        

In [65]:
# Inspect the first 50 values of the 'ingredients' column
df['ingredients'].head(50)

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              None
1                                                                                                                                                                                                                                                                                                   