In [None]:
import pandas as pd
import numpy as np
import json
import os
import warnings
from datetime import datetime 
from collections import OrderedDict
from collections import Counter
import plotly.express as px
from collections import defaultdict
import plotly.graph_objects as go


# DataFrame display: show up to 100 rows and never truncate the text of a cell.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

# Silence pandas DtypeWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [None]:
# Run configuration: chunk size, input file locations and run timestamps.
chunk_size = 1000
file_id = '02'
# NOTE: absolute, machine-specific project root; every data path derives from it.
project_path = "/home/carolus/Documents/school/green_ia/"
jsonl_02 = f"{project_path}data/{file_id}_openfoodfacts_02.jsonl"
jsonl_sample = f"{project_path}data/{file_id}_openfoodfacts_sample.jsonl"
# Capture the current date/time once so that every derived stamp agrees.
current_date_time = datetime.now()
date_format = "%d/%m/%Y %H:%M:%S.%f"
# Human-readable start date, truncated from microseconds to milliseconds.
start_date = current_date_time.strftime(date_format)[:-3]
# Compact digits-only stamp: ddmmYYYYHHMMSS followed by 3 millisecond digits.
date_code = f"{current_date_time:%d%m%Y%H%M%S}{current_date_time.microsecond // 1000:03d}"

In [None]:
def add_logs(logData):
    """Print a log entry to standard output.

    Args:
        logData: value to log; it is handed to ``print`` unchanged.
    """
    print(logData)
    # File logging is currently disabled: the commented-out lines below would
    # append each entry to a per-run text file under ``{project_path}logs/``.
    #with open(f"{project_path}logs/03_analysis_{date_code}_logs.txt", "a") as logFile:
     #   logFile.write(f'{logData}\n')

In [24]:
# Check the structural validity of the JSONL file: every line must parse as JSON.
# Each undecodable line is logged with its position in the file, and the final
# verdict reflects whether any error was actually found (the file is only
# reported as valid when no line failed to decode).
invalid_line_count = 0
with open(jsonl_02, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            invalid_line_count += 1
            add_logs(f"ERROR decoding jsonl (line {line_number}): {e}")

if invalid_line_count == 0:
    add_logs(f"jsonl format valid: {jsonl_02}")
else:
    add_logs(f"jsonl format invalid: {invalid_line_count} undecodable line(s) in {jsonl_02}")

jsonl format valid: /home/carolus/Documents/school/green_ia/data/02_openfoodfacts_02.jsonl


# COUNTRIES:

In [25]:
def extract_countries_from_jsonl(file_path):
    """Return the ``countries`` value of every record in a JSONL file.

    Records whose ``countries`` field is missing or null are skipped. A line
    that is not valid JSON is reported on standard output and ignored.

    Args:
        file_path: path of the UTF-8 encoded JSONL file to read.

    Returns:
        list: the collected values, in file order (duplicates are kept).
    """
    found = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                value = json.loads(raw_line).get('countries')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if value is not None:
                found.append(value)
    return found
# Read the raw `countries` value of every product in the file.
countries_list = extract_countries_from_jsonl(jsonl_02)

# One product can list several comma-separated countries: flatten them into a
# single list of whitespace-trimmed names.
separated_countries = []
for entry in countries_list:
    separated_countries += [country.strip() for country in entry.split(',')]

# Occurrences per country; the total is the number of country mentions, so the
# percentages below are shares of mentions, not shares of products.
country_counts = Counter(separated_countries)
total_countries = len(separated_countries)

data_graph_countries = []
for country, count in country_counts.items():
    percentage = count / total_countries * 100
    print(f"{country}: {percentage:.2f}%")
    data_graph_countries.append(dict(countries=country, percentage=percentage))

none: 10.59%
france: 30.22%
united states: 18.76%
world: 1.59%
bolivia: 0.02%
mexico: 0.10%
new zealand: 0.02%
belgium: 2.22%
danemark: 0.00%
norway: 0.07%
poland: 0.16%
austria: 0.02%
hungary: 0.02%
czech republic: 0.03%
thailand: 0.03%
egypt: 0.01%
japan: 0.18%
croatia: 0.00%
europe: 0.01%
luxembourg: 0.00%
argentina: 0.01%
morocco: 0.02%
spain: 8.81%
costa rica: 0.00%
moldavia: 0.00%
sweden: 0.10%
russia: 0.25%
netherlands: 0.37%
finland: 0.04%
united kingdom: 3.17%
saudi arabia: 0.04%
lebanon: 0.00%
philippines: 0.04%
malaysia: 0.02%
guyana: 0.00%
: 0.09%
barbados: 0.00%
french polynesia: 0.00%
brazil: 0.34%
haiti: 0.00%
salvador: 0.00%
bahrain: 0.00%
guatemala: 0.00%
colombia: 0.00%
oman: 0.00%
dominican republic: 0.00%
greece: 0.01%
kuwait: 0.00%
hong kong: 0.00%
puerto rico: 0.00%
united arab emirates: 0.01%
ireland: 1.55%
singapore: 0.04%
panama: 0.01%
germany: 7.67%
italy: 6.78%
angola: 0.00%
canada: 2.42%
taiwan: 0.00%
burkina faso: 0.00%
vietnam: 0.00%
israel: 0.00%
china: 0

In [26]:
# Treemap of the countries: tile size and colour both encode the percentage.
df = pd.DataFrame(data_graph_countries)
fig = px.treemap(
    df,
    path=['countries'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='countries treemap',
)
# Dark theme: black backgrounds with white text.
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

# ECOSCORE GRADE (numeric `ecoscore_note`):

In [27]:
def extract_grad_ecoscore_from_jsonl(file_path):
    """Return the ``ecoscore_note`` value of every record in a JSONL file.

    Records whose ``ecoscore_note`` field is missing or null are skipped. A
    line that is not valid JSON is reported on standard output and ignored.

    Args:
        file_path: path of the UTF-8 encoded JSONL file to read.

    Returns:
        list: the collected values, in file order (duplicates are kept).
    """
    scores = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
            else:
                score = parsed.get('ecoscore_note')
                if score is not None:
                    scores.append(score)
    return scores
# Read the `ecoscore_note` value of every product in the file.
ecoscore_grad_list = extract_grad_ecoscore_from_jsonl(jsonl_02)

# Occurrences of each distinct note, and the number of collected notes.
ecoscore_grad_counts = Counter(ecoscore_grad_list)
total_ecoscore_grads = len(ecoscore_grad_list)

# Print the share of each note and keep it for the treemap.
data_graph_ecoscore_grad = []
for ecoscore_grad, count in ecoscore_grad_counts.items():
    percentage = count / total_ecoscore_grads * 100
    print(f"{ecoscore_grad}: {percentage:.2f}%")
    data_graph_ecoscore_grad.append(dict(ecoscore_grad=ecoscore_grad, percentage=percentage))

999.0: 74.62%
54.0: 0.35%
75.0: 0.48%
76.0: 0.38%
77.0: 0.51%
52.0: 0.17%
9.0: 0.14%
24.0: 0.15%
50.0: 0.43%
65.0: 0.16%
49.0: 0.65%
70.0: 0.29%
79.0: 1.84%
63.0: 0.22%
29.0: 0.21%
21.0: 0.50%
56.0: 0.17%
92.0: 0.04%
34.0: 0.92%
59.0: 0.21%
55.0: 0.23%
73.0: 0.22%
27.0: 0.45%
19.0: 0.32%
60.0: 0.38%
23.0: 0.17%
47.0: 0.21%
42.0: 0.46%
0.0: 0.62%
41.0: 0.17%
67.0: 1.32%
71.0: 0.27%
74.0: 0.31%
30.0: 0.27%
15.0: 0.43%
58.0: 0.26%
61.0: 0.15%
39.0: 0.51%
91.0: 0.07%
53.0: 0.19%
44.0: 0.38%
14.0: 0.10%
72.0: 0.99%
68.0: 0.28%
12.0: 0.33%
18.0: 0.20%
5.0: 0.10%
51.0: 0.22%
66.0: 0.22%
69.0: 0.18%
43.0: 0.39%
8.0: 0.10%
2.0: 0.12%
22.0: 0.26%
46.0: 0.13%
38.0: 0.31%
26.0: 0.13%
10.0: 0.04%
36.0: 0.36%
37.0: 0.39%
78.0: 0.36%
25.0: 0.34%
64.0: 0.24%
4.0: 0.19%
40.0: 0.20%
35.0: 0.31%
20.0: 0.11%
32.0: 0.20%
85.0: 0.08%
62.0: 0.22%
31.0: 0.16%
28.0: 0.19%
89.0: 0.04%
48.0: 0.21%
83.0: 0.06%
16.0: 0.27%
90.0: 0.05%
57.0: 0.24%
45.0: 0.33%
100.0: 0.31%
33.0: 0.12%
82.0: 0.06%
13.0: 0.07%
81.0: 0

In [28]:
# Treemap of the ecoscore notes: tile size and colour both encode the percentage.
df = pd.DataFrame(data_graph_ecoscore_grad)
fig = px.treemap(
    df,
    path=['ecoscore_grad'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore grad treemap',
)
# Dark theme: black backgrounds with white text.
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

# ECOSCORE GROUPS:

In [29]:
def extract_groups_ecoscore_from_jsonl(file_path):
    """Return the ``ecoscore_groups`` value of every record in a JSONL file.

    Records whose ``ecoscore_groups`` field is missing or null are skipped. A
    line that is not valid JSON is reported on standard output and ignored.

    Args:
        file_path: path of the UTF-8 encoded JSONL file to read.

    Returns:
        list: the collected values, in file order (duplicates are kept).
    """
    letters = []
    with open(file_path, 'r', encoding='utf-8') as source:
        for text in source:
            try:
                letter = json.loads(text).get('ecoscore_groups')
                if letter is None:
                    continue
                letters.append(letter)
            except json.JSONDecodeError:
                print(f"WARNING line: {text} in {file_path}")
    return letters
# Read the `ecoscore_groups` letter of every product in the file.
ecoscore_groups_list = extract_groups_ecoscore_from_jsonl(jsonl_02)

# Occurrences of each distinct letter, and the number of collected letters.
ecoscore_groups_counts = Counter(ecoscore_groups_list)
total_ecoscore_groups = len(ecoscore_groups_list)

# Print the share of each letter and keep it for the treemap.
data_graph_ecoscore_groups = []
for ecoscore_group, count in ecoscore_groups_counts.items():
    percentage = count / total_ecoscore_groups * 100
    print(f"{ecoscore_group}: {percentage:.2f}%")
    data_graph_ecoscore_groups.append(dict(ecoscore_group=ecoscore_group, percentage=percentage))


z: 74.62%
c: 5.59%
b: 9.01%
e: 3.51%
d: 6.06%
a: 1.21%


In [30]:
# Treemap of the ecoscore letters: tile size and colour both encode the percentage.
df = pd.DataFrame(data_graph_ecoscore_groups)
fig = px.treemap(
    df,
    path=['ecoscore_group'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore groups treemap',
)
# Dark theme: black backgrounds with white text.
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

# LABELS:

In [31]:
def extract_labels_from_jsonl(file_path):
    """Return the ``labels_note`` value of every record in a JSONL file.

    Records whose ``labels_note`` field is missing or null are skipped. A line
    that is not valid JSON is reported on standard output and ignored. The
    returned list keeps duplicates; the caller aggregates them for display.

    Args:
        file_path: path of the UTF-8 encoded JSONL file to read.

    Returns:
        list: the collected values, in file order.
    """
    notes = []
    with open(file_path, 'r', encoding='utf-8') as stream:
        for row in stream:
            try:
                item = json.loads(row)
            except json.JSONDecodeError:
                print(f"WARNING line: {row} in {file_path}")
                continue
            note = item.get('labels_note')
            if note is not None:
                notes.append(note)
    return notes

# Read the `labels_note` value of every product in the file.
labels_list = extract_labels_from_jsonl(jsonl_02)
# Occurrences of each distinct note, and the number of collected notes.
label_counts = Counter(labels_list)
total_labels = len(labels_list)

# Print the share of each note and keep it for the treemap.
data_graph_labels = []
for label, count in label_counts.items():
    percentage = count / total_labels * 100
    print(f"{label}: {percentage:.2f}%")
    data_graph_labels.append(dict(labels=label, percentage=percentage))

0: 70.95%
6: 0.88%
1: 11.32%
5: 1.11%
2: 8.13%
3: 4.34%
4: 2.05%
8: 0.29%
7: 0.47%
9: 0.46%


In [32]:
# Treemap of the label notes: tile size and colour both encode the percentage.
df = pd.DataFrame(data_graph_labels)
fig = px.treemap(
    df,
    path=['labels'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='labels treemap',
)
# Dark theme: black backgrounds with white text.
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

# NUMBER OF NONE VALUES PER FIELD:

In [33]:
def count_none_and_total_values(jsonl_file_path):
    """Count, per field, how many records carry it and how many hold ``None``.

    A line that is not valid JSON (a blank line, for instance) is reported on
    standard output and skipped, like the other JSONL readers of this
    notebook do, instead of aborting the whole pass.

    Args:
        jsonl_file_path: path of the UTF-8 encoded JSONL file to read.

    Returns:
        tuple: ``(none_counts, total_counts)``, two ``defaultdict(int)`` keyed
        by field name. ``total_counts[key]`` is the number of records that
        contain ``key``; ``none_counts[key]`` is the number of those whose
        value is ``None``. A field that is never ``None`` has no entry in
        ``none_counts``.
    """
    none_counts = defaultdict(int)
    total_counts = defaultdict(int)
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            for key, value in data.items():
                total_counts[key] += 1
                if value is None:
                    none_counts[key] += 1
    return none_counts, total_counts

def calculate_percentage(none_counts, total_counts):
    """Return, per key, the percentage of ``None`` values among all values.

    Only the keys present in ``none_counts`` are reported. A key that is
    absent from ``total_counts`` (or whose total is not positive) gets 0. The
    lookup uses ``.get`` so that ``total_counts`` is never modified, even when
    it is a ``defaultdict``.

    NOTE: a later cell of this notebook defines another function with the same
    name but a ``(count, total)`` signature; once that cell has run, this
    definition is shadowed and this cell must be re-run before using it again.

    Args:
        none_counts: mapping of key -> number of ``None`` values.
        total_counts: mapping of key -> total number of values.

    Returns:
        dict: key -> percentage (0 to 100 when the counts are consistent).
    """
    percentages = {}
    for key, none_count in none_counts.items():
        total = total_counts.get(key, 0)
        percentages[key] = (none_count / total) * 100 if total > 0 else 0
    return percentages

# Per-field counts of None values and of all values across the whole file.
none_counts, total_counts = count_none_and_total_values(jsonl_02)
# Percentage of None values for each field that has at least one None entry.
percentages = calculate_percentage(none_counts, total_counts)

In [34]:
# One row per field with its percentage of None values.
data_graph_none = [
    {'key': field, 'percentage': share}
    for field, share in percentages.items()
]
# Treemap of the fields: tile size and colour both encode the percentage.
df = pd.DataFrame(data_graph_none)
fig = px.treemap(
    df,
    path=['key'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='none treemap',
)
# Dark theme: black backgrounds with white text.
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

# MISSING ECOSCORE (placeholder values `z` and `999`):

In [35]:
def count_specific_values(jsonl_file_path):
    """Count the ecoscore placeholder values found in a JSONL file.

    For ``ecoscore_groups`` the counted value is ``'z'``; for
    ``ecoscore_note`` it is ``999``. The file is read as UTF-8, like the other
    readers of this notebook, so decoding does not depend on the platform
    locale. A line that is not valid JSON is reported on standard output and
    skipped.

    Args:
        jsonl_file_path: path of the UTF-8 encoded JSONL file to read.

    Returns:
        tuple: ``(counts, total_counts)`` where
        ``counts == {'ecoscore_groups': {'z': n}, 'ecoscore_note': {999: m}}``
        and ``total_counts`` maps each of the two fields to the number of
        records that contain it (whatever its value).
    """
    counts = {
        'ecoscore_groups': {'z': 0},
        'ecoscore_note': {999: 0}
    }
    total_counts = {
        'ecoscore_groups': 0,
        'ecoscore_note': 0
    }
    # Value to count for each tracked field.
    placeholders = {'ecoscore_groups': 'z', 'ecoscore_note': 999}
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            # Look the two tracked fields up directly instead of scanning
            # every key of every record.
            for key, placeholder in placeholders.items():
                if key not in data:
                    continue
                total_counts[key] += 1
                if data[key] == placeholder:
                    counts[key][placeholder] += 1
    return counts, total_counts

def calculate_percentage(count, total):
    """Return ``count`` as a percentage of ``total``, or 0 when ``total`` is not positive.

    NOTE: this reuses the name of a function defined in an earlier cell with a
    different ``(none_counts, total_counts)`` signature; from this cell on,
    the earlier definition is shadowed.

    Args:
        count: number of matching items.
        total: number of items considered.

    Returns:
        ``(count / total) * 100`` when ``total > 0``, otherwise ``0``.
    """
    return (count / total) * 100 if total > 0 else 0

# Occurrences of the placeholder values ('z' group, 999 note) and the number of
# records carrying each of the two fields.
counts, total_counts = count_specific_values(jsonl_02)

number_999 = counts['ecoscore_note'][999]
z_percentage = calculate_percentage(
    counts['ecoscore_groups']['z'],
    total_counts['ecoscore_groups'],
)
number_999_percentage = calculate_percentage(number_999, total_counts['ecoscore_note'])

# Parallel lists consumed by the bar chart: category label, percentage, raw count.
labels = ['Ecoscore Groups (z)', 'Ecoscore Note (999)']
values = [z_percentage, number_999_percentage]
counts_values = [counts['ecoscore_groups']['z'], number_999]

In [36]:
# Grouped bar chart of the ecoscore placeholder values: one bar for the
# percentage and one for the raw count of each category.
# Percentages are bounded by 100 while the counts are raw record numbers, so
# each series gets its own y-axis (percentage on the left, count on the right);
# on a single shared axis the percentage bars would be flattened by the counts.
# Distinct `offsetgroup` values keep the bars side by side even though they
# are drawn against different axes.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=labels,
    y=values,
    name='Pourcentage',
    text=[f'{v:.2f}%' for v in values],
    textposition='auto',
    yaxis='y',
    offsetgroup='percentage'
))
fig.add_trace(go.Bar(
    x=labels,
    y=counts_values,
    name='Nombre',
    text=[str(v) for v in counts_values],
    textposition='auto',
    yaxis='y2',
    offsetgroup='count'
))
fig.update_layout(
    title='percentage empty data ecoscore group and grad',
    xaxis_title='Catégories',
    yaxis=dict(title='Pourcentage (%)'),
    yaxis2=dict(title='Nombre', overlaying='y', side='right'),
    barmode='group'
)
fig.show()

# TOTAL NUMBER OF PRODUCTS:

In [37]:
# Count the products directly from the file (one JSON record per non-blank
# line). The previous version printed `total_labels`, i.e. the number of
# records with a non-null `labels_note`, which equals the product count only
# if every record carries that field, and which required the labels cell to
# have been run first.
with open(jsonl_02, 'r', encoding='utf-8') as file:
    total_products = sum(1 for line in file if line.strip())
print(f"total product number: {total_products}")

total product number: 3177829


# JSONL SAMPLE:

In [38]:
# Load the sample JSONL file (one JSON record per line) into a DataFrame.
df = pd.read_json(jsonl_sample, lines=True)
# Preview at most the first 60 rows.
df.head(60)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,,none,"kroger, potato salad",z,999,11110991881,united states,"potato, vegetable, root-vegetable, tuber, mayonnaise, sauce, water, mustard, onion, onion-family-vegetable, sugar, added-sugar, disaccharide, contains-2-and-less-of, carrot, taproot-vegetable, bell-pepper, fruit-vegetable, celery, stalk-vegetable, salt, soya-oil, oil-and-fat, vegetable-oil-and-fat, vegetable-oil, wheat-germ, cereal, wheat, high-fructose-corn-syrup, monosaccharide, fructose, glucose, corn-syrup, glucose-fructose-syrup, e301, e300, e415, lactic-and-acetic-acids, e330, e435, natural-flavouring, flavouring, e325, spice, condiment, e160ai, e160a, turmeric, e385, e202, soybean-oil-vinegar, egg-yolk, egg, distilled-vinegar, vinegar, mustard-seed, colour, preservative",salted-snacks,0
1,milk and dairy products,none,"crystal farms, fat free american cheese",d,34,75925286006,united states,"skim-milk-cheese, water, whey, dairy, milk-protein-concentrate-buttermilk, sodium-citrate, minerals, sodium, sugar, added-sugar, disaccharide, maltodextrin, salt, e200, e407, annatto-and-b-apo-8-carotenal, pasteurized-skim-milk-cheese-culture, enzyme, preservative, colour","dairies, fermented-foods, fermented-milk-products, cheeses",0
2,fruits and vegetables,none,cut corn,z,999,78742110301,united states,"corn, cereal","plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, vegetables-based-foods, frozen-foods, frozen-plant-based-foods, frozen-vegetables",0
3,cereals and potatoes,none,"enriched macaroni product, fettuccini pasta",z,999,92825104957,united states,"durum-wheat-semolina, cereal, wheat, durum-wheat, semolina, iron, minerals, b-vitamins, e375, thiamin-mononitrate, thiamin, e101, folic-acid, folate, ferrous-sulfate","plant-based-foods-and-beverages, plant-based-foods, cereals-and-potatoes, cereals-and-their-products, pastas",0
4,fish meat eggs,sous-vide,barbecue ribs original,e,7,20639037,france,"intercostal-meat-from-pork-loin, salt, sugar, added-sugar, disaccharide, sunflower-oil, oil-and-fat, vegetable-oil-and-fat, vegetable-oil, spice, condiment, dehydrated-aromatic-plants, smoke-flavouring, flavouring, natural-flavouring, possible-mustard-traces","meats-and-their-products, beef-and-its-products, meats, beef, pork-and-its-products, pork, beef-short-ribs",2
5,fruits and vegetables,"plastique, bouchon de bouteille, film, flacon, bouchon de bouteille de vin",ail semoule,a,80,20864828,france,"dried-garlic, vegetable, root-vegetable, onion-family-vegetable, garlic","plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, condiments, vegetables-based-foods, culinary-plants, dried-products, dried-plant-based-foods, garlic-and-their-products, garlic, ground-dried-vegetables, garlic-powder, groceries",1
6,,none,le caramel poire pointe de sel,z,999,3291810769967,france,"sugar, added-sugar, disaccharide, cream, dairy, pear, fruit, water, egg-white, egg, e150, wheat-flour, cereal, flour, wheat, cereal-flour, glucose-syrup, monosaccharide, glucose, whole-milk-powder, milk-powder, modified-starch, starch, gelling-agent, milk-chocolate, chocolate, butter, stabiliser, non-hydrogenated-sunflower-oil, oil-and-fat, vegetable-oil-and-fat, vegetable-oil, sunflower-oil, colour, glucose-fructose-syrup, fructose, emulsifier, e407, e160c, e330, soya-lecithin, e322, e322i",,0
7,,none,mousseline de carottes a la crème fraîche et au beurre,z,999,3700428484235,france,,,0
8,cereals and potatoes,none,extra original,c,59,5053827187466,"france, none, none","whole-grain-oat-flakes, cereal, oat, oat-flakes, sugar, added-sugar, disaccharide, palm-oil, oil-and-fat, vegetable-oil-and-fat, palm-oil-and-fat, wheat-flour, flour, wheat, cereal-flour, desiccated-coconut, fruit, coconut, molasses, salt, barley-malt-extract, malt, malted-barley, cinnamon, condiment, spice","plant-based-foods-and-beverages, plant-based-foods, breakfasts, cereals-and-potatoes, cereals-and-their-products, breakfast-cereals, mueslis",3
9,sugary snacks,none,amandes pralinés gianduja,d,22,3489811229126,france,,"snacks, sweet-snacks, cocoa-and-its-products, confectioneries, chocolate-candies, bonbons, chocolate-covered-nuts, chocolate-covered-almonds",0


In [39]:
def display_first_n_lines_to_dataframe(jsonl_02, num_lines):
    """Load the first ``num_lines`` records of a JSONL file into a DataFrame.

    Reading stops as soon as ``num_lines`` lines have been consumed, so the
    rest of the file is never parsed.

    Args:
        jsonl_02: path of the UTF-8 encoded JSONL file to read.
        num_lines: maximum number of lines to load.

    Returns:
        pandas.DataFrame: one row per loaded record.
    """
    records = []
    with open(jsonl_02, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if index >= num_lines:
                break
            records.append(json.loads(raw_line.strip()))
    return pd.DataFrame(records)

# Load the first 60 records of the jsonl_02 file for a visual check.
df_check = display_first_n_lines_to_dataframe(jsonl_02, num_lines=60)

In [40]:
# Display up to the first 60 rows of df_check.
df_check.head(60)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,,none,brandenburger stachelbeere,z,999.0,0,"none, none",,,0
1,sugary snacks,"30g bottle, slim bottle",chocolat au lait,z,999.0,0,"france, united states","mode-d-utilisation-utilisez-la-ut-assaisonner-les-potes-les-pizzas-et-lasagnes, prete-o-l-emploi, 182, alba, www-cascinasancassiano-com, pourioogde-produit-dtmurts-leg-prunes-lg-0-3b-4-22-cascina-san-cassiano-c-so-piave","snacks, sweet-snacks, confectioneries, chewing-gum, sugar-free-chewing-gum",0
2,,none,vitória crackers,z,999.0,17,france,,,0
3,fat and sauces,none,moutarde au moût de raisin,c,54.0,100,france,,"condiments, sauces, mustards, groceries",0
4,,none,sauce sweety chili 0%,z,999.0,123,france,,,0
5,,none,mendiants,z,999.0,291,france,,,0
6,composite foods,none,salade de carottes râpées,b,75.0,949,france,,"plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, vegetables-based-foods, meals, vegetables, prepared-vegetables, carrots, carrot-salads, grated-carrots, seasoned-grated-carrots",0
7,,none,fromage blanc aux myrtilles,z,999.0,970,france,,,0
8,,none,solène céréales poulet,z,999.0,1199,france,"antioxidant, colour, tomato, vegetable, fruit-vegetable, mayonnaise, sauce, e316, e150, colza-oil, oil-and-fat, vegetable-oil-and-fat, rapeseed-oil, water, egg-yolk, egg, vinegar, mustard, salt, dextrose, added-sugar, monosaccharide, glucose, stabiliser, preservative, flavouring, mustard-seed, condiment, spice, turmeric, e466, e202, e160a",,0
9,composite foods,"plastique, barquette",tarte noix de coco,z,999.0,1281,france,,"sweet-pies, pies, coconut-pies",0
