In [82]:
import pandas as pd
import numpy as np
import json
import os
import warnings
from datetime import datetime 
from collections import OrderedDict
from collections import Counter
import plotly.express as px
from collections import defaultdict
import plotly.graph_objects as go


# Notebook display settings: show up to 100 rows and never truncate cell text.
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None

# Silence pandas' DtypeWarning for the whole session.
warnings.filterwarnings(action="ignore", category=pd.errors.DtypeWarning)

In [83]:
# Run configuration: chunk size, file identifier and data locations.
chunk_size = 1000
file_id = '01'
project_path = "/home/carolus/Documents/school/green_ia/"
jsonl_02 = f"{project_path}data/{file_id}_openfoodfacts_04.jsonl"
jsonl_sample = f"{project_path}data/{file_id}_openfoodfacts_sample.jsonl"

# Capture the current date/time once so both stamps describe the same instant.
date_format = "%d/%m/%Y %H:%M:%S.%f"
current_date_time = datetime.now()
# %f renders six microsecond digits; dropping the last three keeps milliseconds.
start_date = current_date_time.strftime(date_format)[:-3]
date_code = current_date_time.strftime('%d%m%Y%H%M%S%f')[:-3]

In [84]:
def add_logs(logData):
    """Write a log entry to standard output.

    Args:
        logData: The value to log; it is handed to ``print`` unchanged.
    """
    # File logging (appending to a per-run file under `{project_path}logs/`)
    # is currently disabled; entries only go to stdout.
    print(logData)

In [85]:
# Check the structure of the JSONL file: every line must decode as JSON.
invalid_line_count = 0
with open(jsonl_02, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            invalid_line_count += 1
            add_logs(f"ERROR decoding jsonl (line {line_number}): {e}")

# Only report the file as valid when no line failed to decode; previously the
# "valid" message was logged unconditionally, even after decoding errors.
if invalid_line_count == 0:
    add_logs(f"jsonl format valid: {jsonl_02}")
else:
    add_logs(f"jsonl format invalid: {invalid_line_count} undecodable line(s) in {jsonl_02}")

jsonl format valid: /home/carolus/Documents/school/green_ia/data/01_openfoodfacts_04.jsonl


# COUNTRIES:

In [86]:
def extract_countries_from_jsonl(file_path):
    """Return the `countries` value of every record in a JSONL file.

    Args:
        file_path: Path of the JSONL file, read as UTF-8.

    Returns:
        A list, in file order, of each record's `countries` value; records
        where the key is missing or null are left out. Lines that cannot be
        decoded as JSON are reported with a printed warning and skipped.
    """
    found = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                value = json.loads(raw_line).get('countries')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if value is not None:
                found.append(value)
    return found
countries_list = extract_countries_from_jsonl(jsonl_02)

# An entry may hold several comma-separated countries: flatten them into one
# list of individual, whitespace-stripped names.
separated_countries = [
    name.strip()
    for entry in countries_list
    for name in entry.split(',')
]

country_counts = Counter(separated_countries)
total_countries = len(separated_countries)

# Share of each country among all country mentions, in percent.
data_graph_countries = [
    {'countries': country, 'percentage': (count / total_countries) * 100}
    for country, count in country_counts.items()
]
for row in data_graph_countries:
    print(f"{row['countries']}: {row['percentage']:.2f}%")

: 100.00%


In [87]:
# Treemap of the country shares, coloured by percentage.
df = pd.DataFrame(data_graph_countries)
fig = px.treemap(
    df,
    path=['countries'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='countries treemap',
)
# Dark theme: black backgrounds with white text.
dark_layout = dict(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.update_layout(**dark_layout)
fig.show()

# ECOSCORE GRAD:

In [88]:
def extract_grad_ecoscore_from_jsonl(file_path):
    """Return the `ecoscore_note` value of every record in a JSONL file.

    Args:
        file_path: Path of the JSONL file, read as UTF-8.

    Returns:
        A list, in file order, of each record's `ecoscore_note` value; records
        where the key is missing or null are left out. Lines that cannot be
        decoded as JSON are reported with a printed warning and skipped.
    """
    scores = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                score = json.loads(raw_line).get('ecoscore_note')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if score is None:
                continue
            scores.append(score)
    return scores
ecoscore_grad_list = extract_grad_ecoscore_from_jsonl(jsonl_02)

ecoscore_grad_counts = Counter(ecoscore_grad_list)
total_ecoscore_grads = len(ecoscore_grad_list)

# Share of each distinct ecoscore note among all collected notes, in percent.
data_graph_ecoscore_grad = [
    {'ecoscore_grad': grade, 'percentage': (occurrences / total_ecoscore_grads) * 100}
    for grade, occurrences in ecoscore_grad_counts.items()
]
for row in data_graph_ecoscore_grad:
    print(f"{row['ecoscore_grad']}: {row['percentage']:.2f}%")

32.0: 0.80%
59.0: 0.85%
43.0: 1.52%
34.0: 3.60%
67.0: 5.21%
0.0: 2.43%
65.0: 0.62%
23.0: 0.70%
79.0: 7.28%
12.0: 1.31%
57.0: 0.96%
4.0: 0.74%
35.0: 1.23%
37.0: 1.54%
71.0: 1.10%
63.0: 0.62%
10.0: 0.17%
68.0: 1.10%
86.0: 0.12%
41.0: 0.66%
25.0: 1.34%
14.0: 0.40%
52.0: 0.67%
60.0: 1.49%
77.0: 2.09%
19.0: 1.26%
13.0: 0.29%
76.0: 1.45%
50.0: 1.69%
78.0: 1.40%
39.0: 1.95%
16.0: 1.07%
53.0: 0.74%
36.0: 1.41%
21.0: 1.96%
22.0: 1.01%
91.0: 0.26%
5.0: 0.39%
75.0: 1.82%
54.0: 1.39%
26.0: 0.50%
3.0: 0.32%
1.0: 0.26%
49.0: 2.56%
66.0: 0.84%
44.0: 1.50%
72.0: 3.90%
100.0: 1.22%
27.0: 1.77%
30.0: 1.07%
2.0: 0.46%
55.0: 0.91%
74.0: 1.23%
31.0: 0.62%
38.0: 1.26%
81.0: 0.34%
20.0: 0.45%
64.0: 1.17%
45.0: 1.29%
9.0: 0.56%
58.0: 1.01%
18.0: 0.89%
40.0: 0.81%
51.0: 0.84%
8.0: 0.39%
17.0: 0.44%
84.0: 0.17%
15.0: 1.69%
42.0: 1.83%
94.0: 0.14%
46.0: 0.51%
62.0: 1.09%
83.0: 0.25%
73.0: 0.85%
48.0: 0.83%
90.0: 0.21%
70.0: 1.12%
24.0: 0.59%
85.0: 0.31%
61.0: 0.36%
99.0: 0.10%
6.0: 0.43%
28.0: 0.75%
47.0: 0.82%


In [89]:
# Treemap of the ecoscore note shares, coloured by percentage.
df = pd.DataFrame(data_graph_ecoscore_grad)
fig = px.treemap(
    df,
    path=['ecoscore_grad'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore grad treemap',
)
# Dark theme: black backgrounds with white text.
dark_layout = dict(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.update_layout(**dark_layout)
fig.show()

# ECOSCORE GROUPS:

In [90]:
def extract_groups_ecoscore_from_jsonl(file_path):
    """Return the `ecoscore_groups` value of every record in a JSONL file.

    Args:
        file_path: Path of the JSONL file, read as UTF-8.

    Returns:
        A list, in file order, of each record's `ecoscore_groups` value;
        records where the key is missing or null are left out. Lines that
        cannot be decoded as JSON are reported with a printed warning and
        skipped.
    """
    letters = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                letter = json.loads(raw_line).get('ecoscore_groups')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
            else:
                if letter is not None:
                    letters.append(letter)
    return letters
ecoscore_groups_list = extract_groups_ecoscore_from_jsonl(jsonl_02)

ecoscore_groups_counts = Counter(ecoscore_groups_list)
total_ecoscore_groups = len(ecoscore_groups_list)

# Share of each ecoscore group among all collected groups, in percent.
data_graph_ecoscore_groups = [
    {'ecoscore_group': group, 'percentage': (occurrences / total_ecoscore_groups) * 100}
    for group, occurrences in ecoscore_groups_counts.items()
]
for row in data_graph_ecoscore_groups:
    print(f"{row['ecoscore_group']}: {row['percentage']:.2f}%")


d: 23.89%
c: 22.05%
b: 35.48%
e: 13.80%
a: 4.78%


In [91]:
# Treemap of the ecoscore group shares, coloured by percentage.
df = pd.DataFrame(data_graph_ecoscore_groups)
fig = px.treemap(
    df,
    path=['ecoscore_group'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore groups treemap',
)
# Dark theme: black backgrounds with white text.
dark_layout = dict(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.update_layout(**dark_layout)
fig.show()

# LABELS:

In [92]:
def extract_labels_from_jsonl(file_path):
    """Return the `labels_note` value of every record in a JSONL file.

    Duplicates are kept: one value is returned per record that has one.

    Args:
        file_path: Path of the JSONL file, read as UTF-8.

    Returns:
        A list, in file order, of each record's `labels_note` value; records
        where the key is missing or null are left out. Lines that cannot be
        decoded as JSON are reported with a printed warning and skipped.
    """
    notes = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                note = json.loads(raw_line).get('labels_note')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if note is not None:
                notes.append(note)
    return notes

labels_list = extract_labels_from_jsonl(jsonl_02)
label_counts = Counter(labels_list)
total_labels = len(labels_list)

# Share of each labels note among all collected notes, in percent.
data_graph_labels = [
    {'labels': note, 'percentage': (occurrences / total_labels) * 100}
    for note, occurrences in label_counts.items()
]
for row in data_graph_labels:
    print(f"{row['labels']}: {row['percentage']:.2f}%")

7: 0.90%
2: 10.05%
0: 60.27%
3: 6.02%
4: 3.41%
1: 14.29%
5: 1.94%
9: 0.99%
8: 0.58%
6: 1.54%


In [93]:
# Treemap of the labels note shares, coloured by percentage.
df = pd.DataFrame(data_graph_labels)
fig = px.treemap(
    df,
    path=['labels'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='labels treemap',
)
# Dark theme: black backgrounds with white text.
dark_layout = dict(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.update_layout(**dark_layout)
fig.show()

# NUMBER OF NONE VALUES:

In [94]:
def count_none_and_total_values(jsonl_file_path):
    """Count, per key, how often it appears and how often its value is null.

    Args:
        jsonl_file_path: Path of the JSONL file, read as UTF-8.

    Returns:
        A ``(none_counts, total_counts)`` pair of ``defaultdict(int)``:
        ``total_counts[key]`` is the number of records containing ``key`` and
        ``none_counts[key]`` the number of those whose value is ``None``.
        Keys that are never ``None`` do not appear in ``none_counts``.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
    """
    none_counts = defaultdict(int)
    total_counts = defaultdict(int)
    with open(jsonl_file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            for field, field_value in record.items():
                total_counts[field] += 1
                if field_value is not None:
                    continue
                none_counts[field] += 1
    return none_counts, total_counts

def calculate_percentage(none_counts, total_counts):
    """Return, for each key of `none_counts`, its share of null values in percent.

    Args:
        none_counts: Mapping of key -> number of null values.
        total_counts: Mapping of key -> number of occurrences.

    Returns:
        A dict with the keys of `none_counts`; each value is
        ``none / total * 100``, or ``0`` when the total is not positive.
    """
    return {
        key: (none_counts[key] / total_counts[key]) * 100 if total_counts[key] > 0 else 0
        for key in none_counts
    }

# Per-key null and occurrence counts over the whole file, then the null share
# of each key in percent.
none_counts, total_counts = count_none_and_total_values(jsonl_02)
percentages = calculate_percentage(none_counts, total_counts)

In [95]:
# One row per key with its percentage of null values.
data_graph_none = []
for field, none_share in percentages.items():
    data_graph_none.append({'key': field, 'percentage': none_share})

# Treemap of the null shares, coloured by percentage.
df = pd.DataFrame(data_graph_none)
fig = px.treemap(
    df,
    path=['key'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='none treemap',
)
# Dark theme: black backgrounds with white text.
dark_layout = dict(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.update_layout(**dark_layout)
fig.show()

# ECOSCORE NULL:

In [96]:
def count_specific_values(jsonl_file_path):
    """Count the sentinel values 'z' and 999 in the ecoscore fields.

    The file is read as UTF-8, like the other JSONL readers in this notebook,
    so decoding does not depend on the platform's default encoding.

    Args:
        jsonl_file_path: Path of the JSONL file.

    Returns:
        A ``(counts, total_counts)`` pair:
        ``counts == {'ecoscore_groups': {'z': n}, 'ecoscore_note': {999: m}}``
        with the number of records whose value equals the sentinel, and
        ``total_counts`` mapping each of the two keys to the number of
        records that contain it.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # Sentinel tracked for each field.
    # NOTE(review): presumably placeholders for a missing ecoscore — confirm.
    sentinels = {'ecoscore_groups': 'z', 'ecoscore_note': 999}
    counts = {key: {sentinel: 0} for key, sentinel in sentinels.items()}
    total_counts = dict.fromkeys(sentinels, 0)

    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            for key, sentinel in sentinels.items():
                if key not in data:
                    continue
                total_counts[key] += 1
                if data[key] == sentinel:
                    counts[key][sentinel] += 1
    return counts, total_counts

def calculate_percentage(count, total):
    """Return `count` as a percentage of `total`, or 0 when `total` is not positive.

    NOTE: this redefines the `calculate_percentage(none_counts, total_counts)`
    declared in the NONE NUMBER section; after this cell runs, the name refers
    to this scalar version.
    """
    return (count / total) * 100 if total > 0 else 0

# NOTE: `total_counts` is rebound here; it previously held the per-key totals
# returned by count_none_and_total_values.
counts, total_counts = count_specific_values(jsonl_02)

# Raw number of sentinel occurrences for each field.
number_z = counts['ecoscore_groups']['z']
number_999 = counts['ecoscore_note'][999]

# The same occurrences as a share of the records that contain the field.
z_percentage = calculate_percentage(number_z, total_counts['ecoscore_groups'])
number_999_percentage = calculate_percentage(number_999, total_counts['ecoscore_note'])

# Series consumed by the bar chart.
labels = ['Ecoscore Groups (z)', 'Ecoscore Note (999)']
values = [z_percentage, number_999_percentage]
counts_values = [number_z, number_999]

In [97]:
# Grouped bar chart: for each category, one bar for the percentage and one
# for the raw count. Both traces are drawn in the same figure.
percentage_bar = go.Bar(
    x=labels,
    y=values,
    name='Pourcentage',
    text=[f'{v:.2f}%' for v in values],
    textposition='auto',
)
count_bar = go.Bar(
    x=labels,
    y=counts_values,
    name='Nombre',
    text=list(map(str, counts_values)),
    textposition='auto',
)

fig = go.Figure(data=[percentage_bar, count_bar])
fig.update_layout(
    title='percentage empty data ecoscore group and grad',
    xaxis_title='Catégories',
    yaxis_title='Valeurs',
    barmode='group',
)
fig.show()

# TOTAL NUMBER OF ARTICLES:

In [98]:
# Report the number of products.
# NOTE(review): `total_labels` counts only records with a non-null
# `labels_note`; it equals the number of products only if every record has
# one — confirm.
print(f"total product number: {total_labels}")

total product number: 1211079


# JSONL SAMPLE:

In [99]:
# Load the sample JSONL file (one JSON object per line) into a DataFrame and
# preview its first three rows.
df = pd.read_json(jsonl_sample, lines=True)
df.head(3)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,composite foods,none,gourmet pizza,,,750369051112,,"mozzarella-cheese-pizza, crust, rice-flour, flour, rice, modified-rice-starch, starch, modified-starch, rice-starch, potato-starch, canola, vegetable, root-vegetable, rapeseed, olive-oil-blend, evaporated-cane-sugar, tapioca-flour, tapioca, potato-flour, tuber, potato, fresh-yeast, yeast, baker-s-yeast, salt, e415, and-calcium-sulfate, sauce, tomato-puree, fruit-vegetable, tomato, parmigiano-reggiano, dairy, cheese, e460ii, e460, oregano, herb, pepper, seed, pasteurized-semi-skimmed-milk, milk, semi-skimmed-milk, lactic-ferments, ferment, microbial-culture, enzyme, water, added-for-freshness, e330, cultured-part-skim-cow-s-milk, anti-caking-agent, pasteurised-milk","meals, pizzas-pies-and-quiches, pizzas",0
1,fruits and vegetables,0,framboise noire hve3 100gx4,b,79.0,13251212512000151552,,"raspberry, fruit, berries","plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, fruits-based-foods, fruits, berries, raspberries",0
2,cereals and potatoes,,pan de pueblo,,,8420622349295,,,"plant-based-foods-and-beverages, plant-based-foods, cereals-and-potatoes, breads, white-breads",0


In [100]:
def display_first_n_lines_to_dataframe(jsonl_02, num_lines):
    """Load the first `num_lines` records of a JSONL file into a DataFrame.

    Args:
        jsonl_02: Path of the JSONL file, read as UTF-8.
        num_lines: Maximum number of lines to read from the top of the file.

    Returns:
        A DataFrame with one row per decoded line, in file order.

    Raises:
        json.JSONDecodeError: If one of the lines read is not valid JSON.
    """
    records = []
    with open(jsonl_02, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            # Stop as soon as the requested number of lines has been read.
            if index >= num_lines:
                break
            records.append(json.loads(raw_line.strip()))
    return pd.DataFrame(records)

# Load the first 60 records of the main JSONL file for a visual check.
df_check = display_first_n_lines_to_dataframe(jsonl_02, num_lines=60)

In [101]:
# Display up to 60 rows of the check frame.
df_check.head(60)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,fish meat eggs,"plastique, frais, barquette en carton, barquette en plastique",jambon cuit à l'étouffée,d,32.0,7613036624930,,"ham, animal, meat, pork, pork-meat, salt, natural-flavouring, flavouring, beet-sugar, added-sugar, disaccharide, sugar, e300","meats-and-their-products, meats, prepared-meats, hams, white-hams",7
1,fat and sauces,tube,soft mustard,c,59.0,7610145651246,,"water, mustard-seed, condiment, mustard, spice, table-vinegar, vinegar, salt, sugar, added-sugar, disaccharide, turmeric","condiments, sauces, mustards, groceries",2
2,,none,origan,c,43.0,7624841337090,,,"plant-based-foods-and-beverages, plant-based-foods, condiments, culinary-plants, aromatic-plants, herbs, oregano",0
3,milk and dairy products,frais,fagotin light,d,34.0,5410578004198,,"pasteurised-milk, dairy, milk, milk-proteins, protein, animal-protein, salt, lactic-ferments, ferment, microbial-culture, microbial-coagulating-enzyme, enzyme, coagulating-enzyme, colour, e160b","dairies, fermented-foods, fermented-milk-products, cheeses",0
4,salty snacks,none,munchos,b,67.0,60410048023,,"potato-flakes, vegetable, root-vegetable, tuber, potato, vegetable-oil, oil-and-fat, vegetable-oil-and-fat, cornmeal, cereal, corn, potato-starch, starch, salt, yeast, e450i, e450, e330","plant-based-foods-and-beverages, plant-based-foods, snacks, cereals-and-potatoes, salty-snacks, appetizers, chips-and-fries, crisps, potato-crisps",0
5,fish meat eggs,dose,thunfisch filets geschnitten,e,0.0,42285885,,,"seafood, fishes-and-their-products, canned-foods, fishes, fatty-fishes, canned-fishes, tunas, canned-tunas, tuna-in-brine",0
6,composite foods,"métal, boîte, conserve",choucroute garnie,b,67.0,3250390001461,,"vegetable, salt, potato, root-vegetable, tuber, prepared-meat, meat, water, mechanically-separated-poultry-meat, poultry, poultry-meat, mechanically-separated-meat, pork, animal, pork-by-product, wheat-flour, cereal, flour, wheat, cereal-flour, modified-potato-starch, starch, modified-starch, potato-starch, e250, e452, e407, e401, carob-seed-flour, legume, pulse, carob-bean-locust-bean, spice, condiment, glucose-syrup, added-sugar, monosaccharide, glucose, flavouring, minerals, e301, prepared-sauce, sauce, e621, pork-meat, lard, oil-and-fat, fat, animal-fat, pork-fat, juniper-berry, plant, juniper, coriander-seed, seed, herb, coriander, sauerkraut, brassica, cabbage, white-cabbage, cured-sausage","plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, canned-foods, vegetables-based-foods, meals, fermented-vegetables, sauerkrauts, canned-meals, sauerkraut-with-garnish",2
7,sugary snacks,none,chick-o-stick,,,41168631675,,"cane-sugar, added-sugar, disaccharide, sugar, corn-syrup, peanut-paste, nut, peanut, coconut, fruit, natural-and-artificial-flavouring, flavouring, natural-flavouring, artificial-flavouring, e102, e127, e129, e133, contains-fresh-ground-dry-roasted-peanuts, salt, and-tocopherol, preservative, e223, vitamin-e","snacks, sweet-snacks, confectioneries",0
8,fish meat eggs,plastica,bon roll con speck,,,8008110003159,,"turkey-meat, poultry, turkey, chicken, skimmed-milk-powder, dairy, milk-powder, egg, bacon, animal, meat, pork, pork-meat, wheat-flour, cereal, flour, wheat, cereal-flour, cheese, salt, milk-proteins, protein, animal-protein, butter, antioxidant, garlic, vegetable, root-vegetable, onion-family-vegetable, parsley, herb, leaf-vegetable, rosemary, flavouring, acidity-regulator, spice, condiment, dextrose, added-sugar, monosaccharide, glucose, preservative, e300, e330, e250, e301","meats-and-their-products, meat-preparations, meats",0
9,milk and dairy products,,petit fresa,b,65.0,8431876307832,,"pasteurized-skimmed-milk, dairy, milk, pasteurised-milk, skimmed-milk, sugar, added-sugar, disaccharide, cream, strawberry, fruit, berries, water, lactic-ferments, ferment, microbial-culture, natural-flavouring, flavouring, vitamin-d, acidity-regulator, e330","dairies, fermented-foods, fermented-milk-products, desserts, dairy-desserts, fermented-dairy-desserts, fermented-dairy-desserts-with-fruits, petits-suisses, petit-suisse-with-fruits",3
