In [1]:
import pandas as pd
import numpy as np
import json
import os
import warnings
from datetime import datetime 
from collections import OrderedDict
from collections import Counter
import plotly.express as px
from collections import defaultdict
import plotly.graph_objects as go


pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [2]:
# Run configuration: input file locations and timestamps identifying this run.
chunk_size = 1000
file_id = '02'
# NOTE(review): absolute path tied to one machine; adjust before running elsewhere.
project_path = "/home/carolus/Documents/school/green_ia/"
jsonl_02 = f"{project_path}data/{file_id}_openfoodfacts_02.jsonl"
jsonl_sample = f"{project_path}data/{file_id}_openfoodfacts_sample.jsonl"

# Capture the current date and time once so both timestamps below agree.
current_date_time = datetime.now()
date_format = "%d/%m/%Y %H:%M:%S.%f"
# Dropping the last three digits turns microseconds into milliseconds.
start_date = current_date_time.strftime(date_format)[:-3]
# Compact, separator-free timestamp with millisecond precision.
date_code = "{}{:03d}".format(
    current_date_time.strftime('%d%m%Y%H%M%S'),
    current_date_time.microsecond // 1000,
)

In [3]:
def add_logs(logData):
    """Print one log entry to standard output.

    Args:
        logData: The message (or any printable object) to report.
    """
    # Appending the entry to a per-run file under `logs/` is currently
    # disabled; only console output is produced.
    print(logData)

In [4]:
# Check that every line of the JSONL file parses as JSON.
# The outcome is only reported as valid when no line failed to decode;
# previously the "valid" message was logged even after decoding errors.
invalid_line_count = 0
with open(jsonl_02, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            invalid_line_count += 1
            add_logs(f"ERROR decoding jsonl: {e}")

if invalid_line_count == 0:
    add_logs(f"jsonl format valid: {jsonl_02}")
else:
    add_logs(f"jsonl format invalid: {invalid_line_count} undecodable line(s) in {jsonl_02}")

jsonl format valid: /home/carolus/Documents/school/green_ia/data/02_openfoodfacts_03.jsonl


# COUNTRIES:

In [5]:
# Returns the list of countries present in the file.
def extract_countries_from_jsonl(file_path):
    """Collect the `countries` value of every record in a JSONL file.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A list with one entry per record whose `countries` value is not None,
        in file order. Lines that are not valid JSON are reported with a
        printed warning and skipped.
    """
    found = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            value = parsed.get('countries')
            if value is None:
                continue
            found.append(value)
    return found
countries_list = extract_countries_from_jsonl(jsonl_02)

# A record may name several countries separated by commas: flatten them into
# one list of individual, whitespace-trimmed names.
# NOTE(review): empty tokens (e.g. from an empty `countries` string) are kept
# and show up as a country with an empty name — confirm this is intended.
separated_countries = [
    token.strip()
    for entry in countries_list
    for token in entry.split(',')
]

country_counts = Counter(separated_countries)
total_countries = sum(country_counts.values())

# Share of each country among all country mentions, as rows for the treemap.
data_graph_countries = [
    {'countries': country, 'percentage': (count / total_countries) * 100}
    for country, count in country_counts.items()
]
for row in data_graph_countries:
    print(f"{row['countries']}: {row['percentage']:.2f}%")

italy: 6.82%
belgium: 2.21%
france: 30.19%
spain: 8.85%
far: 8.35%
netherlands: 0.43%
canada: 2.45%
united states: 18.75%
united kingdom: 3.20%
poland: 0.23%
germany: 7.68%
sweden: 0.12%
norway: 0.09%
none: 0.37%
australia: 1.15%
ireland: 1.59%
world: 1.95%
switzerland: 2.43%
qatar: 0.02%
french polynesia: 0.01%
russia: 0.27%
czech republic: 0.09%
austria: 0.10%
luxembourg: 0.04%
puerto rico: 0.01%
new zealand: 0.07%
united arab emirates: 0.02%
morocco: 0.04%
croatia: 0.03%
singapore: 0.08%
philippines: 0.05%
slovakia: 0.03%
brazil: 0.38%
hungary: 0.04%
malaysia: 0.02%
mexico: 0.13%
israel: 0.01%
japan: 0.20%
china: 0.01%
kazakhstan: 0.00%
ukraine: 0.01%
taiwan: 0.00%
finland: 0.08%
greece: 0.02%
portugal: 0.11%
bulgaria: 0.09%
thailand: 0.06%
bolivia: 0.05%
: 0.09%
lithuania: 0.06%
senegal: 0.00%
south africa: 0.04%
india: 0.11%
algeria: 0.02%
argentina: 0.06%
indonesia: 0.06%
denmark: 0.08%
montenegro: 0.00%
romania: 0.09%
serbia: 0.02%
slovenia: 0.01%
uruguay: 0.01%
reunion: 0.03%
c

In [6]:
# Treemap of the share of each country, rendered on a dark background.
df = pd.DataFrame(data_graph_countries)
fig = px.treemap(
    df,
    path=['countries'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='countries treemap',
).update_layout(
    paper_bgcolor='black',
    plot_bgcolor='black',
    font_color='white',
)
fig.show()

# ECOSCORE GRADE:

In [7]:
# Returns the list of ecoscore notes present in the file.
def extract_grad_ecoscore_from_jsonl(file_path):
    """Collect the `ecoscore_note` value of every record in a JSONL file.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A list with one entry per record whose `ecoscore_note` is not None,
        in file order. Lines that are not valid JSON are reported with a
        printed warning and skipped.
    """
    notes = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
            else:
                note = parsed.get('ecoscore_note')
                if note is not None:
                    notes.append(note)
    return notes
ecoscore_grad_list = extract_grad_ecoscore_from_jsonl(jsonl_02)

# Frequency of each distinct ecoscore note among the records that have one.
ecoscore_grad_counts = Counter(ecoscore_grad_list)
total_ecoscore_grads = sum(ecoscore_grad_counts.values())

# One treemap row per distinct note, echoed to the output as it is built.
data_graph_ecoscore_grad = []
for ecoscore_grad, count in ecoscore_grad_counts.items():
    percentage = (count / total_ecoscore_grads) * 100
    data_graph_ecoscore_grad.append(
        {'ecoscore_grad': ecoscore_grad, 'percentage': percentage}
    )
    print(f"{ecoscore_grad}: {percentage:.2f}%")

999.0: 74.62%
55.0: 0.23%
69.0: 0.18%
74.0: 0.31%
25.0: 0.34%
14.0: 0.10%
78.0: 0.36%
41.0: 0.17%
15.0: 0.43%
57.0: 0.24%
65.0: 0.16%
75.0: 0.48%
95.0: 0.03%
32.0: 0.20%
67.0: 1.32%
21.0: 0.50%
31.0: 0.16%
72.0: 0.99%
49.0: 0.65%
19.0: 0.32%
79.0: 1.84%
82.0: 0.06%
50.0: 0.43%
60.0: 0.38%
76.0: 0.38%
100.0: 0.31%
51.0: 0.22%
43.0: 0.39%
77.0: 0.51%
59.0: 0.21%
63.0: 0.22%
3.0: 0.08%
34.0: 0.92%
58.0: 0.26%
56.0: 0.17%
39.0: 0.51%
1.0: 0.07%
22.0: 0.26%
27.0: 0.45%
91.0: 0.07%
45.0: 0.33%
26.0: 0.13%
18.0: 0.20%
44.0: 0.38%
36.0: 0.36%
71.0: 0.27%
53.0: 0.19%
97.0: 0.02%
66.0: 0.22%
0.0: 0.62%
64.0: 0.24%
5.0: 0.10%
73.0: 0.22%
29.0: 0.21%
54.0: 0.35%
38.0: 0.31%
28.0: 0.19%
61.0: 0.15%
37.0: 0.39%
6.0: 0.11%
2.0: 0.12%
84.0: 0.04%
70.0: 0.29%
42.0: 0.46%
35.0: 0.31%
33.0: 0.12%
68.0: 0.28%
94.0: 0.04%
8.0: 0.10%
30.0: 0.27%
89.0: 0.04%
62.0: 0.22%
16.0: 0.27%
85.0: 0.08%
48.0: 0.21%
4.0: 0.19%
11.0: 0.05%
47.0: 0.21%
20.0: 0.11%
96.0: 0.03%
12.0: 0.33%
7.0: 0.03%
9.0: 0.14%
40.0: 0.20%

In [8]:
# Treemap of the share of each ecoscore note, rendered on a dark background.
df = pd.DataFrame(data_graph_ecoscore_grad)
fig = px.treemap(
    data_frame=df,
    path=['ecoscore_grad'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore grad treemap',
)
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

# ECOSCORE GROUPS:

In [9]:
# Returns the list of ecoscore letters present in the file.
def extract_groups_ecoscore_from_jsonl(file_path):
    """Collect the `ecoscore_groups` value of every record in a JSONL file.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A list with one entry per record whose `ecoscore_groups` is not None,
        in file order. Lines that are not valid JSON are reported with a
        printed warning and skipped.
    """
    letters = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                letter = json.loads(raw_line).get('ecoscore_groups')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if letter is not None:
                letters.append(letter)
    return letters
ecoscore_groups_list = extract_groups_ecoscore_from_jsonl(jsonl_02)

# Frequency of each ecoscore letter among the records that have one.
ecoscore_groups_counts = Counter(ecoscore_groups_list)
total_ecoscore_groups = sum(ecoscore_groups_counts.values())

# Share of each letter, as rows for the treemap.
data_graph_ecoscore_groups = [
    {
        'ecoscore_group': ecoscore_group,
        'percentage': (count / total_ecoscore_groups) * 100,
    }
    for ecoscore_group, count in ecoscore_groups_counts.items()
]
for row in data_graph_ecoscore_groups:
    print(f"{row['ecoscore_group']}: {row['percentage']:.2f}%")


z: 74.62%
c: 5.59%
b: 9.01%
d: 6.06%
e: 3.51%
a: 1.21%


In [10]:
# Treemap of the share of each ecoscore letter, rendered on a dark background.
df = pd.DataFrame(data_graph_ecoscore_groups)
fig = px.treemap(
    df,
    path=['ecoscore_group'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore groups treemap',
).update_layout(
    paper_bgcolor='black',
    plot_bgcolor='black',
    font_color='white',
)
fig.show()

# LABELS:

In [11]:
# Returns the list of label notes present in the file (duplicates are only
# merged later, when the values are counted for display).
def extract_labels_from_jsonl(file_path):
    """Collect the `labels_note` value of every record in a JSONL file.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A list with one entry per record whose `labels_note` is not None,
        in file order. Lines that are not valid JSON are reported with a
        printed warning and skipped.
    """
    collected = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            note = parsed.get('labels_note')
            if note is None:
                continue
            collected.append(note)
    return collected

labels_list = extract_labels_from_jsonl(jsonl_02)

# Frequency of each distinct label note among the records that have one.
label_counts = Counter(labels_list)
total_labels = sum(label_counts.values())

# One treemap row per distinct label note, echoed to the output as it is built.
data_graph_labels = []
for label, count in label_counts.items():
    percentage = (count / total_labels) * 100
    data_graph_labels.append({'labels': label, 'percentage': percentage})
    print(f"{label}: {percentage:.2f}%")

0: 70.95%
2: 8.13%
1: 11.32%
3: 4.34%
5: 1.11%
4: 2.05%
7: 0.47%
6: 0.88%
9: 0.46%
8: 0.29%


In [12]:
# Treemap of the share of each label note, rendered on a dark background.
df = pd.DataFrame(data_graph_labels)
fig = px.treemap(
    data_frame=df,
    path=['labels'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='labels treemap',
)
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

# NUMBER OF NONE VALUES:

In [13]:
def count_none_and_total_values(jsonl_file_path):
    """Count, per key, how often it appears and how often its value is None.

    Lines that are not valid JSON are reported with a printed warning and
    skipped, like the other JSONL readers in this notebook, instead of
    aborting the whole scan.

    Args:
        jsonl_file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A `(none_counts, total_counts)` pair of `defaultdict(int)`:
        `total_counts[key]` is the number of records containing `key`, and
        `none_counts[key]` the number of those whose value is None. Keys that
        are never None do not appear in `none_counts`.
    """
    none_counts = defaultdict(int)
    total_counts = defaultdict(int)
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            for key, value in data.items():
                total_counts[key] += 1
                if value is None:
                    none_counts[key] += 1
    return none_counts, total_counts

def calculate_percentage(none_counts, total_counts):
    """Convert per-key None counts into percentages of each key's occurrences.

    NOTE: a later cell of this notebook defines another `calculate_percentage`
    taking `(count, total)` scalars; whichever definition ran last wins.

    Args:
        none_counts: Mapping of key -> number of None values for that key.
        total_counts: Mapping of key -> number of occurrences of that key.

    Returns:
        A dict with one entry per key of `none_counts`: the percentage of None
        values, or 0 when the key's total is not positive.
    """
    return {
        key: (none_counts[key] / total_counts[key]) * 100 if total_counts[key] > 0 else 0
        for key in none_counts
    }

# Scan the full file, then turn the per-key None counts into percentages.
# NOTE: `calculate_percentage` is redefined with a (count, total) signature in
# the ECOSCORE NULL section, so this cell must run before that one.
none_counts, total_counts = count_none_and_total_values(jsonl_02)
percentages = calculate_percentage(none_counts, total_counts)

In [14]:
# Treemap of the percentage of None values per key, on a dark background.
data_graph_none = [
    dict(key=key, percentage=percentage)
    for key, percentage in percentages.items()
]
df = pd.DataFrame(data_graph_none)
fig = px.treemap(
    df,
    path=['key'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='none treemap',
).update_layout(
    paper_bgcolor='black',
    plot_bgcolor='black',
    font_color='white',
)
fig.show()

# ECOSCORE NULL:

In [15]:
def count_specific_values(jsonl_file_path):
    """Count the 'z' ecoscore groups and 999 ecoscore notes in a JSONL file.

    These two values appear to be the placeholders for a missing ecoscore
    (the chart built from this result is titled "percentage empty data").

    The file is read as UTF-8, and lines that are not valid JSON are reported
    with a printed warning and skipped, like the other JSONL readers in this
    notebook.

    Args:
        jsonl_file_path: Path of the JSONL file to read.

    Returns:
        A `(counts, total_counts)` pair:
        `counts` is `{'ecoscore_groups': {'z': n}, 'ecoscore_note': {999: m}}`
        and `total_counts` maps each of the two keys to the number of records
        containing it.
    """
    counts = {
        'ecoscore_groups': {'z': 0},
        'ecoscore_note': {999: 0}
    }
    total_counts = {
        'ecoscore_groups': 0,
        'ecoscore_note': 0
    }
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            for key, value in data.items():
                if key not in counts:
                    continue
                total_counts[key] += 1
                if key == 'ecoscore_groups' and value == 'z':
                    counts[key]['z'] += 1
                # `== 999` also matches the float 999.0 produced by the JSON parser.
                if key == 'ecoscore_note' and value == 999:
                    counts[key][999] += 1
    return counts, total_counts

def calculate_percentage(count, total):
    """Return `count` as a percentage of `total`, or 0 when `total` is not positive.

    NOTE: this replaces the earlier `calculate_percentage(none_counts,
    total_counts)` defined in the NONE NUMBER section of this notebook.
    """
    return (count / total) * 100 if total > 0 else 0

# NOTE: `total_counts` is rebound here; it previously held the per-key totals
# of the NONE NUMBER section.
counts, total_counts = count_specific_values(jsonl_02)

# Raw occurrence counts of the two placeholder values.
number_999 = counts['ecoscore_note'][999]
counts_values = [counts['ecoscore_groups']['z'], number_999]

# The same counts expressed as a share of the records carrying each key.
z_percentage = calculate_percentage(counts_values[0], total_counts['ecoscore_groups'])
number_999_percentage = calculate_percentage(number_999, total_counts['ecoscore_note'])

# Parallel lists consumed by the bar chart.
labels = ['Ecoscore Groups (z)', 'Ecoscore Note (999)']
values = [z_percentage, number_999_percentage]

In [16]:
# Grouped bar chart: for each placeholder value, its percentage and raw count.
# NOTE(review): both series share one y axis, so percentages and raw counts
# are drawn on the same scale — confirm this is the intended reading.
percentage_bars = go.Bar(
    x=labels,
    y=values,
    name='Pourcentage',
    text=[f'{v:.2f}%' for v in values],
    textposition='auto',
)
count_bars = go.Bar(
    x=labels,
    y=counts_values,
    name='Nombre',
    text=[str(v) for v in counts_values],
    textposition='auto',
)
fig = go.Figure(data=[percentage_bars, count_bars])
fig.update_layout(
    title='percentage empty data ecoscore group and grad',
    xaxis_title='Catégories',
    yaxis_title='Valeurs',
    barmode='group',
)
fig.show()

# TOTAL NUMBER OF PRODUCTS:

In [17]:
# `total_labels` (computed in the LABELS section) is the number of records
# whose `labels_note` is not None.
# NOTE(review): it equals the product count only if every record carries that
# field — confirm.
print(f"total product number: {total_labels}")

total product number: 3177829


# JSONL SAMPLE:

In [20]:
# Load the sample JSONL file (one JSON record per line) and preview it.
# NOTE(review): read_json infers column dtypes; if `code` is stored as a
# string with leading zeros it would be converted to an integer and lose them
# — consider dtype={'code': str}; confirm against the file.
df = pd.read_json(jsonl_sample, lines=True)
df.head(60)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,Sugary snacks,,"Johnvince foods, sour watermelon slices",z,999,64777813331,united states,"sugar, added-sugar, disaccharide, glucose-syrup, monosaccharide, glucose, modified-corn-starch, starch, corn-starch, modified-starch, e330, e334, sodium-citrate, minerals, sodium, e270, natural-and-artificial-flavouring, flavouring, natural-flavouring, artificial-flavouring, e102, e110, e129","snacks, sweet-snacks, confectioneries",0
1,,,Beef With Teriyaki Sauce And Noodles,z,999,118894904172,united states,"flour-battered-beef, noodle, dough, teriyaki-sauce, sauce, pineapple-juice, fruit, juice, fruit-juice, pineapple, beef, animal, flour, canola-oil, oil-and-fat, vegetable-oil-and-fat, rapeseed-oil, wheat-flour, cereal, wheat, cereal-flour, water, canola, vegetable, root-vegetable, rapeseed, soya-oil, vegetable-oil, dried-epp-powder, egg-white, egg, salt, corn-starch, starch, fd-c-yellow-5-and-6, e211, e297, e300, vitamins, vitamin-e, vitamin-a, soy-sauce, sugar, added-sugar, disaccharide, ginger, condiment, spice, onion, onion-family-vegetable, potato-starch, garlic, black-pepper, seed, pepper, barley-malt-flour, barley, barley-flour, e375, reduced-iron, minerals, iron, thiamin-mononitrate, thiamin, e101, folic-acid, folate, vitamin-c, soya-bean, legume, pulse, soya, rice",,0
2,Sugary snacks,,"Ktm, Enjoy, Juicy Gummy Bears",b,69,679757140132,united states,"corn-syrup, added-sugar, disaccharide, e428, e420, pectic, e330, e296, sodium-citrate, minerals, sodium, medium-chain-triglycerides, e903, artificial-flavouring, flavouring, plum, fruit, prunus-species-fruit, e954, sucralose-caramel-color-aspartame-fd-c-red-40, e102, e133, beef, animal, coconut-oil, oil-and-fat, vegetable-oil-and-fat, vegetable-oil","snacks, sweet-snacks, confectioneries, candies, gummi-candies, gummy-bears",0
3,,,Emmentaler Switzerland Emmentaler Gotthelf,z,999,2107879007258,switzerland,"milk, dairy, salt",,1
4,Cereals and potatoes,"Plastique,Sachet",Muffins complets,b,71,3245412694516,france,"whole-wheat-flour, cereal, flour, wheat, cereal-flour, wholemeal-flour, wheat-flour, water, wheat-gluten, gluten, yeast, durum-wheat-semolina, durum-wheat, semolina, wheat-fiber, fiber, vegetable-fiber, sugar, added-sugar, disaccharide, salt, preservative, e282, e200","plant-based-foods-and-beverages, plant-based-foods, cereals-and-potatoes, breads, special-breads, english-muffins",5
5,Composite foods,"Frozen,Sachet plastique",Emincés de poireaux à la crème,b,79,3250390561019,france,"leek, vegetable, onion-family-vegetable, fresh-cream, dairy, cream, water, wheat-flour, cereal, flour, wheat, cereal-flour, butter, salt, poultry-broth, poultry, broth, modified-potato-starch, starch, modified-starch, potato-starch, pepper, seed, nutmeg, condiment, spice, dextrose, added-sugar, monosaccharide, glucose, poultry-fat, oil-and-fat, fat, animal-fat, flavouring, sugar, disaccharide, herbs-and-spices, herb, celery, stalk-vegetable","plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, vegetables-based-foods, meals, vegetables, vegetable-rods, leeks",0
6,,,Iglo Grillade Océane Côté Sud Colin D'alaska Sauge,z,999,3661405000162,france,,,6
7,Sugary snacks,,Chocolat bonnat,e,15,3700152400747,france,,"snacks, sweet-snacks, cocoa-and-its-products, chocolates, dark-chocolates",0
8,,,Sauce Samorai,z,999,3760257870913,france,,,0
9,,,Nescafe,z,999,4902201406147,far,,,0


In [24]:
def display_first_n_lines_to_dataframe(jsonl_02, num_lines):
    """Load the first `num_lines` records of a JSONL file into a DataFrame.

    Args:
        jsonl_02: Path of the JSONL file to read (opened as UTF-8). The
            parameter shadows the notebook-level variable of the same name.
        num_lines: Maximum number of leading lines to parse.

    Returns:
        A DataFrame with one row per parsed record; fewer rows than
        `num_lines` when the file is shorter.
    """
    records = []
    with open(jsonl_02, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if index >= num_lines:
                break
            records.append(json.loads(raw_line.strip()))
    return pd.DataFrame(records)

# Parse the first 60 records of the full file for a visual sanity check.
df_check = display_first_n_lines_to_dataframe(jsonl_02, num_lines=60)

In [25]:
# Display the loaded records (at most 60 rows).
df_check.head(60)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,Cereals and potatoes,,Tripolini,z,999.0,8002330096939,italy,,"plant-based-foods-and-beverages, plant-based-foods, cereals-and-potatoes, cereals-and-their-products, pastas, cereal-pastas, dry-pastas, durum-wheat-pasta",0
1,,,Pomodoro Cigliegino cat 1,z,999.0,8023772000797,italy,,,0
2,,,Poulet piquant,z,999.0,5400113549062,belgium,,,0
3,,,Chorizo doux des pyrénées,z,999.0,3067163641478,france,,,2
4,,,Coctail mediterráneo,z,999.0,8410909269293,spain,,,0
5,Milk and dairy products,,Red leicester,c,55.0,5010482618049,spain,,"dairies, fermented-foods, fermented-milk-products, cheeses, cheeses-from-the-united-kingdom, cheeses-from-england, red-leicester",1
6,,,LEAMO CITRONNADE,z,999.0,3700749305646,france,,,2
7,Sugary snacks,,Mini-oursons gélifiés,b,69.0,3700345904250,france,"sugar, added-sugar, disaccharide, corn-syrup, e428, acid, natural-lemon-flavouring, flavouring, natural-flavouring, natural-pear-flavouring, pear-flavouring, natural-orange-flavouring, natural-strawberry-flavouring, strawberry-flavouring, colour, orange, fruit, citrus-fruit, e330, pineapple, cherry, prunus-species-fruit, e122","snacks, sweet-snacks, confectioneries, candies, gummi-candies",1
8,Fat and sauces,,Ketchup Neîndulcit,b,74.0,7310240604375,far,"salt, natural-flavouring, flavouring, clove, condiment, spice","condiments, sauces, tomato-sauces, ketchup, tomato-ketchup",0
9,,,Mirtilli,z,999.0,8001300659532,italy,,,0
