In [41]:
import pandas as pd
import numpy as np
import json
import os
import warnings
from datetime import datetime 
from collections import OrderedDict
from collections import Counter
import plotly.express as px
from collections import defaultdict
import plotly.graph_objects as go


# Notebook display settings: show up to 100 rows and never truncate cell text.
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None

# Silence pandas' DtypeWarning for the whole session.
warnings.filterwarnings(action="ignore", category=pd.errors.DtypeWarning)

In [42]:
# --- Run configuration --------------------------------------------------------
chunk_size = 1000
file_id = '02'
# Project root. Defaults to the original location; set the GREEN_IA_PROJECT_PATH
# environment variable to run the notebook from another checkout.
# os.path.join(..., "") guarantees the trailing separator that the string
# concatenations below rely on.
project_path = os.path.join(
    os.environ.get("GREEN_IA_PROJECT_PATH", "/home/carolus/Documents/school/green_ia/"),
    "",
)
jsonl_02 = project_path + 'data/' + file_id + '_openfoodfacts_02.jsonl'
jsonl_sample = project_path + 'data/' + file_id + "_openfoodfacts_sample.jsonl"
# Capture the current date and time once so both stamps below agree.
current_date_time = datetime.now()
date_format = "%d/%m/%Y %H:%M:%S.%f"
# Human-readable start stamp; [:-3] trims the microseconds down to milliseconds.
start_date = current_date_time.strftime(date_format)[:-3]
# Compact, digits-only stamp (ddmmYYYYHHMMSS + milliseconds).
date_code = current_date_time.strftime('%d%m%Y%H%M%S') + f"{current_date_time.microsecond // 1000:03d}"

In [43]:
def add_logs(logData):
    """Emit one log entry on standard output.

    Args:
        logData: value to log; it is handed to ``print`` unchanged.

    Writing the entry to a log file is currently disabled (the code is kept
    below, commented out).
    """
    print(logData)
    # with open(f"{project_path}logs/03_analysis_{date_code}_logs.txt", "a") as logFile:
    #     logFile.write(f'{logData}\n')

In [44]:
# Check that every line of the JSONL file is a valid JSON document.
# Each malformed line is logged with its line number, and the "valid" message
# is only emitted when no line failed to parse.
invalid_line_count = 0
# Explicit UTF-8, like the other readers of this notebook, so the check does
# not depend on the platform's default encoding.
with open(jsonl_02, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            invalid_line_count += 1
            add_logs(f"ERROR decoding jsonl (line {line_number}): {e}")

if invalid_line_count == 0:
    add_logs(f"jsonl format valid: {jsonl_02}")
else:
    add_logs(f"ERROR jsonl format invalid: {invalid_line_count} malformed line(s) in {jsonl_02}")

jsonl format valid: /home/carolus/Documents/school/green_ia/data/02_openfoodfacts_02.jsonl


# COUNTRIES:

In [45]:
def extract_countries_from_jsonl(file_path):
    """Collect the ``countries`` value of every record in a JSONL file.

    Args:
        file_path: path of the JSONL file, read as UTF-8.

    Returns:
        list: the ``countries`` values that are not ``None``, in file order
        (duplicates are kept).

    Lines that are not valid JSON are reported on stdout and skipped.
    """
    found = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                value = json.loads(raw_line).get('countries')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if value is not None:
                found.append(value)
    return found
# Raw `countries` strings, one per record that has the field.
countries_list = extract_countries_from_jsonl(jsonl_02)

# A record may list several comma-separated countries: flatten them to one
# stripped name per mention.
separated_countries = [
    name.strip()
    for entry in countries_list
    for name in entry.split(',')
]

country_counts = Counter(separated_countries)
total_countries = sum(country_counts.values())

# Share of each country among all country mentions, printed and kept for the treemap.
data_graph_countries = []
for country in country_counts:
    percentage = (country_counts[country] / total_countries) * 100
    print(f"{country}: {percentage:.2f}%")
    data_graph_countries.append({'countries': country, 'percentage': percentage})

france: 32.92%
united states: 22.05%
None: 11.52%
world: 1.01%
bolivia: 0.05%
mexico: 0.20%
new zealand: 0.03%
belgium: 2.35%
danemark: 0.00%
norway: 0.04%
poland: 0.25%
austria: 0.03%
hungary: 0.02%
czech republic: 0.04%
thailand: 0.05%
egypt: 0.00%
japan: 0.03%
croatia: 0.01%
europe: 0.01%
luxembourg: 0.01%
argentina: 0.01%
morocco: 0.03%
spain: 6.66%
costa rica: 0.00%
moldavia: 0.00%
sweden: 0.07%
russia: 0.09%
netherlands: 0.56%
finland: 0.02%
united kingdom: 2.67%
saudi arabia: 0.01%
lebanon: 0.00%
philippines: 0.01%
malaysia: 0.00%
guyana: 0.00%
barbados: 0.00%
french polynesia: 0.01%
brazil: 0.08%
haiti: 0.00%
bahrain: 0.00%
colombia: 0.00%
oman: 0.00%
dominican republic: 0.00%
greece: 0.01%
kuwait: 0.00%
hong kong: 0.01%
puerto rico: 0.00%
united arab emirates: 0.00%
ireland: 0.38%
singapore: 0.02%
panama: 0.01%
germany: 7.67%
italy: 5.52%
angola: 0.00%
canada: 1.10%
taiwan: 0.00%
: 0.10%
burkina faso: 0.00%
vietnam: 0.00%
israel: 0.00%
china: 0.01%
bulgaria: 0.04%
ascension is

In [46]:
# Treemap of the share of each country among all country mentions.
df = pd.DataFrame(data_graph_countries)
fig = px.treemap(
    data_frame=df,
    path=['countries'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='countries treemap',
)
# Dark theme: black backgrounds, white text.
fig.update_layout({
    'paper_bgcolor': 'black',
    'plot_bgcolor': 'black',
    'font_color': 'white',
})
fig.show()

# ECOSCORE GRADE (numeric `ecoscore_note`):

In [47]:
def extract_grad_ecoscore_from_jsonl(file_path):
    """Collect the ``ecoscore_note`` value of every record in a JSONL file.

    Args:
        file_path: path of the JSONL file, read as UTF-8.

    Returns:
        list: the ``ecoscore_note`` values that are not ``None``, in file order
        (duplicates are kept).

    Lines that are not valid JSON are reported on stdout and skipped.
    """
    grades = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
            else:
                grade = parsed.get('ecoscore_note')
                if grade is not None:
                    grades.append(grade)
    return grades
# Every non-null `ecoscore_note` found in the dataset.
ecoscore_grad_list = extract_grad_ecoscore_from_jsonl(jsonl_02)

# Occurrences of each distinct value and their grand total.
ecoscore_grad_counts = Counter(ecoscore_grad_list)
total_ecoscore_grads = sum(ecoscore_grad_counts.values())

# Share of each value, printed and kept for the treemap.
data_graph_ecoscore_grad = []
for ecoscore_grad in ecoscore_grad_counts:
    percentage = (ecoscore_grad_counts[ecoscore_grad] / total_ecoscore_grads) * 100
    print(f"{ecoscore_grad}: {percentage:.2f}%")
    data_graph_ecoscore_grad.append({'ecoscore_grad': ecoscore_grad, 'percentage': percentage})

999.0: 42.69%
54.0: 0.80%
75.0: 1.07%
76.0: 0.85%
77.0: 1.15%
52.0: 0.38%
9.0: 0.32%
24.0: 0.34%
50.0: 0.96%
65.0: 0.37%
49.0: 1.47%
70.0: 0.66%
79.0: 4.16%
63.0: 0.49%
29.0: 0.48%
21.0: 1.13%
56.0: 0.38%
92.0: 0.09%
34.0: 2.07%
59.0: 0.48%
55.0: 0.52%
73.0: 0.49%
27.0: 1.02%
19.0: 0.72%
60.0: 0.85%
23.0: 0.38%
47.0: 0.47%
42.0: 1.04%
0.0: 1.39%
41.0: 0.38%
67.0: 2.97%
71.0: 0.62%
74.0: 0.71%
30.0: 0.62%
15.0: 0.97%
58.0: 0.58%
61.0: 0.33%
39.0: 1.15%
91.0: 0.15%
53.0: 0.43%
44.0: 0.86%
14.0: 0.23%
72.0: 2.24%
68.0: 0.64%
12.0: 0.75%
18.0: 0.46%
5.0: 0.22%
51.0: 0.49%
66.0: 0.49%
69.0: 0.40%
43.0: 0.87%
8.0: 0.22%
2.0: 0.27%
22.0: 0.59%
46.0: 0.29%
38.0: 0.70%
26.0: 0.29%
10.0: 0.10%
36.0: 0.81%
37.0: 0.88%
78.0: 0.81%
25.0: 0.76%
64.0: 0.54%
4.0: 0.43%
40.0: 0.46%
35.0: 0.70%
20.0: 0.26%
32.0: 0.46%
85.0: 0.18%
62.0: 0.50%
31.0: 0.36%
28.0: 0.43%
89.0: 0.09%
48.0: 0.48%
83.0: 0.15%
16.0: 0.61%
90.0: 0.12%
57.0: 0.55%
45.0: 0.74%
100.0: 0.70%
33.0: 0.28%
82.0: 0.13%
13.0: 0.17%
81.0: 0

In [48]:
# Treemap of the share of each `ecoscore_note` value.
df = pd.DataFrame(data_graph_ecoscore_grad)
fig = px.treemap(
    data_frame=df,
    path=['ecoscore_grad'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore grad treemap',
)
# Dark theme: black backgrounds, white text.
fig.update_layout({
    'paper_bgcolor': 'black',
    'plot_bgcolor': 'black',
    'font_color': 'white',
})
fig.show()

# ECOSCORE GROUPS:

In [49]:
def extract_groups_ecoscore_from_jsonl(file_path):
    """Collect the ``ecoscore_groups`` value of every record in a JSONL file.

    Args:
        file_path: path of the JSONL file, read as UTF-8.

    Returns:
        list: the ``ecoscore_groups`` values that are not ``None``, in file
        order (duplicates are kept).

    Lines that are not valid JSON are reported on stdout and skipped.
    """
    groups = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            group = parsed.get('ecoscore_groups')
            if group is None:
                continue
            groups.append(group)
    return groups
# Every non-null `ecoscore_groups` value found in the dataset.
ecoscore_groups_list = extract_groups_ecoscore_from_jsonl(jsonl_02)

# Occurrences of each distinct group and their grand total.
ecoscore_groups_counts = Counter(ecoscore_groups_list)
total_ecoscore_groups = sum(ecoscore_groups_counts.values())

# Share of each group, printed and kept for the treemap.
data_graph_ecoscore_groups = []
for ecoscore_group in ecoscore_groups_counts:
    percentage = (ecoscore_groups_counts[ecoscore_group] / total_ecoscore_groups) * 100
    print(f"{ecoscore_group}: {percentage:.2f}%")
    data_graph_ecoscore_groups.append({'ecoscore_group': ecoscore_group, 'percentage': percentage})


z: 42.69%
c: 12.63%
b: 20.34%
e: 7.92%
d: 13.69%
a: 2.74%


In [50]:
# Treemap of the share of each `ecoscore_groups` value.
df = pd.DataFrame(data_graph_ecoscore_groups)
fig = px.treemap(
    data_frame=df,
    path=['ecoscore_group'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore groups treemap',
)
# Dark theme: black backgrounds, white text.
fig.update_layout({
    'paper_bgcolor': 'black',
    'plot_bgcolor': 'black',
    'font_color': 'white',
})
fig.show()

# LABELS:

In [51]:
def extract_labels_from_jsonl(file_path):
    """Collect the ``labels_note`` value of every record in a JSONL file.

    Args:
        file_path: path of the JSONL file, read as UTF-8.

    Returns:
        list: the ``labels_note`` values that are not ``None``, in file order.
        Duplicates are kept here; the returned list is not de-duplicated.

    Lines that are not valid JSON are reported on stdout and skipped.
    """
    collected = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                note = json.loads(raw_line).get('labels_note')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
            else:
                if note is not None:
                    collected.append(note)
    return collected

# Every non-null `labels_note` found in the dataset.
labels_list = extract_labels_from_jsonl(jsonl_02)

# Occurrences of each distinct value and their grand total.
label_counts = Counter(labels_list)
total_labels = sum(label_counts.values())

# Share of each value, printed and kept for the treemap.
data_graph_labels = []
for label in label_counts:
    percentage = (label_counts[label] / total_labels) * 100
    print(f"{label}: {percentage:.2f}%")
    data_graph_labels.append({'labels': label, 'percentage': percentage})

0: 61.92%
1: 13.84%
2: 9.63%
3: 5.73%
5: 1.85%
6: 1.46%
4: 3.24%
8: 0.55%
7: 0.85%
9: 0.94%


In [52]:
# Treemap of the share of each `labels_note` value.
df = pd.DataFrame(data_graph_labels)
fig = px.treemap(
    data_frame=df,
    path=['labels'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='labels treemap',
)
# Dark theme: black backgrounds, white text.
fig.update_layout({
    'paper_bgcolor': 'black',
    'plot_bgcolor': 'black',
    'font_color': 'white',
})
fig.show()

# NUMBER OF NONE VALUES PER FIELD:

In [53]:
def count_none_and_total_values(jsonl_file_path):
    """Count, for every key of a JSONL file, its occurrences and its ``None`` values.

    Args:
        jsonl_file_path: path of the JSONL file, read as UTF-8; each line is
            expected to hold one JSON object.

    Returns:
        tuple: ``(none_counts, total_counts)``, two ``defaultdict(int)`` keyed
        by field name. ``total_counts[key]`` is the number of records that
        contain ``key``; ``none_counts[key]`` is how many of those hold
        ``None``. A key that is never ``None`` has no entry in ``none_counts``.

    Lines that are not valid JSON are reported on stdout and skipped (the same
    policy as the extract_* readers of this notebook) instead of aborting the
    whole scan on the first malformed line.
    """
    none_counts = defaultdict(int)
    total_counts = defaultdict(int)
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            for key, value in data.items():
                total_counts[key] += 1
                if value is None:
                    none_counts[key] += 1
    return none_counts, total_counts

def calculate_percentage(none_counts, total_counts):
    """Turn per-key ``None`` counts into percentages of each key's total.

    Args:
        none_counts: mapping of key -> number of ``None`` values.
        total_counts: mapping of key -> number of occurrences; it is indexed
            with every key of ``none_counts``.

    Returns:
        dict: one entry per key of ``none_counts`` holding
        ``none / total * 100``, or ``0`` when the total is not positive.

    NOTE: a later cell of this notebook defines another ``calculate_percentage``
    taking ``(count, total)``; once that cell has run, this version is shadowed.
    """
    return {
        key: (none_counts[key] / total_counts[key]) * 100 if total_counts[key] > 0 else 0
        for key in none_counts
    }

# Scan the dataset once, then express each field's None count as a percentage.
none_counts, total_counts = count_none_and_total_values(jsonl_02)
percentages = calculate_percentage(none_counts=none_counts, total_counts=total_counts)

In [54]:
# One row per field: its name and the percentage of None values it holds.
data_graph_none = []
for key, percentage in percentages.items():
    data_graph_none.append({'key': key, 'percentage': percentage})

# Treemap of the None percentage of each field.
df = pd.DataFrame(data_graph_none)
fig = px.treemap(
    data_frame=df,
    path=['key'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='none treemap',
)
# Dark theme: black backgrounds, white text.
fig.update_layout({
    'paper_bgcolor': 'black',
    'plot_bgcolor': 'black',
    'font_color': 'white',
})
fig.show()

# MISSING ECOSCORE (placeholder values `z` and `999`):

In [55]:
def count_specific_values(jsonl_file_path):
    """Count the ``'z'`` ecoscore groups and ``999`` ecoscore notes in a JSONL file.

    Args:
        jsonl_file_path: path of the JSONL file, read as UTF-8 (like the other
            readers of this notebook, rather than the platform default
            encoding); each line is expected to hold one JSON object.

    Returns:
        tuple: ``(counts, total_counts)`` where

        * ``counts`` is ``{'ecoscore_groups': {'z': n}, 'ecoscore_note': {999: m}}``,
          the number of records whose field equals the watched value
          (``999.0`` compares equal to ``999`` and is counted);
        * ``total_counts`` is ``{'ecoscore_groups': t1, 'ecoscore_note': t2}``,
          the number of records that contain each field.

    Lines that are not valid JSON are reported on stdout and skipped, the same
    policy as the extract_* readers of this notebook.
    """
    # Value watched for each field. NOTE(review): these look like the
    # placeholders for a missing ecoscore (section "ECOSCORE NULL") — confirm.
    watched = {'ecoscore_groups': 'z', 'ecoscore_note': 999}
    counts = {key: {value: 0} for key, value in watched.items()}
    total_counts = {key: 0 for key in watched}
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            # Direct lookups instead of walking every key of every record.
            for key, value in watched.items():
                if key in data:
                    total_counts[key] += 1
                    if data[key] == value:
                        counts[key][value] += 1
    return counts, total_counts

def calculate_percentage(count, total):
    """Return ``count`` as a percentage of ``total``.

    Args:
        count: numerator.
        total: denominator.

    Returns:
        ``count / total * 100``, or ``0`` when ``total`` is not positive.

    NOTE: this redefinition replaces the earlier two-mapping
    ``calculate_percentage(none_counts, total_counts)`` of this notebook.
    """
    if not total > 0:
        return 0
    return (count / total) * 100

# Occurrences of the 'z' group and the 999 note, with per-field record totals.
counts, total_counts = count_specific_values(jsonl_02)

number_999 = counts['ecoscore_note'][999]
z_percentage = calculate_percentage(
    count=counts['ecoscore_groups']['z'],
    total=total_counts['ecoscore_groups'],
)
number_999_percentage = calculate_percentage(
    count=number_999,
    total=total_counts['ecoscore_note'],
)

# Series consumed by the bar chart: category names, percentages, raw counts.
labels = ['Ecoscore Groups (z)', 'Ecoscore Note (999)']
values = [z_percentage, number_999_percentage]
counts_values = [counts['ecoscore_groups']['z'], number_999]

In [56]:
# Grouped bar chart: for each ecoscore field, the percentage and the raw number
# of records holding the watched value.
# NOTE(review): both traces share a single y-axis although percentages and raw
# counts are on different scales — confirm this is intended.
fig = go.Figure(data=[
    go.Bar(
        x=labels,
        y=values,
        name='Pourcentage',
        text=[f'{v:.2f}%' for v in values],
        textposition='auto',
    ),
    go.Bar(
        x=labels,
        y=counts_values,
        name='Nombre',
        text=[str(v) for v in counts_values],
        textposition='auto',
    ),
])
fig.update_layout({
    'title': 'percentage empty data ecoscore group and grad',
    'xaxis_title': 'Catégories',
    'yaxis_title': 'Valeurs',
    'barmode': 'group',
})
fig.show()

# TOTAL NUMBER OF PRODUCTS:

In [57]:
# NOTE(review): total_labels is the number of records whose `labels_note` is not
# None; it equals the product count only if every record carries that field.
print("total product number: {}".format(total_labels))

total product number: 1407516


# JSONL SAMPLE:

In [58]:
# Load the sample JSONL file (one JSON document per line) and preview up to 60 rows.
df = pd.read_json(path_or_buf=jsonl_sample, lines=True)
df.head(n=60)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,cereals and potatoes,none,linguine,z,999,26662523022,united states,"durum-wheat-semolina, cereal, wheat, durum-wheat, semolina, niacin-ferrous-lactate, thiamin-mononitrate, thiamin, e101, folic-acid, folate, iron, minerals, vitamin-b12","plant-based-foods-and-beverages, plant-based-foods, cereals-and-potatoes, cereals-and-their-products, pastas",0
1,salty snacks,mixed plastic-packet,cheesy garlic slice,z,999,200011,france,"wheat-flour, cereal, flour, wheat, cereal-flour, e170i, e170, iron, minerals, e375, thiamin-water-mature-cheddar-cheese, unsalted-butter, dairy, butter, rapeseed-oil-garlic-puree-yeast-wheat-fibre, lemon-juice-salt-parsley, flour-treatment-agent, e300","snacks, salty-snacks, appetizers, garlic-breads",0
2,composite foods,none,"fully cooked oven roasted boneless skinless chicken breast strips with rib meat, oven roasted",z,999,36800397484,united states,"chicken-breast-with-rib-meat, chicken-broth, poultry, chicken, broth, poultry-broth, less-than-2-of, modified-starch, starch, e326, dextrose, added-sugar, monosaccharide, glucose, e339, natural-flavouring, flavouring, yeast-extract, yeast, e262ii, e262, e330, lauric-arginate, e250, salt",meals,0
3,cereals and potatoes,none,organic white arborio rice,e,12,73416040908,"united states, world",white-arborio-rice,"plant-based-foods-and-beverages, plant-based-foods, cereals-and-potatoes, seeds, cereals-and-their-products, cereal-grains, rices, japonica-rices, rices-for-risotto, short-grain-rices, arborio-rices",3
4,cereals and potatoes,none,dark red kidney beans,b,72,99482455804,united states,"dark-red-kidney-beans, water","plant-based-foods-and-beverages, plant-based-foods, legumes-and-their-products, canned-foods, legumes, seeds, canned-plant-based-foods, legume-seeds, pulses, common-beans, canned-legumes, red-beans, canned-common-beans",4
5,beverages,none,vapor distilled water with electrolytes,z,999,711535509103,united states,"vapor-distilled-water, electrolytes, natural-blueberry-and-blackberry-flavors, calcium, minerals, potassium, magnesium","beverages, waters",0
6,milk and dairy products,none,milk shake schoko,z,999,20171117,france,,"beverages, dairies, dairy-drinks, flavoured-milks",2
7,fruits and vegetables,bocal en verre,mogette de vendee,z,999,3250391787012,france,"vegetable, legume, pulse, white-beans, water, sea-salt, salt","plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, legumes-and-their-products, legumes, seeds, vegetables-based-foods, legume-seeds, vegetables, pulses, common-beans, white-beans, french-vegetables, white-kidney-beans",3
8,beverages,"plastique, pet - polytéréphtalate d'éthylène, bouteille en plastique",eau de source des montagnes d'auvergne,z,999,3256223510049,france,"magnesium, minerals, sodium, potassium, sulfates, e551, calcium","beverages, waters, spring-waters, mineral-waters, unsweetened-beverages",1
9,milk and dairy products,none,yaourt sur lit d'abricot,b,63,3263858079810,france,"whole-milk, dairy, milk, apricot, fruit, prunus-species-fruit, sugar, added-sugar, disaccharide, glucose-fructose-syrup, monosaccharide, fructose, glucose, milk-proteins, protein, animal-protein, cream, thickener, e440a, flavouring, fruit-preservative, preservative, lactic-ferments, ferment, microbial-culture, modified-tapioca-starch, starch, modified-starch, tapioca, e202, sodium-citrate, minerals, sodium","dairies, fermented-foods, fermented-milk-products, desserts, dairy-desserts, fermented-dairy-desserts, fermented-dairy-desserts-with-fruits, yogurts, fruit-yogurts, apricot-yogurts",0


In [59]:
def display_first_n_lines_to_dataframe(jsonl_02, num_lines):
    """Load the first ``num_lines`` records of a JSONL file into a DataFrame.

    Args:
        jsonl_02: path of the JSONL file, read as UTF-8. This parameter shadows
            the notebook-level variable of the same name.
        num_lines: maximum number of leading lines to parse.

    Returns:
        pandas.DataFrame: one row per parsed line, in file order.

    Raises:
        json.JSONDecodeError: if one of the parsed lines is not valid JSON.
    """
    records = []
    with open(jsonl_02, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            # Stop as soon as the requested number of lines has been parsed.
            if not index < num_lines:
                break
            records.append(json.loads(raw_line.strip()))
    return pd.DataFrame(records)

# Parse the first 60 records of the full dataset for a side-by-side check.
df_check = display_first_n_lines_to_dataframe(jsonl_02, 60)

In [60]:
# Preview up to 60 rows of the records parsed from the full dataset.
df_check.head(n=60)

Unnamed: 0,groups,packaging,name,ecoscore_groups,ecoscore_note,code,countries,ingredients,categories,labels_note
0,sugary snacks,"30g bottle, slim bottle",chocolat au lait,z,999.0,0,"france, united states","mode-d-utilisation-utilisez-la-ut-assaisonner-les-potes-les-pizzas-et-lasagnes, prete-o-l-emploi, 182, alba, www-cascinasancassiano-com, pourioogde-produit-dtmurts-leg-prunes-lg-0-3b-4-22-cascina-san-cassiano-c-so-piave","snacks, sweet-snacks, confectioneries, chewing-gum, sugar-free-chewing-gum",0
1,fat and sauces,none,moutarde au moût de raisin,c,54.0,100,france,,"condiments, sauces, mustards, groceries",0
2,composite foods,none,salade de carottes râpées,b,75.0,949,france,,"plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, vegetables-based-foods, meals, vegetables, prepared-vegetables, carrots, carrot-salads, grated-carrots, seasoned-grated-carrots",0
3,composite foods,"plastique, barquette",tarte noix de coco,z,999.0,1281,france,,"sweet-pies, pies, coconut-pies",0
4,fruits and vegetables,none,compote de poire,b,76.0,1885,france,,"plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, desserts, fruits-based-foods, compotes, pear-compotes",0
5,composite foods,none,salade de macedoine de légumes,b,77.0,2257,france,,"plant-based-foods-and-beverages, plant-based-foods, snacks, salty-snacks, appetizers, meals, prepared-vegetables, cold-starters, vegetables-macedoines",0
6,milk and dairy products,none,abondance,c,52.0,5005,france,,"dairies, fermented-foods, fermented-milk-products, cheeses, french-cheeses, abondance",0
7,cereals and potatoes,none,baguette bressan,b,77.0,5470,france,"wheat-flour, cereal, flour, wheat, cereal-flour, water, salt, yeast, gluten, malted-wheat-flour, deactivated-yeast, e300, filling, acid, modified-tapioca-starch, starch, modified-starch, tapioca, flavouring, colour, bone, chicken-fillet, poultry, chicken, chicken-meat, e326, e262i, e262","plant-based-foods-and-beverages, plant-based-foods, cereals-and-potatoes, breads, baguettes",0
8,fish meat eggs,none,pavé de saumon fumé à la ficelle,e,9.0,5661,france,,"seafood, fishes-and-their-products, fishes, fatty-fishes, salmons, smoked-fishes, smoked-salmons, salmon-steaks",0
9,fish meat eggs,none,corned beef,z,999.0,7160,france,,"canned-foods, meats-and-their-products, meals, meats, meals-with-meat, beef-dishes, canned-meats, corned-beef",0
