In [1]:
import pandas as pd
import numpy as np
import json
import os
import warnings
from datetime import datetime 
from collections import OrderedDict
from collections import Counter
import plotly.express as px
from collections import defaultdict
import plotly.graph_objects as go


# Notebook display settings: show up to 100 rows and never truncate column text.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# Suppress pandas DtypeWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=pd.errors.DtypeWarning)

In [2]:
# Run parameters and input location
chunk_size = 1000
file_id = '01'
project_path = "/home/carolus/Documents/school/green_ia/"
jsonl_02 = f"{project_path}data/{file_id}_openfoodfacts_02.jsonl"

# Capture the current date/time once so both stamps below describe the same instant
current_date_time = datetime.now()
date_format = "%d/%m/%Y %H:%M:%S.%f"
# %f renders six microsecond digits; dropping the last three leaves milliseconds
start_date = current_date_time.strftime(date_format)[:-3]
# Compact, separator-free stamp (ddmmYYYYHHMMSS + milliseconds)
date_code = f"{current_date_time:%d%m%Y%H%M%S}{current_date_time.microsecond // 1000:03d}"

In [3]:
def add_logs(logData):
    """Write a log message to standard output.

    Args:
        logData: The message (any printable object) to emit.

    Returns:
        None.

    File logging is currently switched off. The disabled variant appended each
    message, followed by a newline, to
    ``f"{project_path}logs/03_analysis_{date_code}_logs.txt"``.
    """
    print(logData)

In [4]:
# Check that the JSONL file is structurally valid: every line must parse as JSON.
# The file is opened as UTF-8, like every other reader in this notebook.
invalid_line_count = 0
with open(jsonl_02, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            invalid_line_count += 1
            add_logs(f"ERROR decoding jsonl (line {line_number}): {e}")

# Only report the file as valid when no line failed to decode.
if invalid_line_count == 0:
    add_logs(f"jsonl format valid: {jsonl_02}")
else:
    add_logs(f"jsonl format invalid ({invalid_line_count} undecodable line(s)): {jsonl_02}")

jsonl format valid: /home/carolus/Documents/school/green_ia/data/01_openfoodfacts_02.jsonl


In [5]:
# Returns the list of country values present in the file
def extract_countries_from_jsonl(file_path):
    """Collect every non-null ``countries`` value from a JSON Lines file.

    Args:
        file_path: Path to the JSONL file, read as UTF-8.

    Returns:
        list: The ``countries`` value of each record that has one, in file
        order. Records whose value is missing or ``None`` are left out.

    Lines that cannot be decoded as JSON are reported with a printed warning
    and skipped.
    """
    found = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            value = parsed.get('countries')
            if value is None:
                continue
            found.append(value)
    return found
countries_list = extract_countries_from_jsonl(jsonl_02)

# A record may list several comma-separated countries: flatten them into one
# list of individual, whitespace-trimmed names.
separated_countries = [
    name.strip()
    for entry in countries_list
    for name in entry.split(',')
]

country_counts = Counter(separated_countries)
total_countries = sum(country_counts.values())

# Print each country's share of all country mentions
for country in country_counts:
    count = country_counts[country]
    percentage = (count / total_countries) * 100
    print(f"{country}: {percentage:.2f}%")

italy: 7.06%
germany: 7.84%
france: 30.16%
united states: 18.80%
irlande: 0.01%
united kingdom: 3.29%
canada: 2.56%
switzerland: 2.66%
spain: 8.99%
slovenia: 0.03%
netherlands: 0.59%
romania: 0.22%
australia: 1.31%
world: 2.02%
guadeloupe: 0.02%
dominican republic: 0.01%
bolivia: 0.10%
mexico: 0.23%
new zealand: 0.28%
sweden: 0.23%
antarctic: 0.00%
belgium: 2.39%
denmark: 0.10%
norway: 0.23%
poland: 0.39%
none: 0.46%
india: 0.20%
austria: 0.36%
francia: 0.00%
hungary: 0.10%
czech republic: 0.22%
thailand: 0.14%
egypt: 0.03%
palestinian territories: 0.01%
ireland: 1.74%
japan: 0.31%
réunion: 0.04%
croatia: 0.09%
taiwan: 0.01%
luxembourg: 0.28%
argentina: 0.19%
morocco: 0.22%
brazil: 0.49%
guatemala: 0.01%
saudi arabia: 0.14%
united arab emirates: 0.04%
iraq: 0.02%
kuwait: 0.01%
costa rica: 0.02%
moldova: 0.00%
martinique: 0.02%
new caledonia: 0.07%
singapore: 0.12%
french polynesia: 0.05%
panama: 0.02%
russia: 0.39%
finland: 0.20%
made in canada from domestic and imported ingredients.: 

In [6]:
# Returns the list of eco-score values present in the file
def extract_grad_ecoscore_from_jsonl(file_path):
    """Collect every non-null ``ecoscore_note`` value from a JSON Lines file.

    Args:
        file_path: Path to the JSONL file, read as UTF-8.

    Returns:
        list: The ``ecoscore_note`` value of each record that has one, in file
        order. Records whose value is missing or ``None`` are left out.

    Lines that cannot be decoded as JSON are reported with a printed warning
    and skipped.
    """
    scores = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                score = json.loads(raw_line).get('ecoscore_note')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
            else:
                if score is not None:
                    scores.append(score)
    return scores
ecoscore_grad_list = extract_grad_ecoscore_from_jsonl(jsonl_02)

# Tally each distinct score and its share of all collected scores
ecoscore_grad_counts = Counter(ecoscore_grad_list)
total_ecoscore_grads = sum(ecoscore_grad_counts.values())

# Print the share of every score and keep the rows for the treemap
data_graph_ecoscore_grad = []
for ecoscore_grad in ecoscore_grad_counts:
    count = ecoscore_grad_counts[ecoscore_grad]
    percentage = (count / total_ecoscore_grads) * 100
    print(f"{ecoscore_grad}: {percentage:.2f}%")
    row = {'ecoscore_grad': ecoscore_grad, 'percentage': percentage}
    data_graph_ecoscore_grad.append(row)

999.0: 74.61%
54.0: 0.35%
75.0: 0.48%
76.0: 0.38%
77.0: 0.51%
52.0: 0.17%
9.0: 0.14%
24.0: 0.15%
50.0: 0.43%
65.0: 0.16%
49.0: 0.65%
70.0: 0.29%
79.0: 1.84%
63.0: 0.22%
29.0: 0.21%
21.0: 0.50%
56.0: 0.17%
92.0: 0.04%
34.0: 0.92%
59.0: 0.21%
55.0: 0.23%
73.0: 0.22%
27.0: 0.45%
19.0: 0.32%
60.0: 0.38%
23.0: 0.17%
47.0: 0.21%
42.0: 0.46%
-4.0: 0.01%
41.0: 0.17%
67.0: 1.32%
71.0: 0.27%
74.0: 0.31%
30.0: 0.27%
15.0: 0.43%
58.0: 0.26%
61.0: 0.15%
39.0: 0.51%
91.0: 0.07%
53.0: 0.19%
44.0: 0.38%
14.0: 0.10%
72.0: 0.99%
68.0: 0.28%
12.0: 0.33%
18.0: 0.20%
5.0: 0.10%
51.0: 0.22%
-8.0: 0.02%
66.0: 0.22%
69.0: 0.18%
43.0: 0.39%
-9.0: 0.04%
-16.0: 0.03%
8.0: 0.10%
2.0: 0.12%
22.0: 0.26%
46.0: 0.13%
38.0: 0.31%
26.0: 0.13%
10.0: 0.04%
36.0: 0.36%
37.0: 0.39%
78.0: 0.36%
25.0: 0.34%
64.0: 0.24%
4.0: 0.19%
40.0: 0.20%
35.0: 0.31%
20.0: 0.11%
32.0: 0.20%
85.0: 0.08%
62.0: 0.22%
31.0: 0.16%
28.0: 0.19%
89.0: 0.04%
48.0: 0.21%
83.0: 0.06%
-1.0: 0.03%
16.0: 0.27%
90.0: 0.05%
57.0: 0.24%
45.0: 0.33%
112.0:

In [7]:
# Treemap of the eco-score distribution: one tile per score, sized and
# coloured by its percentage.
df = pd.DataFrame(data_graph_ecoscore_grad)
fig = px.treemap(
    df,
    path=['ecoscore_grad'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore grad treemap',
)
# Dark theme: black backgrounds with white text
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

In [8]:
# Returns the list of eco-score letters present in the file
def extract_groups_ecoscore_from_jsonl(file_path):
    """Collect every non-null ``ecoscore_groups`` value from a JSON Lines file.

    Args:
        file_path: Path to the JSONL file, read as UTF-8.

    Returns:
        list: The ``ecoscore_groups`` value of each record that has one, in
        file order. Records whose value is missing or ``None`` are left out.

    Lines that cannot be decoded as JSON are reported with a printed warning
    and skipped.
    """
    letters = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
                letter = parsed.get('ecoscore_groups')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if letter is not None:
                letters += [letter]
    return letters
ecoscore_groups_list = extract_groups_ecoscore_from_jsonl(jsonl_02)

# Tally each distinct letter and its share of all collected letters
ecoscore_groups_counts = Counter(ecoscore_groups_list)
total_ecoscore_groups = sum(ecoscore_groups_counts.values())

# Print the share of every letter and keep the rows for the treemap
data_graph_ecoscore_groups = []
for ecoscore_group in ecoscore_groups_counts:
    count = ecoscore_groups_counts[ecoscore_group]
    percentage = (count / total_ecoscore_groups) * 100
    print(f"{ecoscore_group}: {percentage:.2f}%")
    row = {'ecoscore_group': ecoscore_group, 'percentage': percentage}
    data_graph_ecoscore_groups.append(row)


z: 74.61%
c: 5.59%
b: 9.01%
e: 3.51%
d: 6.06%
a: 1.21%


In [9]:
# Treemap of the eco-score letter distribution: one tile per letter, sized and
# coloured by its percentage.
df = pd.DataFrame(data_graph_ecoscore_groups)
fig = px.treemap(
    df,
    path=['ecoscore_group'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='ecoscore groups treemap',
)
# Dark theme: black backgrounds with white text
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

In [10]:
# Returns the list of label values present in the file (duplicates are only
# collapsed when displayed)
def extract_labels_from_jsonl(file_path):
    """Collect every non-null ``labels_note`` value from a JSON Lines file.

    Args:
        file_path: Path to the JSONL file, read as UTF-8.

    Returns:
        list: The ``labels_note`` value of each record that has one, in file
        order, duplicates included. Only missing or ``None`` values are left
        out, so a value of ``0`` is kept.

    Lines that cannot be decoded as JSON are reported with a printed warning
    and skipped.
    """
    collected = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                note = json.loads(raw_line).get('labels_note')
            except json.JSONDecodeError:
                print(f"WARNING line: {raw_line} in {file_path}")
                continue
            if note is None:
                continue
            collected.append(note)
    return collected

labels_list = extract_labels_from_jsonl(jsonl_02)

# Tally each distinct label value and its share of all collected values
label_counts = Counter(labels_list)
total_labels = sum(label_counts.values())

# Print the share of every label value and keep the rows for the treemap
data_graph_labels = []
for label in label_counts:
    count = label_counts[label]
    percentage = (count / total_labels) * 100
    print(f"{label}: {percentage:.2f}%")
    row = {'labels': label, 'percentage': percentage}
    data_graph_labels.append(row)

0: 70.95%
6: 0.88%
1: 11.32%
5: 1.11%
2: 8.13%
3: 4.34%
4: 2.05%
8: 0.29%
7: 0.47%
10: 0.11%
9: 0.18%
11: 0.07%
12: 0.04%
14: 0.02%
13: 0.02%
19: 0.00%
15: 0.01%
17: 0.00%
16: 0.01%
25: 0.00%
24: 0.00%
28: 0.00%
23: 0.00%
18: 0.00%
21: 0.00%
20: 0.00%
29: 0.00%
22: 0.00%
27: 0.00%
26: 0.00%
36: 0.00%


In [11]:
# Treemap of the label value distribution: one tile per value, sized and
# coloured by its percentage.
df = pd.DataFrame(data_graph_labels)
fig = px.treemap(
    df,
    path=['labels'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='labels treemap',
)
# Dark theme: black backgrounds with white text
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

In [12]:
def count_none_and_total_values(jsonl_file_path):
    """Count, for every key, how many records carry it and how many hold ``None``.

    Args:
        jsonl_file_path: Path to a JSON Lines file, read as UTF-8. Each line is
            expected to decode to a JSON object.

    Returns:
        tuple: ``(none_counts, total_counts)``, two ``defaultdict(int)``
        instances keyed by field name. ``total_counts[key]`` is the number of
        records containing ``key``; ``none_counts[key]`` is how many of those
        have a ``None`` value. Keys that are never ``None`` do not appear in
        ``none_counts``.

    Lines that cannot be decoded as JSON are reported with a printed warning
    and skipped, matching the ``extract_*_from_jsonl`` readers, so a single
    malformed or blank line no longer aborts the whole count.
    """
    none_counts = defaultdict(int)
    total_counts = defaultdict(int)
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            for key, value in data.items():
                total_counts[key] += 1
                if value is None:
                    none_counts[key] += 1
    return none_counts, total_counts

def calculate_percentage(none_counts, total_counts):
    """Turn per-key ``None`` counts into percentages of the per-key totals.

    Args:
        none_counts: Mapping of key -> number of ``None`` values for that key.
        total_counts: Mapping of key -> number of occurrences of that key.
            It is indexed with every key of ``none_counts``.

    Returns:
        dict: One entry per key of ``none_counts``, holding
        ``none / total * 100``, or ``0`` when the total is not positive.

    NOTE: a later cell of this notebook defines another
    ``calculate_percentage(count, total)``; once that cell has run, it
    replaces this function in the kernel namespace.
    """
    result = {}
    for key, none_total in none_counts.items():
        key_total = total_counts[key]
        result[key] = (none_total / key_total) * 100 if key_total > 0 else 0
    return result

# Per-key percentage of None values over the whole JSONL file.
# This relies on the two-mapping calculate_percentage defined in this cell; a
# later cell redefines that name with a (count, total) signature.
none_counts, total_counts = count_none_and_total_values(jsonl_02)
percentages = calculate_percentage(none_counts, total_counts)

In [13]:
# Treemap of the share of None values per key: one tile per key, sized and
# coloured by its percentage.
data_graph_none = []
for key, percentage in percentages.items():
    data_graph_none.append({'key': key, 'percentage': percentage})

df = pd.DataFrame(data_graph_none)
fig = px.treemap(
    df,
    path=['key'],
    values='percentage',
    color='percentage',
    color_continuous_scale='Magma',
    title='none treemap',
)
# Dark theme: black backgrounds with white text
fig.update_layout(paper_bgcolor='black', plot_bgcolor='black', font_color='white')
fig.show()

In [14]:
def count_specific_values(jsonl_file_path):
    """Count the sentinel values ``'z'`` and ``999`` in the eco-score fields.

    Args:
        jsonl_file_path: Path to a JSON Lines file, read as UTF-8 (the same
            encoding the other readers in this notebook use). Each line is
            expected to decode to a JSON object.

    Returns:
        tuple: ``(counts, total_counts)`` where

        * ``counts`` is ``{'ecoscore_groups': {'z': n}, 'ecoscore_note': {999: m}}``,
          the number of records whose field equals the sentinel (``999.0``
          compares equal to ``999``);
        * ``total_counts`` is ``{'ecoscore_groups': t1, 'ecoscore_note': t2}``,
          the number of records in which each field is present at all.

    Lines that cannot be decoded as JSON are reported with a printed warning
    and skipped, matching the ``extract_*_from_jsonl`` readers.

    NOTE(review): 'z' and 999 look like placeholders for a missing eco-score
    (the chart built from these counts is titled "percentage empty data ...");
    confirm against the step that produces the file.
    """
    counts = {
        'ecoscore_groups': {'z': 0},
        'ecoscore_note': {999: 0}
    }
    total_counts = {
        'ecoscore_groups': 0,
        'ecoscore_note': 0
    }
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"WARNING line: {line} in {jsonl_file_path}")
                continue
            # Look up only the two tracked fields instead of scanning every key.
            for key, sentinel_counts in counts.items():
                if key not in data:
                    continue
                total_counts[key] += 1
                value = data[key]
                for sentinel in sentinel_counts:
                    if value == sentinel:
                        sentinel_counts[sentinel] += 1
    return counts, total_counts

def calculate_percentage(count, total):
    """Return ``count`` as a percentage of ``total``.

    Args:
        count: Numerator.
        total: Denominator.

    Returns:
        ``count / total * 100`` when ``total`` is greater than zero,
        otherwise ``0``.

    NOTE: this shares its name with the earlier
    ``calculate_percentage(none_counts, total_counts)`` and replaces it in the
    kernel namespace once this cell has run.
    """
    # Guard kept as "not total > 0" so every non-positive total yields 0.
    if not total > 0:
        return 0
    return (count / total) * 100

counts, total_counts = count_specific_values(jsonl_02)

# Raw number of records carrying each sentinel value
z_count = counts['ecoscore_groups']['z']
number_999 = counts['ecoscore_note'][999]

# Same figures as a percentage of the records that contain the field
z_percentage = calculate_percentage(z_count, total_counts['ecoscore_groups'])
number_999_percentage = calculate_percentage(number_999, total_counts['ecoscore_note'])

# Parallel lists consumed by the bar chart
labels = ['Ecoscore Groups (z)', 'Ecoscore Note (999)']
values = [z_percentage, number_999_percentage]
counts_values = [z_count, number_999]

In [15]:
# Grouped bar chart: for each category, one bar for the percentage and one for
# the raw count.
# NOTE(review): both series share a single y-axis, so if the counts are much
# larger than 100 the percentage bars will be hard to read. Consider a
# secondary axis.
percentage_bars = go.Bar(
    x=labels,
    y=values,
    name='Pourcentage',
    text=[f'{v:.2f}%' for v in values],
    textposition='auto'
)
count_bars = go.Bar(
    x=labels,
    y=counts_values,
    name='Nombre',
    text=[str(v) for v in counts_values],
    textposition='auto'
)

fig = go.Figure()
for bars in (percentage_bars, count_bars):
    fig.add_trace(bars)

fig.update_layout(
    title='percentage empty data ecoscore group and grad',
    xaxis_title='Catégories',
    yaxis_title='Valeurs',
    barmode='group'
)
fig.show()