In [4]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Allow pandas to render up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)
# Silence pandas' DtypeWarning for the whole session.
# NOTE(review): this warning usually signals mixed-type columns when reading
# CSV files; hiding it can mask a real data issue — confirm it is intended.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [5]:
# JSON Lines file read by the cells below.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this line.
file_path = '/home/carolus/Documents/school/green_ia/data/05_data/05_train_01.jsonl'
# Number of rows requested from sample_jsonl_file() further down.
num_samples = 500

In [6]:
def count_objects_in_jsonl(file_path):
    """Return the number of lines in the UTF-8 text file at ``file_path``.

    Every line is counted, blank ones included. For a JSON Lines file this
    equals the number of objects as long as each line holds exactly one.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Stream the file; only a running total is kept in memory.
        return sum(1 for _ in handle)
# Report the total line count of the input file (reads the whole file once).
print(f"nombre objets {file_path}: {count_objects_in_jsonl(file_path)}")


nombre objets /home/carolus/Documents/school/green_ia/data/05_data/05_train_01.jsonl: 804406


In [7]:
def sample_jsonl_file(file_path, num_samples, seed=None):
    """Draw a uniform random sample of records from a JSON Lines file.

    The file is streamed once while a reservoir of at most ``num_samples``
    raw lines is maintained, so memory use is bounded by the sample size
    rather than the file size. Only the retained lines are JSON-decoded.

    Args:
        file_path: Path of the JSON Lines file (one JSON document per line),
            read as UTF-8.
        num_samples: Maximum number of records to keep. When the file holds
            fewer non-blank lines, all of them are returned in file order.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the sample is reproducible. When ``None`` (default)
            the module-level ``random`` generator is used.

    Returns:
        pandas.DataFrame with one row per sampled record.

    Raises:
        json.JSONDecodeError: If a retained line is not valid JSON.
    """
    rng = random if seed is None else random.Random(seed)
    sample_lines = []

    # Explicit encoding: the data contains non-ASCII text and must not depend
    # on the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        line_count = 0
        for line in file:
            # Blank lines are not records; skipping them here keeps them out
            # of the reservoir (json.loads would fail on them) and out of the
            # count used for the replacement probability.
            if not line.strip():
                continue
            line_count += 1
            if len(sample_lines) < num_samples:
                # Fill phase: the first num_samples records are kept as-is.
                sample_lines.append(line)
            else:
                # Replacement phase: record number line_count replaces a
                # random slot with probability num_samples / line_count.
                idx = rng.randint(0, line_count - 1)
                if idx < num_samples:
                    sample_lines[idx] = line

    return pd.DataFrame([json.loads(line) for line in sample_lines])

In [8]:
# Draw the working sample used by the cells below.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# rows (and every percentage derived from them) change on each run.
df = sample_jsonl_file(file_path, num_samples)

In [9]:
# Share of sampled rows whose 'ecoscore_tags' value is the literal string 'empty'.
nb_empty = df['ecoscore_tags'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = 100 * (nb_empty / nb_total)
print(f"percetange empty in ecoscore_tags : {pourcentage_empty:.2f}%")

percetange empty in ecoscore_tags : 13.60%


In [10]:
# Share of sampled rows whose 'packaging' value is the literal string 'empty'.
nb_empty = df['packaging'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = 100 * (nb_empty / nb_total)
print(f"percetange empty in packaging : {pourcentage_empty:.2f}%")

percetange empty in packaging : 77.00%


In [11]:
# Share of sampled rows whose 'categories' value is the literal string 'empty'.
nb_empty = df['categories'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = 100 * (nb_empty / nb_total)
print(f"percetange empty in categories : {pourcentage_empty:.2f}%")

percetange empty in categories : 10.80%


In [12]:
# Share of sampled rows whose 'ingredients' value is the literal string 'empty'.
nb_empty = df['ingredients'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = 100 * (nb_empty / nb_total)
print(f"percetange empty in ingredients : {pourcentage_empty:.2f}%")

percetange empty in ingredients : 57.80%


In [13]:
# Eyeball the last 50 rows of the sample (bare expression: rendered as the cell output).
df.tail(50)

Unnamed: 0,groups,packaging,name,ecoscore_tags,ecoscore_score,countries,ingredients,categories,labels_note
450,0.555556,empty,riz basmati,e,0.0,32.0,empty,"plant-based-foods-and-beverages, plant-based-f...",0.0
451,0.111111,empty,salsa barbecue,b,67.0,82.0,"tomato-concentrate, vegetable, fruit-vegetable...","condiments, sauces, barbecue-sauces",0.111111
452,0.444444,bote de plastico,vitalinea sin azucar sabor manzana verde,b,79.0,111.0,"pasteurized-skimmed-milk, dairy, milk, pasteur...","dairies, fermented-foods, fermented-milk-produ...",0.0
453,0.444444,empty,cœur de liégeois cappuccino cœur chocolat,c,53.0,60.0,"skimmed-milk-powder, dairy, milk-powder","dairies, desserts, dairy-desserts, coffee-dess...",0.0
454,0.333333,empty,nams roterno,b,77.0,59.0,empty,"plant-based-foods-and-beverages, plant-based-f...",0.0
455,0.666667,conserve,gésiers de canard,e,15.0,60.0,empty,"meats-and-their-products, meats, fish-and-meat...",0.0
456,0.777778,empty,herbal tea,c,43.0,186.0,"water, sugar, added-sugar, disaccharide, meson...","plant-based-foods-and-beverages, beverages, ho...",0.0
457,0.222222,"plastic, cardboard, tray, opercule en plastiqu...",risotto poulet champignon,b,66.0,60.0,"water, poultry, chicken, chicken-meat, cultiva...","meats-and-their-products, frozen-foods, meals,...",0.222222
458,0.444444,empty,plant-bases falafel burger,b,74.0,168.0,empty,"meat-alternatives, meat-analogues, vegetarian-...",0.444444
459,0.666667,empty,aiguillettes de poulet,c,45.0,60.0,empty,"meats-and-their-products, meats, chicken-and-i...",0.222222


In [14]:
def load_jsonl_to_dataframe(file_path):
    """Load every record of a JSON Lines file into a DataFrame.

    Args:
        file_path: Path of the JSON Lines file (one JSON document per line),
            read as UTF-8.

    Returns:
        pandas.DataFrame with one row per non-blank line of the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding: the data contains non-ASCII text and must not depend
    # on the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing empty line) carry no record and would
        # make json.loads fail, so they are skipped.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

def create_treemap_for_column(df, column_name):
    """Show a flat Plotly treemap of the value frequencies of one column.

    Each distinct value of ``df[column_name]`` becomes one tile sized by its
    number of occurrences. When the column is absent, a warning is printed
    and nothing is drawn.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to summarise.

    Returns:
        None. The figure is rendered through ``fig.show()``.
    """
    if column_name in df.columns:
        counts = df[column_name].value_counts().reset_index()
        # Normalise the two column names regardless of what reset_index produced.
        counts.columns = [column_name, 'count']

        tiles = go.Treemap(
            labels=counts[column_name],
            # One empty parent per tile: every tile sits at the root level.
            parents=[""] * len(counts),
            values=counts['count'],
            textinfo='label+value',
        )
        fig = go.Figure(tiles)
        fig.update_layout(
            title=f'treemap column: {column_name}',
            paper_bgcolor='white',
            plot_bgcolor='white',
            font_color='black',
        )
        fig.show()
    else:
        print(f"warning, column: '{column_name}'does not exists")

In [15]:
# Columns for which a value-frequency treemap is drawn.
colonnes_a_travailler = ['ecoscore_tags', 'ecoscore_score', 'countries', 'labels_note', 'groups']  
# NOTE(review): this loads the complete file and rebinds `df`, replacing the
# random sample that earlier cells stored under the same name — re-running
# those cells afterwards would operate on different data. Consider a distinct name.
df = load_jsonl_to_dataframe(file_path)
# One figure per listed column; missing columns only print a warning.
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)

In [18]:
# CSV file loaded into df_csv; the cells below read its 'ecoscore_score' and
# 'predictions' columns.
# NOTE(review): absolute, machine-specific path — not portable.
file_path_csv = '/home/carolus/Documents/school/green_ia/data/05_data/05_validation_data.csv'
df_csv = pd.read_csv(file_path_csv)

In [19]:
# Eyeball the first 60 rows of the loaded CSV (bare expression: rendered as the cell output).
df_csv.head(60)

Unnamed: 0,groups,packaging,name,ecoscore_tags,ecoscore_score,countries,ingredients,categories,labels_note,predictions
0,0.0,"boîte, carton, surgelé, barquette, etui en car...",cheesecake au citron,b,62,60.0,"cream-cheese, dairy, cheese, sugar, added-suga...","snacks, desserts, sweet-snacks, frozen-foods, ...",0.333333,64.448891
1,0.444444,bocal en verre,nescafe gold decaf,e,0,17.0,empty,"plant-based-foods-and-beverages, beverages, pl...",0.0,-2.446124
2,0.777778,plastic,vegan drink vanille,c,52,64.0,"water, oat-base, sugar, added-sugar, disacchar...","plant-based-foods-and-beverages, beverages, pl...",0.444444,57.680958
3,0.111111,empty,aceite de oliva virgen extra,d,27,163.0,empty,"plant-based-foods-and-beverages, plant-based-f...",0.0,27.488888
4,0.555556,empty,"grissini croccanti, olio extravergine d'oliva",b,74,60.0,"extra-virgin-olive-oil, oil-and-fat, vegetable...","plant-based-foods-and-beverages, plant-based-f...",0.111111,73.921745
5,0.111111,"papier, plaquette",beurre doux 82%mg,b,64,60.0,"cream, dairy, ferment, milk","dairies, fats, spreads, spreadable-fats, anima...",0.666667,44.970097
6,0.666667,"plastique, barquette, film en plastique",l'aiguillette de poulet,c,49,60.0,"poultry, chicken, chicken-meat","meats-and-their-products, meats, chicken-and-i...",0.333333,48.32909
7,0.444444,empty,thé matcha,b,79,60.0,empty,"plant-based-foods-and-beverages, beverages, ho...",0.222222,80.835548
8,0.444444,busta,santa lucia mozzarella light tris,c,55,82.0,"semi-skimmed-milk, dairy, milk, salt, rennet, ...","dairies, fermented-foods, fermented-milk-produ...",0.111111,64.820564
9,0.111111,empty,passata di pomodoro extra raccolto a mano,b,72,82.0,empty,"condiments, sauces, tomato-sauces",0.0,73.577599


In [30]:
# Signed gap per row: column 'ecoscore_score' minus column 'predictions'.
# Computed once; the original cell assigned this same column a second time
# further down, which had no effect.
df_csv['difference'] = df_csv['ecoscore_score'] - df_csv['predictions']

# Summary statistics of the signed gap. Positive and negative gaps cancel out
# in the mean/median, so values near zero do not by themselves mean small errors.
max_diff = df_csv['difference'].max()
min_diff = df_csv['difference'].min()
mean_diff = df_csv['difference'].mean()
median_diff = df_csv['difference'].median()

# Share of rows whose absolute gap exceeds 2.
# NOTE(review): the threshold is an absolute value in the columns' own units,
# whereas the printed label below says "2%" — confirm which one is intended.
total_rows = df_csv.shape[0]
count_greater_than_2 = (df_csv['difference'].abs() > 2).sum()
percentage = (count_greater_than_2 / total_rows) * 100

print(f"Écart maximum: {max_diff}")
print(f"Écart minimum: {min_diff}")
print(f"Écart moyen: {mean_diff}")
print(f"Écart médian: {median_diff}")
print(f"poucentage lignes écart supérieur à 2% : {percentage}%")

Écart maximum: 29.152313232421875
Écart minimum: -56.77560806274414
Écart moyen: 0.0012149977439266852
Écart médian: 0.0021228790283203125
poucentage lignes écart supérieur à 2% : 50.96618357487923%
