In [1]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Let pandas render up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100
# Suppress pandas' DtypeWarning for the rest of the session.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [2]:
# Path of the JSONL file that the counting, sampling and loading steps read.
# NOTE(review): absolute, machine-specific path; the notebook will not run on
# another machine without editing it.
file_path = '/home/carolus/Documents/school/green_ia/data/07_data/07_train_01.jsonl'
# Number of rows requested from the random sampling step.
num_samples = 500

In [3]:
def count_objects_in_jsonl(file_path):
    """Return the number of lines in the UTF-8 text file at ``file_path``.

    Every line is counted, blank ones included; the content is never parsed.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Stream the file so it never has to fit in memory.
        return sum(1 for _ in handle)
# Report how many lines (one JSON object per line) the input file contains.
object_count = count_objects_in_jsonl(file_path)
print(f"nombre objets {file_path}: {object_count}")


nombre objets /home/carolus/Documents/school/green_ia/data/07_data/07_train_01.jsonl: 705325


In [4]:
def sample_jsonl_file(file_path, num_samples):
    """Draw a uniform random sample of records from a JSONL file.

    The file is streamed once using reservoir sampling, so it never has to be
    held in memory in full: the first ``num_samples`` records fill the
    reservoir, then the i-th record replaces a random slot with probability
    ``num_samples / i``.

    Args:
        file_path: Path of a UTF-8 encoded file with one JSON object per line.
        num_samples: Maximum number of records to keep.

    Returns:
        A pandas DataFrame with one row per sampled record. It has fewer than
        ``num_samples`` rows when the file holds fewer records.

    Raises:
        json.JSONDecodeError: If a sampled non-blank line is not valid JSON.
    """
    sample_lines = []

    # Decode explicitly as UTF-8 (as count_objects_in_jsonl does) instead of
    # relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        record_count = 0
        for line in file:
            # Blank lines (e.g. a trailing newline) are not records: skipping
            # them keeps the sampling probabilities right and avoids a
            # json.loads failure below.
            if not line.strip():
                continue
            record_count += 1
            if len(sample_lines) < num_samples:
                sample_lines.append(line)
            else:
                # randint is inclusive on both ends: idx is uniform over the
                # record_count records seen so far.
                idx = random.randint(0, record_count - 1)
                if idx < num_samples:
                    sample_lines[idx] = line

    return pd.DataFrame([json.loads(line) for line in sample_lines])

In [5]:
# Seed the module-level RNG that sample_jsonl_file draws from, so the sample
# (and every statistic computed from it) is identical on each re-run of this
# cell and on Restart & Run All.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)
df = sample_jsonl_file(file_path, num_samples)

In [6]:
# Percentage of rows whose 'ecoscore_tags' value equals the string 'empty'.
empty_mask = df['ecoscore_tags'] == 'empty'
nb_total = df.shape[0]
nb_empty = empty_mask.sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in ecoscore_tags : {pourcentage_empty:.2f}%")

percetange empty in ecoscore_tags : 0.40%


In [7]:
# Percentage of rows whose 'packaging' value equals the string 'empty'.
empty_mask = df['packaging'] == 'empty'
nb_total = df.shape[0]
nb_empty = empty_mask.sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in packaging : {pourcentage_empty:.2f}%")

percetange empty in packaging : 74.60%


In [8]:
# Percentage of rows whose 'categories' value equals the string 'empty'.
empty_mask = df['categories'] == 'empty'
nb_total = df.shape[0]
nb_empty = empty_mask.sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in categories : {pourcentage_empty:.2f}%")

percetange empty in categories : 0.40%


In [9]:
# Percentage of rows whose 'ingredients' value equals the string 'empty'.
empty_mask = df['ingredients'] == 'empty'
nb_total = df.shape[0]
nb_empty = empty_mask.sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in ingredients : {pourcentage_empty:.2f}%")

percetange empty in ingredients : 49.60%


In [10]:
# Display the last 50 rows of the sampled frame.
df.tail(n=50)

Unnamed: 0,groups,packaging,name,ecoscore_tags,ecoscore_score,countries,ingredients,categories,labels_note
450,0.666667,empty,original graved-lachs,c,47.0,60.0,empty,"seafood, fishes-and-their-products, fishes, fa...",0.222222
451,0.0,empty,honey,b,72.0,9.0,empty,"breakfasts, spreads, sweet-spreads, bee-produc...",0.0
452,0.444444,empty,хималайска сол,b,78.0,26.0,empty,"condiments, salts, himalaya-salts",0.0
453,0.111111,empty,curry sauce - colruyt,b,76.0,101.0,"water, colza-oil, oil-and-fat, vegetable-oil-a...","condiments, sauces, curry-sauces",0.333333
454,0.666667,empty,jambon serrano grande reserve 18 mois,e,3.0,60.0,"ham, animal, meat, pork, pork-meat, salt, gluc...","meats-and-their-products, prepared-meats, hams...",0.111111
455,0.333333,empty,sweet peas,e,12.0,186.0,"pea, vegetable, legume, pod-and-seed-vegetable","plant-based-foods-and-beverages, plant-based-f...",0.0
456,0.0,empty,honey,b,72.0,186.0,empty,"breakfasts, spreads, sweet-spreads, bee-produc...",0.0
457,0.555556,"plastique, sachet, carton",nesquik,e,19.0,60.0,"cereal, rice-flour, flour, rice, whole-wheat, ...","plant-based-foods-and-beverages, plant-based-f...",0.222222
458,0.555556,empty,none,b,67.0,3.0,empty,"plant-based-foods-and-beverages, plant-based-f...",0.0
459,0.666667,empty,saucisson sec aux olives noires,e,16.0,60.0,empty,"meats-and-their-products, meats, prepared-meat...",0.0


In [11]:
def load_jsonl_to_dataframe(file_path):
    """Load every record of a JSONL file into a pandas DataFrame.

    Args:
        file_path: Path of a UTF-8 encoded file with one JSON object per line.

    Returns:
        A DataFrame with one row per non-blank line of the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 (as count_objects_in_jsonl does) instead of
    # relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline) carry no record and would make
        # json.loads raise, so they are skipped.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

def create_treemap_for_column(df, column_name):
    """Show a plotly treemap of how often each value of a column occurs.

    Each distinct value of ``df[column_name]`` becomes one top-level tile,
    sized and labelled by its number of occurrences. If the column is absent,
    a warning is printed and nothing is drawn.

    Args:
        df: DataFrame to read from.
        column_name: Name of the column whose values are counted.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    if column_name in df.columns:
        # One row per distinct value: [value, number of occurrences].
        counts = df[column_name].value_counts().reset_index()
        counts.columns = [column_name, 'count']

        # An empty parent for every label makes each tile a root of the treemap.
        root_parents = [""] * len(counts)
        trace = go.Treemap(
            labels=counts[column_name],
            parents=root_parents,
            values=counts['count'],
            textinfo='label+value',
        )

        layout_options = {
            'title': f'treemap column: {column_name}',
            'paper_bgcolor': 'white',
            'plot_bgcolor': 'white',
            'font_color': 'black',
        }
        fig = go.Figure(trace)
        fig.update_layout(**layout_options)
        fig.show()
    else:
        print(f"warning, column: '{column_name}'does not exists")

In [12]:
# Load the whole JSONL file; this rebinds `df`, which held the random sample
# until now.
df = load_jsonl_to_dataframe(file_path)
# Columns for which a value-count treemap is drawn.
colonnes_a_travailler = [
    'ecoscore_tags',
    'ecoscore_score',
    'countries',
    'labels_note',
    'groups',
]
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)

In [13]:
# Path of the CSV file read into df_csv below.
# NOTE(review): absolute, machine-specific path; the notebook will not run on
# another machine without editing it.
file_path_csv = '/home/carolus/Documents/school/green_ia/scripts/validation_predictions.csv'
df_csv = pd.read_csv(file_path_csv)

In [14]:
# Display the first 60 rows of the predictions table.
df_csv.head(n=60)

Unnamed: 0,labels_note,countries,groups,true,predictions
0,0.444444,163.0,0.555556,77.0,11.471726
1,0.0,64.0,0.666667,19.0,31.03798
2,0.0,60.0,0.777778,4.0,1.276613
3,0.111111,60.0,0.666667,16.0,49.165802
4,0.0,186.0,0.444444,34.0,47.976227
5,0.0,186.0,0.777778,44.0,5.063185
6,0.111111,60.0,0.666667,12.0,41.708782
7,0.0,60.0,0.0,57.0,79.762695
8,0.222222,60.0,0.444444,51.0,65.22209
9,0.0,60.0,0.666667,6.0,36.240955


In [16]:
# Signed error per row (true minus predicted), stored as a new column.
# It is computed once; the original cell repeated this assignment needlessly.
df_csv['difference'] = df_csv['true'] - df_csv['predictions']
max_diff = df_csv['difference'].max()
min_diff = df_csv['difference'].min()
# Mean and median of the SIGNED error: positive and negative errors cancel out.
mean_diff = df_csv['difference'].mean()
median_diff = df_csv['difference'].median()
total_rows = df_csv.shape[0]
# Share of rows whose absolute error is strictly greater than 2.
# NOTE(review): the comparison uses the raw difference, not a relative error,
# although the message below says "2%" - confirm the intended unit.
count_greater_than_2 = (df_csv['difference'].abs() > 2).sum()
percentage = (count_greater_than_2 / total_rows) * 100

print(f"Écart maximum: {max_diff}")
print(f"Écart minimum: {min_diff}")
print(f"Écart moyen: {mean_diff}")
print(f"Écart médian: {median_diff}")
print(f"poucentage lignes écart supérieur à 2% : {percentage}%")

Écart maximum: 91.885288
Écart minimum: -70.89432
Écart moyen: 15.38256814178744
Écart médian: 15.323057
poucentage lignes écart supérieur à 2% : 96.73913043478261%
