In [73]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 1000)
# Silence pandas DtypeWarning messages for the whole session.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [55]:
# JSON Lines file analysed by the cells below.
# NOTE(review): absolute local path — the notebook only runs on a machine with this layout.
file_path = '/home/carolus/Documents/school/green_ia/data/11_data/11_valid_01.jsonl'
# Alternative input file, kept commented out for quick switching:
#file_path = '/home/carolus/Documents/school/green_ia/data/01_data/01_openfoodfacts_04.jsonl'
# Maximum number of rows drawn when the file is randomly sampled.
num_samples = 500

In [56]:
def count_objects_in_jsonl(file_path):
    """Count the JSON objects stored in a JSON Lines file.

    The file is streamed line by line, so it is never loaded in memory as a
    whole. Lines are not parsed: every line that contains something other
    than whitespace is counted as one object.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        int: Number of non-blank lines in the file (0 for an empty file).

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Whitespace-only lines hold no object; counting them would make the
        # total disagree with the number of records that can be parsed.
        return sum(1 for line in file if line.strip())
# Report how many objects the configured input file contains.
print(f"nombre objets {file_path}: {count_objects_in_jsonl(file_path)}")


nombre objets /home/carolus/Documents/school/green_ia/data/11_data/11_valid_01.jsonl: 828


In [57]:
def sample_jsonl_file(file_path, num_samples, seed=None):
    """Draw a uniform random sample of records from a JSON Lines file.

    The file is read in a single pass with reservoir sampling: the first
    ``num_samples`` records fill the reservoir, then the i-th record replaces
    a random slot with probability ``num_samples / i``. Only the sampled
    lines are parsed as JSON.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.
        num_samples: Maximum number of records to keep. When the file holds
            fewer records, all of them are returned in file order.
        seed: Optional seed. When given, a private generator is used so the
            sample is reproducible and the global ``random`` state is left
            untouched. When ``None`` the global ``random`` module is used.

    Returns:
        pandas.DataFrame: One row per sampled record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a sampled line is not valid JSON.
    """
    rng = random.Random(seed) if seed is not None else random
    sample_lines = []

    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        line_count = 0
        for line in file:
            # Blank lines carry no record; skip them before counting so they
            # neither bias the sampling nor break json.loads below.
            if not line.strip():
                continue
            line_count += 1
            if len(sample_lines) < num_samples:
                sample_lines.append(line)
            else:
                # randint is inclusive on both ends: idx is uniform over the
                # line_count records seen so far.
                idx = rng.randint(0, line_count - 1)
                if idx < num_samples:
                    sample_lines[idx] = line

    return pd.DataFrame([json.loads(line) for line in sample_lines])

In [58]:
# Seed the global generator so the random sample, and every figure derived
# from it, is identical after Restart Kernel -> Run All.
random.seed(42)
df = sample_jsonl_file(file_path, num_samples)

In [59]:
# Share of rows whose ecoscore_score equals 49.5.
# NOTE(review): 49.5 is presumably the median used to fill empty scores.
# TODO: replace the hard-coded value with the computed median.
nb_total = df.shape[0]
nb_empty = df['ecoscore_score'].eq(49.5).sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in ecoscore_score, replaced by median : {pourcentage_empty:.2f}%")

percetange empty in ecoscore_score, replaced by median : 0.00%


In [77]:
# Display the last 60 rows of the current frame.
df.tail(60)

Unnamed: 0,groups,ecoscore_tags,ecoscore_score,countries,categories,labels_note,text_data
768,4.0,1,67,60.0,"dairies, fermented-foods, fermented-milk-produ...",0.0,"empty naturéa semi-skimmed-milk, dairy, milk, ..."
769,4.0,1,67,82.0,"desserts, frozen-foods, frozen-desserts, ice-c...",0.0,empty mini coni al cacao e panna empty
770,4.0,1,79,60.0,"desserts, frozen-foods, frozen-desserts, ice-c...",0.333333,"pot, film, green dot, pot plastique, tidyman w..."
771,3.0,1,79,17.0,"plant-based-foods-and-beverages, plant-based-f...",0.0,empty carottes duo empty
772,5.0,1,79,60.0,"plant-based-foods-and-beverages, plant-based-f...",0.555556,empty tofu lactofermenté nature filtered-water...
773,4.0,3,34,186.0,"dairies, fermented-foods, fermented-milk-produ...",0.0,"empty cheesy bliss shreds filtered-water, wate..."
774,7.0,2,47,60.0,"beverages, alcoholic-beverages, wines, wines-f...",0.111111,empty crozes-hermitage empty
775,7.0,0,100,60.0,"plant-based-foods-and-beverages, beverages, pl...",0.777778,"brique, carton, tetra brik boisson soja miel s..."
776,5.0,1,73,60.0,"plant-based-foods-and-beverages, plant-based-f...",0.111111,"plastique, sachet, sous atmosphère protectrice..."
777,5.0,1,62,60.0,"plant-based-foods-and-beverages, plant-based-f...",0.111111,"plastique, pot, gobelet nouilles cuisinées cup..."


In [61]:
def load_jsonl_to_dataframe(file_path):
    """Load every record of a JSON Lines file into a DataFrame.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        pandas.DataFrame: One row per JSON object, in file order. An empty
        frame is returned when the file holds no record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Whitespace-only lines are skipped: json.loads would reject them.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

def create_treemap_for_column(df, column_name):
    """Show a plotly treemap of the value frequencies of one column.

    Each distinct value of ``df[column_name]`` becomes one top-level tile
    sized by its number of occurrences. When the column is missing, a
    warning is printed and nothing is drawn.

    Args:
        df: DataFrame holding the data.
        column_name: Name of the column to summarise.

    Returns:
        None. The figure is rendered through ``fig.show()``.
    """
    # Guard clause: nothing to plot for an unknown column.
    if column_name not in df.columns:
        print(f"warning, column: '{column_name}'does not exists")
        return

    counts = df[column_name].value_counts().reset_index()
    counts.columns = [column_name, 'count']

    # Empty parents put every tile directly under the root.
    treemap = go.Treemap(
        labels=counts[column_name],
        parents=[""] * len(counts),
        values=counts['count'],
        textinfo='label+value',
    )
    layout_options = dict(
        title=f'treemap column: {column_name}',
        paper_bgcolor='white',
        plot_bgcolor='white',
        font_color='black',
    )

    fig = go.Figure(treemap)
    fig.update_layout(**layout_options)
    fig.show()

In [62]:
# Columns whose value distribution is drawn as a treemap.
colonnes_a_travailler = [
    'ecoscore_tags',
    'ecoscore_score',
    'countries',
    'labels_note',
    'groups',
]
# From here on `df` is the whole file, no longer the random sample.
df = load_jsonl_to_dataframe(file_path)
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)

In [63]:
# Percentage of rows whose `ecoscore_score` is missing.
nb_total = df.shape[0]
nb_empty = df['ecoscore_score'].isnull().sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in ecoscore_score before NaN reduction : {pourcentage_empty:.2f}%")

percetange empty in ecoscore_score before NaN reduction : 0.00%


In [64]:
# Percentage of rows whose `countries` is missing.
nb_total = df.shape[0]
nb_empty = df['countries'].isnull().sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in countries before NaN reduction : {pourcentage_empty:.2f}%")

percetange empty in countries before NaN reduction : 0.00%


In [65]:
# Percentage of rows whose `groups` is missing.
nb_total = df.shape[0]
nb_empty = df['groups'].isnull().sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in groups before NaN reduction : {pourcentage_empty:.2f}%")

percetange empty in groups before NaN reduction : 0.00%


In [66]:
# Percentage of rows whose `text_data` is missing.
nb_total = df.shape[0]
nb_empty = df['text_data'].isnull().sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in text_data before NaN reduction : {pourcentage_empty:.2f}%")

percetange empty in text_data before NaN reduction : 0.00%


In [67]:
# Percentage of rows whose `labels_note` is missing.
nb_total = df.shape[0]
nb_empty = df['labels_note'].isnull().sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in labels_note before NaN reduction : {pourcentage_empty:.2f}%")

percetange empty in labels_note before NaN reduction : 0.00%


In [68]:
# Percentage of rows whose `categories` is missing.
nb_total = df.shape[0]
nb_empty = df['categories'].isnull().sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in categories before NaN reduction : {pourcentage_empty:.2f}%")

percetange empty in categories before NaN reduction : 0.00%


In [69]:
# CSV read into `df_csv`; a later cell reads its `true` and `predictions` columns.
# NOTE(review): absolute local path — pd.read_csv raises FileNotFoundError when
# the file is absent, and the cells that use `df_csv` then cannot run.
file_path_csv = '/home/carolus/Documents/school/green_ia/scripts/validation_predictions.csv'
df_csv = pd.read_csv(file_path_csv)

FileNotFoundError: [Errno 2] No such file or directory: '/home/carolus/Documents/school/green_ia/scripts/validation_predictions.csv'

In [None]:
# Display the first 60 rows of the predictions frame.
df_csv.head(60)

In [None]:
# Signed error per row: `true` minus `predictions`. Computed once; the column
# is added to df_csv and reused by every statistic below.
df_csv['difference'] = df_csv['true'] - df_csv['predictions']
max_diff = df_csv['difference'].max()
min_diff = df_csv['difference'].min()
mean_diff = df_csv['difference'].mean()
median_diff = df_csv['difference'].median()
total_rows = df_csv.shape[0]
# Share of rows whose absolute error is strictly greater than 2
# (expressed in the unit of the `true` column).
count_greater_than_2 = (df_csv['difference'].abs() > 2).sum()
percentage = (count_greater_than_2 / total_rows) * 100

print(f"Écart maximum: {max_diff}")
print(f"Écart minimum: {min_diff}")
print(f"Écart moyen: {mean_diff}")
print(f"Écart médian: {median_diff}")
print(f"poucentage lignes écart supérieur à 2% : {percentage}%")