In [34]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Let pandas print up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)
# Silence pandas DtypeWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [35]:
# JSON Lines file explored in this notebook.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# on another machine without editing it.
file_path = '/home/carolus/Documents/school/green_ia/data/01_data/01_train_01.jsonl'
# Number of rows requested when sampling the file with sample_jsonl_file.
num_samples = 500

In [36]:
def count_objects_in_jsonl(file_path):
    """Return the number of lines in the text file at ``file_path``.

    The file is read as UTF-8 and every line is counted, blank ones included,
    so the result equals the number of JSON objects only when each line holds
    exactly one object.

    Args:
        file_path: Path of the file to scan.

    Returns:
        int: Total number of lines in the file (0 for an empty file).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Stream the file so it never has to fit in memory.
        return sum(1 for _ in handle)
# Report the line count of the file (one line is taken to be one object).
print(f"nombre objets {file_path}: {count_objects_in_jsonl(file_path)}")


nombre objets /home/carolus/Documents/school/green_ia/data/01_data/01_train_01.jsonl: 905547


In [37]:
def sample_jsonl_file(file_path, num_samples, seed=None):
    """Draw a uniform random sample of records from a JSON Lines file.

    The file is streamed once and a fixed-size reservoir of raw lines is
    maintained, so memory use is bounded by ``num_samples`` lines. Only the
    lines that end up in the reservoir are parsed as JSON.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file.
        num_samples: Maximum number of records to return. When the file holds
            fewer non-blank lines, all of them are returned.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` (default) the module-level ``random``
            generator is used, so an earlier ``random.seed(...)`` still applies.

    Returns:
        pandas.DataFrame: One row per sampled record (empty if nothing was
        sampled).

    Raises:
        json.JSONDecodeError: If a sampled non-blank line is not valid JSON.
    """
    rng = random if seed is None else random.Random(seed)
    reservoir = []
    seen = 0

    # Explicit UTF-8 keeps decoding independent of the platform default and
    # consistent with count_objects_in_jsonl.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines carry no record and would make json.loads fail.
            if not line.strip():
                continue
            seen += 1
            if len(reservoir) < num_samples:
                reservoir.append(line)
            else:
                # Replace a random slot with probability num_samples / seen,
                # which keeps every record equally likely to be retained.
                slot = rng.randint(0, seen - 1)
                if slot < num_samples:
                    reservoir[slot] = line

    return pd.DataFrame([json.loads(line) for line in reservoir])

In [38]:
# Draw the random sample explored in the following cells. No random seed is
# set in the visible cells, so each run may select different rows.
df = sample_jsonl_file(file_path, num_samples)

In [39]:
# Share of rows in df whose 'ecoscore_tags' value is the literal string 'empty'.
nb_total = df.shape[0]
nb_empty = df['ecoscore_tags'].eq('empty').sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in ecoscore_tags : {pourcentage_empty:.2f}%")

percetange empty in ecoscore_tags : 0.00%


In [40]:
# Share of rows in df whose 'categories' value is the literal string 'empty'.
nb_total = df.shape[0]
nb_empty = df['categories'].eq('empty').sum()
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in categories : {pourcentage_empty:.2f}%")

percetange empty in categories : 19.80%


In [41]:
# Display the last 50 rows of df for a visual check of the columns.
df.tail(50)

Unnamed: 0,groups,ecoscore_tags,ecoscore_score,countries,categories,labels_note,text_data
450,0.444444,,50.0,186.0,empty,0.0,empty colby jack empty
451,0.666667,,50.0,163.0,"meats-and-their-products, meats, pork-and-its-...",0.0,empty pincho amarillo con perejil empty
452,0.444444,3.0,36.0,60.0,"dairies, fermented-foods, fermented-milk-produ...",0.0,"plástico gouda tierno pasteurised-cow-s-milk, ..."
453,0.444444,,50.0,60.0,empty,0.777778,"étui gésiers confit de volaille duck-fat, oil-..."
454,0.0,,50.0,60.0,"snacks, sweet-snacks, cocoa-and-its-products, ...",0.0,empty schokopralinen empty
455,0.444444,1.0,79.0,60.0,"dairies, fermented-foods, fermented-milk-produ...",0.0,"plastic cup aloe yogurt aloe-vera, vegetable, ..."
456,0.444444,,50.0,46.0,empty,0.0,"empty 1664 blanc 0,0% water, malted-barley, ce..."
457,0.777778,2.0,50.0,60.0,"beverages, alcoholic-beverages, beers, lagers",0.0,"verre, bouteille duchesse anne triple alcohol,..."
458,0.666667,3.0,21.0,186.0,"seafood, fishes-and-their-products, canned-foo...",0.0,"empty chunk light tuna in water light-tuna, wa..."
459,0.333333,0.0,100.0,60.0,"plant-based-foods-and-beverages, plant-based-f...",0.0,"empty polpa di pomodoro tomato-pulp, vegetable..."


In [42]:
def load_jsonl_to_dataframe(file_path):
    """Load every record of a JSON Lines file into a DataFrame.

    Each non-blank line is parsed with ``json.loads``; blank lines are skipped
    instead of aborting the load. All records are held in memory at once.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file.

    Returns:
        pandas.DataFrame: One row per parsed record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform default and
    # consistent with count_objects_in_jsonl.
    with open(file_path, 'r', encoding='utf-8') as file:
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

def create_treemap_for_column(df, column_name):
    """Show a Plotly treemap of how often each value occurs in one column.

    When ``column_name`` is not a column of ``df`` a warning is printed and
    nothing is plotted. Otherwise the occurrences of each value are counted
    with ``value_counts()`` (default arguments) and rendered as a flat,
    single-level treemap whose tiles show the value and its count.

    Args:
        df: DataFrame holding the data.
        column_name: Name of the column to summarise.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    if column_name not in df.columns:
        print(f"warning, column: '{column_name}'does not exists")
        return

    # Two-column frame: the distinct value and the number of rows holding it.
    counts = df[column_name].value_counts().reset_index()
    counts.columns = [column_name, 'count']

    # Every tile hangs off the root (empty parent), giving a one-level treemap.
    root_parents = [""] * len(counts)
    trace = go.Treemap(
        labels=counts[column_name],
        parents=root_parents,
        values=counts['count'],
        textinfo='label+value',
    )

    layout_options = {
        'title': f'treemap column: {column_name}',
        'paper_bgcolor': 'white',
        'plot_bgcolor': 'white',
        'font_color': 'black',
    }
    fig = go.Figure(trace)
    fig.update_layout(**layout_options)
    fig.show()

In [43]:
# Load the whole file and draw one treemap per column of interest.
# NOTE(review): this rebinds `df`, which until here held the random sample
# returned by sample_jsonl_file; cells re-run afterwards see the full data.
df = load_jsonl_to_dataframe(file_path)
colonnes_a_travailler = [
    'ecoscore_tags',
    'ecoscore_score',
    'countries',
    'labels_note',
    'groups',
]
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)

In [44]:
# Read the predictions CSV into df_csv.
# NOTE(review): absolute, machine-specific path. The recorded run of this cell
# ended with FileNotFoundError, in which case df_csv is not defined for the
# cells that follow.
file_path_csv = '/home/carolus/Documents/school/green_ia/scripts/validation_predictions.csv'
df_csv = pd.read_csv(file_path_csv)

FileNotFoundError: [Errno 2] No such file or directory: '/home/carolus/Documents/school/green_ia/scripts/validation_predictions.csv'

In [None]:
# Display the first 60 rows of df_csv.
df_csv.head(60)

In [None]:
# Signed error per row (true value minus prediction), stored as a new column
# on df_csv. Computed once; the original cell assigned it twice.
df_csv['difference'] = df_csv['true'] - df_csv['predictions']

# Summary statistics of the signed error (not of its absolute value).
max_diff = df_csv['difference'].max()
min_diff = df_csv['difference'].min()
mean_diff = df_csv['difference'].mean()
median_diff = df_csv['difference'].median()

# Share of rows whose absolute error exceeds 2.
# NOTE(review): the printed label says "2%" while the comparison is against an
# absolute difference of 2 -- confirm which unit is intended.
total_rows = df_csv.shape[0]
count_greater_than_2 = (df_csv['difference'].abs() > 2).sum()
percentage = (count_greater_than_2 / total_rows) * 100

print(f"Écart maximum: {max_diff}")
print(f"Écart minimum: {min_diff}")
print(f"Écart moyen: {mean_diff}")
print(f"Écart médian: {median_diff}")
print(f"poucentage lignes écart supérieur à 2% : {percentage}%")