In [10]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Let pandas render up to 100 rows when a frame is displayed.
pd.set_option('display.max_rows', 100)
# Silence pandas' DtypeWarning for the whole notebook.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [11]:
# Input JSON Lines file read by every cell below.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = '/home/carolus/Documents/school/green_ia/data/00_data/00_train_01.jsonl'
# Number of rows drawn by sample_jsonl_file for the exploratory sample.
num_samples = 500

In [12]:
def count_objects_in_jsonl(file_path):
    """Count the JSON objects stored in a JSON Lines file.

    Every non-blank line is counted as one object; the lines are not parsed.
    Whitespace-only lines (for example trailing padding at the end of the
    file) are skipped so they do not inflate the count.

    Args:
        file_path: Path of the file, opened as UTF-8 text.

    Returns:
        int: Number of non-blank lines in the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Streaming generator: the file is never held in memory as a whole.
        return sum(1 for line in file if line.strip())
# Print the value returned by count_objects_in_jsonl for the input file.
print(f"nombre objets {file_path}: {count_objects_in_jsonl(file_path)}")


nombre objets /home/carolus/Documents/school/green_ia/data/00_data/00_train_01.jsonl: 804262


In [13]:
def sample_jsonl_file(file_path, num_samples, seed=None):
    """Draw a uniform random sample of rows from a JSON Lines file.

    The file is streamed once and a reservoir of at most ``num_samples`` raw
    lines is maintained, so memory use does not grow with the file size.
    Only the lines that end up in the reservoir are parsed.

    Args:
        file_path: Path of the JSONL file, read as UTF-8 text.
        num_samples: Maximum number of rows to return. If the file holds
            fewer non-blank lines, all of them are returned in file order.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the sampling, so the same seed and file always yield the
            same sample. When ``None`` (default) the module-level ``random``
            state is used.

    Returns:
        pandas.DataFrame: One row per sampled JSON object.

    Raises:
        json.JSONDecodeError: If a sampled line is not valid JSON.
    """
    rng = random if seed is None else random.Random(seed)
    reservoir = []
    seen = 0

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines carry no object and would make json.loads fail.
                continue
            seen += 1
            if len(reservoir) < num_samples:
                reservoir.append(line)
            else:
                # randint is inclusive on both ends: the slot is uniform over
                # the `seen` lines read so far, so the current line replaces a
                # reservoir entry with probability num_samples / seen.
                slot = rng.randint(0, seen - 1)
                if slot < num_samples:
                    reservoir[slot] = line

    return pd.DataFrame([json.loads(line) for line in reservoir])

In [14]:
# Bind `df` to a random sample of at most `num_samples` rows from the input file.
df = sample_jsonl_file(file_path, num_samples)

In [15]:
# Share of rows in `df` whose 'ecoscore_tags' value is the literal string 'empty'.
nb_empty = df['ecoscore_tags'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in ecoscore_tags : {pourcentage_empty:.2f}%")

percetange empty in ecoscore_tags : 13.40%


In [19]:
# Share of rows in `df` whose 'packaging' value is the literal string 'empty'.
# NOTE(review): this cell was last executed as In [19], i.e. after the In [18]
# cell rebound `df` to the full file, so its saved output appears to describe
# the whole dataset rather than the 500-row sample. Re-run top to bottom to confirm.
nb_empty = df['packaging'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in packaging : {pourcentage_empty:.2f}%")

percetange empty in packaging : 75.64%


In [20]:
# Share of rows in `df` whose 'categories' value is the literal string 'empty'.
# NOTE(review): this cell was last executed as In [20], i.e. after the In [18]
# cell rebound `df` to the full file, so its saved output appears to describe
# the whole dataset rather than the 500-row sample. Re-run top to bottom to confirm.
nb_empty = df['categories'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in categories : {pourcentage_empty:.2f}%")

percetange empty in categories : 9.63%


In [21]:
# Share of rows in `df` whose 'ingredients' value is the literal string 'empty'.
# NOTE(review): this cell was last executed as In [21], i.e. after the In [18]
# cell rebound `df` to the full file, so its saved output appears to describe
# the whole dataset rather than the 500-row sample. Re-run top to bottom to confirm.
nb_empty = df['ingredients'].eq('empty').sum()
nb_total = df.shape[0]
pourcentage_empty = nb_empty / nb_total * 100
print(f"percetange empty in ingredients : {pourcentage_empty:.2f}%")

percetange empty in ingredients : 54.57%


In [16]:
# Display the last 50 rows of `df` as the cell output.
df.tail(50)

Unnamed: 0,groups,packaging,name,ecoscore_tags,ecoscore_score,countries,ingredients,categories,labels_note
450,0.444444,empty,fromage blanc nature,b,71.0,60.0,empty,"dairies, fermented-foods, fermented-milk-produ...",0.0
451,0.777778,empty,jus de fruit à base de concentré orange,b,79.0,60.0,empty,"plant-based-foods-and-beverages, beverages, pl...",0.444444
452,0.222222,"plastique, sachet, frais, sous atmosphère prot...",choucroute au vin,c,59.0,60.0,"sauerkraut, vegetable, brassica, cabbage, whit...","plant-based-foods-and-beverages, plant-based-f...",0.0
453,0.0,empty,facturas,empty,49.0,7.0,"cereal, flour, wheat, cereal-flour, wheat-flou...","snacks, sweet-snacks, biscuits-and-cakes, past...",0.0
454,0.777778,"brique, carton",boisson tropical 6x20cl unité,c,58.0,60.0,"water, fruit-juice-from-concentrate, fruit-jui...","plant-based-foods-and-beverages, beverages, pl...",0.222222
455,0.666667,empty,jambon cuit sup fumé x4,e,12.0,60.0,empty,"meats-and-their-products, meats, prepared-meat...",0.0
456,0.111111,empty,dijon mustard,b,70.0,185.0,"water, mustard-seed, condiment, mustard, spice...","condiments, sauces, mustards, dijon-mustards",1.0
457,0.666667,"plastique, barquette",jambon cru fumé speck,e,0.0,60.0,"ham, animal, meat, pork, pork-meat, salt, spic...","meats-and-their-products, meats, prepared-meat...",0.444444
458,0.0,empty,cookie thins,c,49.0,186.0,empty,"snacks, sweet-snacks, biscuits-and-cakes, bisc...",0.0
459,0.444444,empty,tomillo,b,79.0,163.0,empty,"beverages-and-beverages-preparations, plant-ba...",0.777778


In [17]:
def load_jsonl_to_dataframe(file_path):
    """Load every object of a JSON Lines file into a DataFrame.

    Args:
        file_path: Path of the JSONL file, read as UTF-8 text.

    Returns:
        pandas.DataFrame: One row per JSON object, in file order. Blank or
        whitespace-only lines are ignored.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines: json.loads would raise on them.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

def create_treemap_for_column(df, column_name):
    """Show a single-level Plotly treemap of value frequencies for one column.

    Each distinct value of ``df[column_name]`` becomes one tile, sized by the
    number of rows holding that value and labelled with the value and count.

    Args:
        df: DataFrame to read from.
        column_name: Name of the column to summarise.

    Returns:
        None. The figure is shown via ``fig.show()``. If the column is not in
        ``df``, a warning is printed and nothing is plotted.
    """
    if column_name not in df.columns:
        print(f"warning, column: '{column_name}'does not exists")
        return

    # Two-column frame: distinct value and its frequency
    # (value_counts leaves NaN out by default).
    counts = (
        df[column_name]
        .value_counts()
        .reset_index()
        .set_axis([column_name, 'count'], axis=1)
    )

    # An empty parent for every tile keeps the hierarchy flat.
    tiles = go.Treemap(
        labels=counts[column_name],
        parents=[""] * len(counts),
        values=counts['count'],
        textinfo='label+value',
    )
    fig = go.Figure(tiles)

    layout_options = {
        'title': f'treemap column: {column_name}',
        'paper_bgcolor': 'white',
        'plot_bgcolor': 'white',
        'font_color': 'black',
    }
    fig.update_layout(**layout_options)
    fig.show()

In [18]:
# Columns to visualise; one treemap is drawn per entry.
colonnes_a_travailler = ['ecoscore_tags', 'ecoscore_score', 'countries', 'labels_note', 'groups']  
# NOTE(review): this rebinds the notebook-global `df` (until here the random
# sample) to the entire file. Any cell executed after this one, including
# earlier cells re-run out of order, will see the full dataset instead.
df = load_jsonl_to_dataframe(file_path)
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)