In [54]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Let pandas print up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)
# Silence pandas DtypeWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [55]:
# JSON Lines input file read by the sampling, treemap and unique-count cells.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
file_path = '/home/carolus/Documents/school/green_ia/data/01_data/01_openfoodfacts_04.jsonl'
# Number of records drawn into the random sample.
num_samples = 60

In [56]:
def sample_jsonl_file(file_path, num_samples, seed=None):
    """Draw a uniform random sample of records from a JSON Lines file.

    The file is streamed once and only ``num_samples`` raw lines are kept in
    memory at any time (reservoir sampling), so files that do not fit in
    memory can still be sampled. Only the retained lines are JSON-decoded.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded file with one JSON object per line.
    num_samples : int
        Maximum number of records to return. If the file holds fewer
        records, all of them are returned.
    seed : int, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible and the global random state is left untouched.
        When omitted, the module-level ``random`` generator is used.

    Returns
    -------
    pandas.DataFrame
        One row per sampled record.

    Raises
    ------
    json.JSONDecodeError
        If a sampled line is not valid JSON.
    """
    rng = random if seed is None else random.Random(seed)
    sample_lines = []
    line_count = 0

    # Explicit UTF-8: the locale default would garble the non-ASCII product
    # names on platforms where it is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines carry no record; skipping them keeps them out of
            # the reservoir, where json.loads would later fail on them.
            if not line.strip():
                continue
            line_count += 1
            if len(sample_lines) < num_samples:
                # Fill phase: the first num_samples records are always kept.
                sample_lines.append(line)
            else:
                # Replacement phase: record number line_count lands in the
                # reservoir with probability num_samples / line_count.
                idx = rng.randint(0, line_count - 1)
                if idx < num_samples:
                    sample_lines[idx] = line

    return pd.DataFrame([json.loads(line) for line in sample_lines])

In [57]:
# Draw a random sample of num_samples records from the JSONL file.
df = sample_jsonl_file(file_path, num_samples)

In [58]:
# Display the last 60 rows of the sample; with num_samples = 60 this is the whole sample.
df.tail(60)

Unnamed: 0,groups,packaging,name,ecoscore_tags,ecoscore_score,countries,ingredients,categories,labels_note
0,,none,confiture de cidre au safran,,,ireland,none,none,0.0
1,,none,caldo de pollo,,,spain,none,none,0.0
2,cereals and potatoes,none,bio basil tofu,b,77.0,switzerland,none,plantbasedfoods plantbasedfoodsandbeverages,0.444444
3,,none,weißer storch,,,germany,none,none,0.0
4,,none,ravioli ricotta et champignons,,,france,none,none,0.222222
5,,none,rôti de longe de porc à l'ancienne,,,canada,none,none,0.0
6,,none,microwave popcorn,,,belgium,none,none,0.0
7,,none,poulet blanc roti ramon,,,france,none,none,0.0
8,fat and sauces,,sauce bolognaise bio,d,33.0,france,vegetable sugar disaccharide addedsugar salt,,0.333333
9,,,plant-based milk alternative - chocolate,,,"united states, world",none,none,0.222222


In [59]:
# Summary statistics (count, mean, std, quartiles, min/max) for the sample's numeric columns.
df.describe()

Unnamed: 0,ecoscore_score,labels_note
count,16.0,60.0
mean,48.9375,0.085185
std,25.590281,0.179938
min,14.0,0.0
25%,31.75,0.0
50%,38.5,0.0
75%,77.25,0.111111
max,79.0,1.0


In [60]:
def load_jsonl_to_dataframe(file_path):
    """Load every record of a JSON Lines file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded file with one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per record, in file order. Empty if the file has no records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale;
    # blank lines are skipped because json.loads cannot parse them.
    with open(file_path, 'r', encoding='utf-8') as file:
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

def create_treemap_for_column(df, column_name):
    """Show a single-level treemap of value frequencies for one column.

    Each distinct value of ``df[column_name]`` becomes one tile whose area is
    the number of rows holding that value. If the column is absent, a warning
    is printed and nothing is drawn. The function returns ``None`` either way;
    the figure is displayed as a side effect.
    """
    if column_name not in df.columns:
        print(f"warning, column: '{column_name}'does not exists")
        return

    # Frequency table with explicit headers, so the lookups below do not
    # depend on how reset_index() names the columns.
    counts = df[column_name].value_counts().reset_index()
    counts.columns = [column_name, 'count']

    # Every tile hangs off the (empty) root, giving a flat one-level treemap.
    root_parents = ["" for _ in range(len(counts))]
    tiles = go.Treemap(
        labels=counts[column_name],
        parents=root_parents,
        values=counts['count'],
        textinfo='label+value',
    )

    layout_options = {
        'title': f'treemap column: {column_name}',
        'paper_bgcolor': 'black',
        'plot_bgcolor': 'grey',
        'font_color': 'white',
    }
    fig = go.Figure(tiles)
    fig.update_layout(**layout_options)
    fig.show()

In [61]:
# Columns to analyse ("colonnes à travailler"); also reused by the unique-count cell.
colonnes_a_travailler = ['ecoscore_tags', 'ecoscore_score', 'countries', 'labels_note']  
# Rebinds df to the full file contents, replacing the random sample loaded earlier.
df = load_jsonl_to_dataframe(file_path)
# One treemap per column; missing columns only print a warning.
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)

In [None]:
def count_unique_values(jsonl_file_path, colonnes_a_travailler):
    """Summarise the distinct values of selected columns of a JSON Lines file.

    Parameters
    ----------
    jsonl_file_path : str or path-like
        Path to a UTF-8 encoded file with one JSON object per line.
    colonnes_a_travailler : iterable of str
        Names of the columns to summarise.

    Returns
    -------
    dict
        Maps each requested column name to either
        ``{'unique_count': n}`` for non-numeric columns,
        ``{'unique_count': n, 'min': lo, 'max': hi}`` for numeric columns, or
        the string ``'Column not found in data'`` when the column is absent.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale;
    # blank lines are skipped because json.loads cannot parse them.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        records = [json.loads(line) for line in file if line.strip()]
    df = pd.DataFrame(records)

    unique_counts = {}
    for column in colonnes_a_travailler:
        if column not in df.columns:
            unique_counts[column] = 'Column not found in data'
            continue

        series = df[column]
        summary = {'unique_count': series.nunique()}
        # A value range is only reported for numeric columns.
        if pd.api.types.is_numeric_dtype(series):
            summary['min'] = series.min()
            summary['max'] = series.max()
        unique_counts[column] = summary
    return unique_counts

In [None]:
# Re-reads the file and reports, per column, the distinct-value count (plus min/max when numeric).
unique_values = count_unique_values(file_path, colonnes_a_travailler)
print(unique_values)

{'ecoscore_tags': {'unique_count': 5}, 'ecoscore_score': {'unique_count': 102, 'min': 0.0, 'max': 100.0}, 'countries': {'unique_count': 13521}, 'labels_note': {'unique_count': 10, 'min': 0, 'max': 9}}
