In [74]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Let pandas display up to 100 rows before truncating a rendered frame.
pd.set_option('display.max_rows', 100)
# Suppress pandas DtypeWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [75]:
# Input JSON Lines file and the number of records drawn for the preview sample.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
file_path = '/home/carolus/Documents/school/green_ia/data/01_data/01_openfoodfacts_04.jsonl'
num_samples = 60

In [76]:
def sample_jsonl_file(file_path, num_samples, seed=None):
    """Draw a uniform random sample of records from a JSON Lines file.

    The file is streamed once and sampled with reservoir sampling, so at most
    ``num_samples`` raw lines are kept in memory regardless of the file size.
    Blank or whitespace-only lines are ignored: they are neither counted nor
    eligible for the sample, so a trailing empty line cannot break parsing.

    Args:
        file_path: Path of the JSONL file (one JSON document per line). The
            file is read as UTF-8.
        num_samples: Maximum number of records to keep. When the file holds
            fewer records, all of them are returned.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` (default) the module-level ``random``
            generator is used, as before.

    Returns:
        A pandas DataFrame with one row per sampled record. Rows follow the
        reservoir order, which is not necessarily the file order.

    Raises:
        json.JSONDecodeError: If a sampled line is not valid JSON. Only the
            sampled lines are parsed.
    """
    # A dedicated generator keeps seeded runs independent of global state.
    rng = random.Random(seed) if seed is not None else random
    sample_lines = []

    with open(file_path, 'r', encoding='utf-8') as file:
        line_count = 0
        for line in file:
            if not line.strip():
                # Skip empty lines so they are never sampled or parsed.
                continue
            line_count += 1
            if len(sample_lines) < num_samples:
                # Fill the reservoir with the first num_samples records.
                sample_lines.append(line)
            else:
                # Replace a random slot with probability num_samples / line_count.
                idx = rng.randint(0, line_count - 1)
                if idx < num_samples:
                    sample_lines[idx] = line

    data_list = [json.loads(line) for line in sample_lines]
    return pd.DataFrame(data_list)

In [77]:
# Draw a random sample of up to num_samples records from the JSONL file.
df = sample_jsonl_file(file_path, num_samples)

In [78]:
# Display the last 60 rows of the sampled frame.
df.tail(60)

Unnamed: 0,groups,ingredients_temp,packaging,name,ecoscore_tags,categories_temp,ecoscore_score,labels_temp,countries
0,unknown,,none,Shamsy halal ravioli,,,,[en:halal],en:fr
1,Cereals and potatoes,,,Ceci,0.25,"[en:plant-based-foods-and-beverages, en:plant-...",79.0,"[en:organic, en:eu-organic, en:it-bio-007]",Italy
2,Fruits and vegetables,"[en:carrot, en:vegetable, en:root-vegetable, e...",plastique,Légumes du marché Carottes/Haricots Plats/Cham...,0.0,"[en:plant-based-foods-and-beverages, en:plant-...",,[],France
3,Fruits and vegetables,[en:pinto-beans-frijoles-pintos],none,Pinto Beans,,"[en:plant-based-foods-and-beverages, en:plant-...",,,United States
4,unknown,,none,Tavaillon de Savoie,,,,,France
5,Fish Meat Eggs,"[fr:calamares, fr:aceite-de-girasol, en:tomato...",,Calamares en tinta,,"[en:seafood, en:mollusc, en:calamari]",,[],Espagne
6,Cereals and potatoes,"[en:wheat-flour, en:cereal, en:flour, en:wheat...",,Zelfrijzend bakmeel,0.0,"[en:plant-based-foods-and-beverages, en:plant-...",91.0,[],Nederland
7,Fruits and vegetables,,none,abricot sec,0.5,"[en:plant-based-foods-and-beverages, en:plant-...",44.0,,en:fr
8,Fat and sauces,,none,Sugo di anatra,,"[en:condiments, en:sauces, en:pasta-sauces]",,,en:it
9,unknown,,none,La pose gourmande Assortiment de nougats,,,,,en:fr


In [79]:
# Summary statistics for the sampled frame.
df.describe()

Unnamed: 0,ecoscore_tags,ecoscore_score
count,18.0,16.0
mean,0.430556,45.8125
std,0.340907,30.507308
min,0.0,-20.0
25%,0.25,29.25
50%,0.25,53.5
75%,0.6875,62.5
max,1.0,91.0


In [80]:
def load_jsonl_to_dataframe(file_path):
    """Load every record of a JSON Lines file into a pandas DataFrame.

    Args:
        file_path: Path of the JSONL file (one JSON document per line). The
            file is read as UTF-8.

    Returns:
        A pandas DataFrame with one row per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline) carry no record, so skip them
        # rather than letting json.loads fail on an empty string.
        data = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(data)

def create_treemap_for_column(df, column_name):
    """Show a flat Plotly treemap of value frequencies for one column.

    Each distinct value of ``df[column_name]`` becomes one tile sized by its
    number of occurrences, as computed by ``value_counts``.

    Args:
        df: DataFrame holding the data.
        column_name: Name of the column to summarise.

    Returns:
        None. The figure is displayed via ``fig.show()``. If the column is
        absent, a warning is printed and nothing is drawn.
    """
    column_missing = column_name not in df.columns
    if column_missing:
        print(f"warning, column: '{column_name}'does not exists")
        return

    # Two-column frame: the distinct value and how many times it occurs.
    frequencies = df[column_name].value_counts().reset_index()
    frequencies.columns = [column_name, 'count']

    # Every tile gets an empty parent, i.e. a single-level treemap.
    root_parents = [""] * len(frequencies)
    trace = go.Treemap(
        labels=frequencies[column_name],
        parents=root_parents,
        values=frequencies['count'],
        textinfo='label+value',
    )

    layout_options = {
        'title': f'treemap column: {column_name}',
        'paper_bgcolor': 'black',
        'plot_bgcolor': 'grey',
        'font_color': 'white',
    }
    figure = go.Figure(trace)
    figure.update_layout(**layout_options)
    figure.show()

In [81]:
# Columns to explore.
# NOTE(review): 'labels_note' is reported as "Column not found in data" by this
# notebook's own output; confirm the intended column name.
colonnes_a_travailler = ['ecoscore_tags', 'ecoscore_score', 'countries', 'labels_note']  
# Load the complete file; this rebinds `df`, replacing the sampled frame above.
df = load_jsonl_to_dataframe(file_path)
# One treemap per column; a missing column only prints a warning.
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)



In [82]:
def count_unique_values(jsonl_file_path, colonnes_a_travailler):
    """Summarise the distinct values of selected columns of a JSONL file.

    Args:
        jsonl_file_path: Path of the JSON Lines file (one JSON document per
            line). The file is read as UTF-8 and blank lines are ignored.
        colonnes_a_travailler: Iterable of column names to summarise.

    Returns:
        A dict keyed by column name, in the order given. For a numeric column
        the value is ``{'unique_count', 'min', 'max'}``; for any other column
        it is ``{'unique_count'}``; for a column absent from the data it is
        the string ``'Column not found in data'``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing newline) so json.loads never
        # receives an empty string.
        data = [json.loads(line) for line in file if line.strip()]
    df = pd.DataFrame(data)

    unique_counts = {}
    for column in colonnes_a_travailler:
        if column not in df.columns:
            unique_counts[column] = 'Column not found in data'
            continue

        values = df[column]
        summary = {'unique_count': values.nunique()}
        if pd.api.types.is_numeric_dtype(values):
            # A value range is only meaningful for numeric columns.
            summary['min'] = values.min()
            summary['max'] = values.max()
        unique_counts[column] = summary
    return unique_counts

In [83]:
# Distinct-value summary (plus min/max for numeric columns) for each selected column.
# Note: count_unique_values reads the JSONL file again instead of reusing `df`.
unique_values = count_unique_values(file_path, colonnes_a_travailler)
print(unique_values)

{'ecoscore_tags': {'unique_count': 7, 'min': 0.0, 'max': 1.0}, 'ecoscore_score': {'unique_count': 155, 'min': -30.0, 'max': 125.0}, 'countries': {'unique_count': 17591}, 'labels_note': 'Column not found in data'}
