In [1]:
import json
import random
import pandas as pd
import jsonlines
import warnings
import plotly.graph_objects as go

# Let pandas display up to 100 rows before truncating a DataFrame's output.
pd.set_option('display.max_rows', 100)
# Suppress pandas DtypeWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [2]:
# JSON Lines dataset consumed by the sampling, treemap and unique-count cells
# (the file name suggests an Open Food Facts extract — TODO confirm).
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
file_path = '/home/carolus/Documents/school/green_ia/data/00_data/00_openfoodfacts_04.jsonl'
# Number of records requested from the random sampler.
num_samples = 60

In [3]:
def sample_jsonl_file(file_path, num_samples, seed=None):
    """Draw a uniform random sample of records from a JSON Lines file.

    The file is streamed once and at most ``num_samples`` raw lines are kept
    in memory (reservoir sampling), so the whole file never has to be loaded.
    Only the retained lines are parsed as JSON.

    Args:
        file_path: Path of the JSON Lines file (one JSON document per line).
        num_samples: Maximum number of records to return. If the file holds
            fewer records, all of them are returned in file order.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` (the default) the module-level
            ``random`` generator is used, exactly as before.

    Returns:
        pandas.DataFrame with one row per sampled record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a sampled line is not valid JSON.
    """
    # A dedicated generator keeps seeded runs from disturbing global state.
    rng = random if seed is None else random.Random(seed)
    reservoir = []

    # JSON Lines files are UTF-8; do not rely on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        seen = 0
        for line in file:
            # Blank lines carry no record and would break json.loads later.
            if not line.strip():
                continue
            seen += 1
            if len(reservoir) < num_samples:
                reservoir.append(line)
            else:
                # Keep the new record with probability num_samples / seen.
                slot = rng.randint(0, seen - 1)
                if slot < num_samples:
                    reservoir[slot] = line

    return pd.DataFrame([json.loads(line) for line in reservoir])

In [4]:
# Draw a random sample of at most `num_samples` records from the JSONL file.
df = sample_jsonl_file(file_path, num_samples)

In [5]:
# Preview the last 20 rows of the sampled DataFrame.
df.tail(20)

Unnamed: 0,groups,packaging,name,ecoscore_tags,ecoscore_score,code,countries,ingredients,categories,labels_note
40,fish meat eggs,,1/4 jambon sec tranche vpf,e,10.0,2447593057079,france,,"meats-and-their-products, meats, prepared-meat...",2
41,,,hazelnoot pure chocolade,,,8718906105546,netherlands,,,0
42,,,pain nordique,,,203057012464,france,,,0
43,fish meat eggs,,2 filets de poulet blanc,c,45.0,203339033453,france,,"meats-and-their-products, meats, chicken-and-i...",2
44,,,lais frais entier pasteurisé,,,3424140000015,france,,,0
45,fish meat eggs,"frais, sous-vide",les milanaises,c,40.0,217217028530,france,,"meats-and-their-products, meat-preparations, m...",2
46,fruits and vegetables,,chorizo &quot;dulce&quot;,e,2.0,4013593202153,france,"pork-meat, animal, meat, pork, spice-or-bell-p...","plant-based-foods-and-beverages, plant-based-f...",1
47,fat and sauces,,pastasaus,,,8719153024949,netherlands,"tomato, vegetable, fruit-vegetable, lentils, l...","condiments, sauces, pasta-sauces",6
48,,,ckympur,,,3800078010515,italy,,,0
49,,,macarrão com ovos pai nosso barilla pacote 500g,,,7898951850101,brazil,,,0


In [6]:
def load_jsonl_to_dataframe(file_path):
    """Load every record of a JSON Lines file into a DataFrame.

    Args:
        file_path: Path of the JSON Lines file (one JSON document per line).

    Returns:
        pandas.DataFrame with one row per record, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON Lines files are UTF-8; do not rely on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines carry no record and would make json.loads raise.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

def create_treemap_for_column(df, column_name):
    """Display a flat treemap of how often each value of a column occurs.

    Args:
        df: DataFrame holding the data.
        column_name: Name of the column whose value frequencies are plotted.

    Returns:
        None. The figure is shown as a side effect; if the column is absent a
        warning is printed and nothing is drawn.
    """
    column_missing = column_name not in df.columns
    if column_missing:
        print(f"warning, column: '{column_name}'does not exists")
        return

    # One row per distinct value, with its number of occurrences.
    frequencies = df[column_name].value_counts().reset_index()
    frequencies.columns = [column_name, 'count']

    # Every tile hangs directly off the root, hence the empty parent labels.
    treemap = go.Treemap(
        labels=frequencies[column_name],
        parents=["" for _ in range(len(frequencies))],
        values=frequencies['count'],
        textinfo='label+value',
    )
    layout = {
        'title': f'treemap column: {column_name}',
        'paper_bgcolor': 'black',
        'plot_bgcolor': 'grey',
        'font_color': 'white',
    }

    figure = go.Figure(treemap)
    figure.update_layout(**layout)
    figure.show()

In [7]:
# Columns to explore ("colonnes à travailler" = columns to work on).
colonnes_a_travailler = ['ecoscore_tags', 'ecoscore_score', 'countries', 'labels_note']  
# NOTE: this rebinds `df` from the random sample to the full dataset.
df = load_jsonl_to_dataframe(file_path)
# Show one treemap of value frequencies per selected column.
for column in colonnes_a_travailler:
    create_treemap_for_column(df, column)

In [14]:
def count_unique_values(jsonl_file_path, colonnes_a_travailler):
    """Summarise the distinct values of selected columns of a JSON Lines file.

    Args:
        jsonl_file_path: Path of the JSON Lines file (one JSON document per
            line).
        colonnes_a_travailler: Iterable of column names to summarise.

    Returns:
        dict keyed by column name. For a column present in the data the value
        is a dict with ``'unique_count'`` (number of distinct non-null values)
        plus ``'min'`` and ``'max'`` when the column has a numeric dtype. For a
        column absent from the data the value is the string
        ``'Column not found in data'``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON Lines files are UTF-8; do not rely on the platform default encoding.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        # Blank lines carry no record and would make json.loads raise.
        records = [json.loads(line) for line in file if line.strip()]
    df = pd.DataFrame(records)

    unique_counts = {}
    for column in colonnes_a_travailler:
        if column not in df.columns:
            unique_counts[column] = 'Column not found in data'
            continue

        series = df[column]
        summary = {'unique_count': series.nunique()}
        # A value range is only meaningful for numeric columns.
        if pd.api.types.is_numeric_dtype(series):
            summary['min'] = series.min()
            summary['max'] = series.max()
        unique_counts[column] = summary
    return unique_counts

In [15]:
# Summarise the distinct values of the selected columns; note that
# count_unique_values reads the file again rather than reusing `df`.
unique_values = count_unique_values(file_path, colonnes_a_travailler)
print(unique_values)

{'ecoscore_tags': {'unique_count': 5}, 'ecoscore_score': {'unique_count': 102, 'min': 0.0, 'max': 100.0}, 'countries': {'unique_count': 13521}, 'labels_note': {'unique_count': 10, 'min': 0, 'max': 9}}
