In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
# Resolve the directory two levels above the current working directory and put
# it at the front of sys.path so that imports are looked up there first
# (presumably the project root that holds the `src` package -- confirm against
# the repository layout).
module_path = os.path.abspath('../../')
sys.path.insert(0, module_path)

In [None]:
from src.data.data_loader import load_data

# Fetch the "train" split of the "kaggle_brisT1D" data source through the
# project's loader. No column selection is passed here, so whatever columns
# the loader returns by default are used.
df = load_data(
    data_source_name="kaggle_brisT1D",
    dataset_type="train",
)

In [None]:
import polars as pl

polars_df = pl.DataFrame(df)

# Expressions that split the melted column name into two parts:
#   feature -> the leading run of word characters
#   offset  -> everything from the first '+' or '-' onward
# NOTE(review): this assumes value columns are named "<feature><sign><offset>";
# confirm against the dataset's actual column names.
feature_expr = pl.col("feature_offset").str.extract(r"^(\w+)").alias("feature")
offset_expr = pl.col("feature_offset").str.extract(r"([+-].+)").alias("offset")

# Melt (unpivot) the data one patient at a time and write each result to its
# own parquet file in the current working directory.
patient_ids = df["p_num"].unique()
for p_id in patient_ids:
    patient_df = polars_df.filter(pl.col("p_num") == p_id)

    # Wide -> long: every column other than the id columns becomes one
    # (feature_offset, value) row.
    long_patient_df = patient_df.melt(
        id_vars=["id", "p_num", "time"],
        variable_name="feature_offset",
        value_name="value",
    )

    # Add the parsed feature/offset columns, then drop the combined name.
    melted_patient_df = long_patient_df.with_columns(
        [feature_expr, offset_expr]
    ).drop("feature_offset")

    print(melted_patient_df.head())
    melted_patient_df.write_parquet(f"melted_{p_id}.parquet")



In [None]:
import glob

# Turn each long-format "melted_*.parquet" file back into a wide table:
# every distinct value in the "feature" column (bg, insulin, etc) becomes its
# own column, with one row per (id, time, offset).
# NOTE(review): "p_num" is not part of the pivot index, so it is presumably
# absent from the pivoted output -- confirm if downstream code needs it.
parquet_files = glob.glob("melted_*.parquet")
for melted_path in parquet_files:
    melted_df = pl.read_parquet(melted_path)

    # "first" keeps the first value if an (id, time, offset, feature)
    # combination occurs more than once.
    cleaned_df = melted_df.pivot(
        index=["id", "time", "offset"],
        columns="feature",
        values="value",
        aggregate_function="first",
    )

    # Write next to the input, swapping the "melted_" prefix for "cleaned_".
    cleaned_path = melted_path.replace("melted_", "cleaned_")
    cleaned_df.write_parquet(cleaned_path)

In [None]:
# Preview every "cleaned_*.parquet" file: print the patient label taken from
# the file name, the first 20 rows that have a non-null "bg" value, and the
# full list of columns.
parquet_files = glob.glob("cleaned_*.parquet")
for cleaned_path in parquet_files:
    cleaned_df = pl.read_parquet(cleaned_path)

    has_bg = pl.col("bg").is_not_null()
    non_empty = cleaned_df.filter(has_bg)

    # The label is the second "_"-separated token of the text before the
    # first "." in the path.
    name_before_dot = cleaned_path.split(".")[0]
    patient_label = name_before_dot.split("_")[1]

    print(f'patient {patient_label}')
    print(non_empty.head(20))
    print(cleaned_df.columns)