## Process the raw data

TODO: Figure out how to load the iterable dataset directly into the `Trainer`. The `Trainer` also needs to know that the dataset contains multiple patients.

Example queries:<br>
-- SELECT DISTINCT gid, p_num FROM groups ORDER BY gid<br>

SELECT DISTINCT g.gid <br>
FROM groups g <br>
WHERE EXISTS (SELECT 1 FROM readings r WHERE r.gid = g.gid) <br>
   OR EXISTS (SELECT 1 FROM messages m WHERE m.gid = g.gid);<br>

-- "H9J8/xnp4ibw9mHc90Jxe1qujl" -- This doesn't have enough data <br>
-- "iju21PLZgGO0bA8ZH2KqqT46Kk" <br>
-- "5n/9DRBf5q8a+lSPmBzBGdeq3A" <br>
-- "pcguxJKN6qt2Qs9JjwWsd0fLiU" <br>
-- "hoqyr8hYzab9weMm3IoxI2demV" <br>

-- select * from messages <br>
-- where gid = 'H9J8/xnp4ibw9mHc90Jxe1qujl'<br>
-- order by date desc <br>


In [9]:
from src.data.diabetes_datasets.data_loader import get_loader

# Loader options are gathered in one dict so they are easy to tweak between runs.
loader_options = {
    "data_source_name": "gluroo",
    "use_cached": True,
    "config": None,
    # Temporary: loads all data into processed_data, all at once.
    "load_all": True,
    "max_workers": 10,
}
loader = get_loader(**loader_options)

# Last expression of the cell: show which patient keys were loaded.
loader.processed_data.keys()

2026-01-29T16:50:22 - Initializing GlurooDataLoader with use_cached=True
2026-01-29T16:50:22 - Found cached Parquet data; skipping processing.
2026-01-29T16:50:22 - Loading all patients from 1 Parquet file(s) into processed_data...
2026-01-29T16:50:22 - Loaded 4 patients into processed_data.


dict_keys(['gluroo_0', 'gluroo_7', 'gluroo_8', 'gluroo_11'])

In [5]:
import pandas as pd

import os
from pathlib import Path

# Resolve the Gluroo cache directory without hardcoding one developer's home directory.
# Resolution order:
#   1. the GLUROO_CACHE_DIR environment variable, if it is set;
#   2. the nearest `cache/data/gluroo` folder found by walking up from the working directory.
_gluroo_cache_rel = Path("cache") / "data" / "gluroo"
_gluroo_cache_env = os.environ.get("GLUROO_CACHE_DIR")
if _gluroo_cache_env:
    gluroo_cache_dir = Path(_gluroo_cache_env).expanduser()
else:
    _cwd = Path.cwd()
    gluroo_cache_dir = next(
        (
            candidate / _gluroo_cache_rel
            for candidate in (_cwd, *_cwd.parents)
            if (candidate / _gluroo_cache_rel).is_dir()
        ),
        None,
    )
    if gluroo_cache_dir is None:
        # Fail loudly with a hint instead of a bare "file not found" from the parquet reader.
        raise FileNotFoundError(
            f"Could not find {_gluroo_cache_rel} at or above {_cwd}; "
            "set GLUROO_CACHE_DIR to the Gluroo cache directory."
        )

# Kept as a str so `path` has the same type it had before.
path = str(
    gluroo_cache_dir / "processed" / "parquet" / "partition=000" / "batch_000001.parquet"
)
df = pd.read_parquet(path)

In [7]:
import pandas as pd

from src.data.models import ColumnNames

# Combine every patient's processed frame into a single DataFrame and export it as CSV.
# The patient-id column is (re)written from the dict key for every row, because newly
# filled rows won't have p_num.
# `assign` returns a new frame, so the frames held in loader.processed_data are not
# modified and the cell can be re-run safely. The comprehension also keeps its loop
# variables local, so a notebook-level `df` is no longer overwritten by this cell.
all_data = [
    patient_df.assign(**{ColumnNames.P_NUM.value: p_num})
    for p_num, patient_df in loader.processed_data.items()
]
if all_data:
    df_all = pd.concat(all_data, ignore_index=True)
    df_all.to_csv("gluroo_processed_data.csv", index=False)
else:
    print("No processed data found in loader.processed_data.")

## Iterable Dataset - WIP

In [4]:
# Columns requested from the loader for the streaming dataset.
stream_columns = ["datetime", "p_num", "bg_mM", "food_g", "dose_units", "cob", "iob"]

stream_ds = loader.get_hf_streaming_dataset(
    columns=stream_columns,
    # patient_ids=["gluroo_1", "gluroo_2"],  # drop to None to check all data
    batch_size=1024,
    validate_non_empty=True,  # default; set False if you don't want the peek
)
# first_batch = next(iter(stream_ds))

In [None]:
# first_batch

{'datetime': [Timestamp('2024-06-28 15:48:00+0000', tz='UTC'),
  Timestamp('2024-06-28 15:51:00+0000', tz='UTC'),
  Timestamp('2024-06-28 15:54:00+0000', tz='UTC'),
  Timestamp('2024-06-28 15:57:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:00:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:03:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:06:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:09:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:12:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:15:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:18:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:21:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:24:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:27:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:30:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:33:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:36:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:39:00+0000', tz='UTC'),
  Timestamp('2024-06-28 16:42:00+0000', tz='UTC'),
  Timestamp('2024-0

In [None]:
import torch
from transformers import TrainingArguments
import os

from src.models.ttm.model import create_ttm_model
from src.models.ttm.ttm import get_model

# Build the TTM wrapper and keep a reference to its `.model` attribute.
ttm = create_ttm_model(model_path="ibm-granite/granite-timeseries-ttm-r2")
model = ttm.model

out_dir = "./out"
batch_size = 1024

# Training arguments, grouped by concern. Keyword order does not affect the result.
finetune_forecast_args = TrainingArguments(
    # --- Output / checkpointing ---
    output_dir=out_dir,
    overwrite_output_dir=False,
    save_strategy="steps",
    save_steps=2000,  # Save checkpoints every 2000 steps
    save_total_limit=100,
    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric used to select the best checkpoint
    greater_is_better=False,  # For loss, lower is better
    # --- Schedule ---
    # learning_rate=learning_rate,
    # num_train_epochs=num_epochs,
    max_steps=5000,  # Required for streaming datasets (no __len__); LR scheduler needs known total steps
    # --- Evaluation ---
    do_eval=True,
    eval_strategy="steps",
    eval_steps=1000,  # Evaluate every 1000 steps (less frequent = faster training)
    # --- Batching / data loading ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=1,
    remove_unused_columns=False,
    # --- Logging ---
    report_to="none",
    logging_strategy="steps",
    logging_steps=100,  # Log every 100 steps
    logging_first_step=True,  # Log the first step
    logging_dir=os.path.join(out_dir, "logs"),
    log_level="info",  # Control log verbosity
    disable_tqdm=False,  # Keep progress bars
    # --- Device / precision ---
    use_cpu=False,
    fp16=False,
)

# fp16 is only supported on CUDA; disable on MPS/CPU to avoid "fp16 requires a GPU (not 'mps')"
has_cuda = torch.cuda.is_available()
if not has_cuda:
    os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
    finetune_forecast_args.fp16 = False
    finetune_forecast_args.bf16 = False

# # Build train dataset from notebook's ts_df (from stream_ds). TTM expects datetime, bg_mM, p_num.
# train_df = ts_df.reset_index().rename(
#     columns={"timestamp": "datetime", "target": "bg_mM", "item_id": "p_num"}
# )
# train_loader, val_loader, _ = ttm._prepare_data(train_data=train_df)
# train_dataset = train_loader.dataset if train_loader else None
# eval_dataset = val_loader.dataset if val_loader else None
#
#
# Checkpoint id and keyword options passed to get_model for the model to fine-tune.
ttm_checkpoint = "ibm-granite/granite-timeseries-ttm-r2"
get_model_options = dict(
    context_length=512,
    prediction_length=96,
    freq_prefix_tuning=False,
    prefer_l1_loss=False,
    prefer_longer_context=True,
    # Can also provide TTM Config args. A param?
    loss="mse",
    quantile=0.5,
)
finetune_forecast_model = get_model(ttm_checkpoint, **get_model_options)

# Import the pickle-safe data collator from the gluroo module

# Use Trainer directly
# trainer = Trainer(
#     model=finetune_forecast_model,
#     args=finetune_forecast_args,
#     train_dataset=stream_ds,
#     eval_dataset=stream_ds,  # For testing
#     data_collator=gluroo_data_collator,
# )
# trainer.train()
# trainer.save_model()