# Allepo Data

In [2]:
from src.data.datasets.aleppo import AleppoDataLoader
import pandas as pd

file_path = "../data_downloads/aleppo_processed.csv"
keep_columns = ["p_num", "date", "bgl"]
aleppo = AleppoDataLoader(file_path=file_path, keep_columns=keep_columns)

train_data = aleppo.train_data
test_data = aleppo.validation_data

In [3]:
train_data

Unnamed: 0_level_0,p_num,bg-0:00,day_start_shift,datetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-22 10:37:40,2,8.00,2015-05-22,2015-05-22 10:37:40
2015-05-22 10:38:32,2,8.56,2015-05-22,2015-05-22 10:38:32
2015-05-22 10:41:29,2,7.94,2015-05-22,2015-05-22 10:41:29
2015-05-22 10:46:29,2,8.39,2015-05-22,2015-05-22 10:46:29
2015-05-22 10:51:29,2,8.28,2015-05-22,2015-05-22 10:51:29
...,...,...,...,...
2015-11-10 06:34:34,293,6.67,2015-11-10,2015-11-10 06:34:34
2015-11-10 06:39:34,293,6.67,2015-11-10,2015-11-10 06:39:34
2015-11-10 06:44:34,293,6.67,2015-11-10,2015-11-10 06:44:34
2015-11-10 06:49:34,293,6.56,2015-11-10,2015-11-10 06:49:34


In [4]:
test_data

Unnamed: 0_level_0,p_num,bg-0:00,day_start_shift,datetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-12-30 07:01:25,2,4.56,2015-12-30,2015-12-30 07:01:25
2015-12-30 07:06:25,2,4.50,2015-12-30,2015-12-30 07:06:25
2015-12-30 07:11:25,2,4.56,2015-12-30,2015-12-30 07:11:25
2015-12-30 07:16:25,2,4.78,2015-12-30,2015-12-30 07:16:25
2015-12-30 07:21:25,2,4.89,2015-12-30,2015-12-30 07:21:25
...,...,...,...,...
2015-11-30 06:38:08,293,9.06,2015-11-30,2015-11-30 06:38:08
2015-11-30 06:43:08,293,9.06,2015-11-30,2015-11-30 06:43:08
2015-11-30 06:48:08,293,9.00,2015-11-30,2015-11-30 06:48:08
2015-11-30 06:53:08,293,8.89,2015-11-30,2015-11-30 06:53:08


In [3]:
from tqdm.notebook import tqdm

patients_by_timestep: dict[int, list[int]] = {}
train_df_by_timestep: dict[int, pd.DataFrame] = {}
train_5min = None
train_15min = None

unique_patients = aleppo.train_data["p_num"].unique()
# Process each patient
for patient in tqdm(
    unique_patients, total=len(unique_patients), desc="Processing patients"
):
    train_patient = aleppo.train_data[aleppo.train_data["p_num"] == patient]
    test_patient = aleppo.validation_data[aleppo.validation_data["p_num"] == patient]

    # use the first 2 rows to figure out the interval
    time_step = (
        train_patient["datetime"].iloc[1] - train_patient["datetime"].iloc[0]
    ).components.minutes

    # Add to appropriate list based on time step
    if time_step not in patients_by_timestep:
        patients_by_timestep[time_step] = []
    patients_by_timestep[time_step].append(patient)

    if time_step not in train_df_by_timestep:
        train_df_by_timestep[time_step] = pd.concat([train_patient])
    else:
        train_df_by_timestep[time_step] = pd.concat(
            [train_df_by_timestep[time_step], train_patient]
        )

for timestep, patients in patients_by_timestep.items():
    print(f"{timestep}-minute patients: {patients}")

Processing patients:   0%|          | 0/226 [00:00<?, ?it/s]

0-minute patients: [2, 3, 8, 14, 17, 19, 22, 29, 31, 32, 40, 48, 49, 52, 54, 58, 61, 69, 70, 74, 76, 80, 98, 105, 108, 112, 118, 123, 124, 127, 132, 134, 140, 145, 149, 158, 163, 166, 167, 168, 177, 179, 181, 184, 186, 189, 197, 203, 209, 215, 218, 223, 224, 226, 239, 243, 244, 252, 260, 265, 267, 291, 292]
5-minute patients: [5, 7, 15, 16, 18, 20, 23, 24, 37, 38, 39, 41, 46, 55, 60, 67, 68, 72, 77, 79, 82, 95, 111, 116, 119, 121, 135, 137, 139, 141, 146, 155, 162, 164, 172, 176, 183, 185, 187, 193, 200, 201, 205, 213, 219, 220, 227, 228, 229, 235, 245, 249, 251, 263, 269, 271, 277, 281, 283, 285]
9-minute patients: [9, 234]
2-minute patients: [10, 109, 175, 211, 233, 256, 276, 284]
35-minute patients: [11]
12-minute patients: [21, 86]
1-minute patients: [26, 27, 42, 45, 53, 62, 73, 87, 97, 102, 103, 113, 138, 148, 156, 190, 214, 217, 236, 248, 273]
46-minute patients: [30]
19-minute patients: [33, 136, 157, 210, 274]
41-minute patients: [35, 101]
25-minute patients: [36, 65, 264]
4-mi

In [4]:
from sktime.transformations.series.impute import Imputer
import pandas as pd
import numpy as np


def impute_missing_values(
    df,
    columns,
    bg_method="linear",
    hr_method="linear",
    step_method="constant",
    cal_method="constant",
) -> pd.DataFrame:
    """Imputes missing values in specified columns of a dataframe using different methods based on the data type.

    Args:
        df (pd.DataFrame): Input dataframe containing missing values
        columns (list): List of column names to impute missing values for
        bg_method (str, optional): Imputation method for blood glucose data.
            Valid values: 'linear', 'nearest'. Defaults to "linear".
        hr_method (str, optional): Imputation method for heart rate data.
            Valid values: 'linear', 'nearest'. Defaults to "linear".
        step_method (str, optional): Imputation method for step count data.
            Valid values: 'constant'.
        cal_method (str, optional): Imputation method for calorie data.
            Valid values: 'constant'.

    Returns:
        pd.DataFrame: Copy of input dataframe with missing values imputed using appropriate methods for each data type
    """
    df_imputed = df.copy()
    transform = None

    for col in columns:
        if col in df.columns:
            if "bg" in col.lower():
                transform = Imputer(method=bg_method)
            elif "hr" in col.lower():
                # Use linear or nearest neighbor interpolation for heart rate
                # TODO: Need more research on this
                transform = Imputer(method=hr_method)
            elif "step" in col.lower():
                # Use constant imputation with 0 for steps
                transform = Imputer(method=step_method, value=0)
            elif "cals" in col.lower():
                # Use constant imputation with minimum value for calories
                min_val = df[col].min()
                transform = Imputer(method=cal_method, value=min_val)

            if transform is not None:
                df_imputed[col] = transform.fit_transform(df[col].to_frame())

    return df_imputed

In [None]:
# import pandas as pd
# from src.tuning.benchmark import impute_missing_values


def reduce_features(df):
    p_df = df.copy()

    y_feature = ["bg-0:00"]
    x_features = ["bg-0:00"]
    features = list(set(x_features + y_feature))
    p_df = p_df[features]

    # Get unique instance and time levels
    instance_idx = p_df.index.get_level_values(0).unique()
    time_idx = sorted(p_df.index.get_level_values(1).unique())

    # Reindex each instance to ensure uniform time index
    aligned_dfs = []
    for inst in tqdm(
        instance_idx, total=len(instance_idx), desc="Processing instances (patients)"
    ):
        inst_df = p_df.loc[inst]  # get time-indexed df for this instance
        # Drop duplicate time entries (keep first occurrence)
        inst_df = inst_df[~inst_df.index.duplicated(keep="first")]

        inst_df = inst_df.reindex(time_idx)  # align time index
        inst_df["instance"] = inst
        aligned_dfs.append(inst_df)

    # Combine back into a panel DataFrame with MultiIndex
    aligned_df = pd.concat(aligned_dfs)
    aligned_df.set_index("instance", append=True, inplace=True)
    aligned_df = aligned_df.reorder_levels([1, 0])  # (instance, time)
    aligned_df.sort_index(inplace=True)
    print("Imputing missing values...")

    # Impute missing values caused by reindexing
    aligned_df = impute_missing_values(aligned_df, columns=x_features)
    aligned_df = impute_missing_values(aligned_df, columns=y_feature)

    # Split into y and X
    y = aligned_df[y_feature]
    X = aligned_df[x_features]

    return y, X

In [None]:
import polars as pl
import pandas as pd


def reduce_features_x(df: pd.DataFrame):
    # Define features
    y_feature = ["bg-0:00"]
    x_features = ["bg-0:00"]
    features = list(set(x_features + y_feature))

    # Convert to Polars
    df_pl = pl.from_pandas(df.reset_index(), include_index=False)
    print(df_pl.columns)

    # Drop duplicates on (p_num, datetime)
    df_pl = df_pl.unique(subset=["p_num", "datetime"], keep="first")

    # Get full datetime range
    # all_times = df_pl.select("datetime").unique().sort("datetime").to_series().to_list()

    # Pivot each feature wide by datetime
    wide_dfs = []
    for feature in features:
        print(f"Processing feature: {feature}")
        pivoted = (
            df_pl.select(["p_num", "datetime", feature])
            .pivot(index="p_num", columns="datetime", values=feature)
            .sort("p_num")
        )

        # Rename columns with feature prefix
        pivoted = pivoted.rename(
            {
                col: f"{feature}_{col}" if col != "p_num" else "p_num"
                for col in pivoted.columns
            }
        )
        wide_dfs.append(pivoted)

    # Concatenate all features horizontally
    df_wide = wide_dfs[0]
    for wdf in wide_dfs[1:]:
        df_wide = df_wide.join(wdf, on="p_num", how="inner")

    # Melt back to long
    df_long = df_wide.melt(id_vars="p_num")

    # Split column names into feature and datetime
    df_long = (
        df_long.with_columns(
            [pl.col("variable").str.split("_", inclusive=False).alias("split")]
        )
        .with_columns(
            [
                pl.col("split").arr.get(0).alias("feature"),
                pl.col("split").arr.get(1).alias("datetime_str"),
            ]
        )
        .drop(["split", "variable"])
    )

    # Parse datetime
    df_long = df_long.with_columns(
        [
            pl.col("datetime_str")
            .str.strip_chars()
            .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False)
        ]
    ).rename({"datetime_str": "datetime", "value": "value"})

    # Pivot to wide (columns = features), restore (p_num, datetime) index
    df_final = df_long.pivot(
        index=["p_num", "datetime"], columns="feature", values="value"
    )

    # Convert to pandas
    df_pd = df_final.to_pandas()
    df_pd.set_index(["p_num", "datetime"], inplace=True)
    df_pd.sort_index(inplace=True)

    # Impute missing values
    df_pd = impute_missing_values(df_pd, columns=features)

    # Final split
    y = df_pd[y_feature]
    X = df_pd[x_features]

    return y, X

In [None]:
timestep = 5
# get the first patient
df = train_df_by_timestep[timestep].reset_index()
df = df.set_index(["p_num", "date"]).sort_index()
y_train, X_train = reduce_features(df)

['p_num', 'date', 'bg-0:00', 'day_start_shift', 'datetime']
Processing feature: bg-0:00


  .pivot(index="p_num", columns="datetime", values=feature)
  df_long = df_wide.melt(id_vars="p_num")


: 

In [40]:
X_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,bg-0:00
p_num,date,Unnamed: 2_level_1
5,2014-12-15 19:49:00,8.0
5,2014-12-15 19:54:00,7.83
5,2014-12-15 19:59:00,7.56
5,2014-12-15 20:04:00,7.28
5,2014-12-15 20:09:00,7.17


In [7]:
from sktime.forecasting.ttm import TinyTimeMixerForecaster
import pandas as pd

# loss_callback = LossPlottingCallback()

current_time = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")

training_samples = len(y_train)
batch_size = 128
steps_per_epoch = training_samples // batch_size

TIME_STEP_SIZE = timestep
SAVE_EVERY_EPOCH = 2  # How many epochs to save

# interval = "05mins" if use_5min else "15mins"
interval = "05mins"
dir_path = f"../../src/models/ttm/{interval}/{current_time}"

ttm_forecaster = TinyTimeMixerForecaster(
    config={
        "context_length": (60 // TIME_STEP_SIZE) * 18,  # 18 hours of context length
        "prediction_length": (60 // TIME_STEP_SIZE) * 6,  # 6 hours of prediction length
    },
    training_args={
        "num_train_epochs": 2,
        "output_dir": dir_path,
        "use_cpu": False,
        "per_device_train_batch_size": batch_size,
        "save_steps": steps_per_epoch * SAVE_EVERY_EPOCH,
        # "callbacks": [],  # Add the callback here, why is this not working?
    },
)

print("selected interval: ", interval)
print(f"Training samples: {training_samples}")
print(f"Batch size: {batch_size}")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Saving a checkpoint every {steps_per_epoch * SAVE_EVERY_EPOCH} steps")

selected interval:  05mins
Training samples: 3957039
Batch size: 128
Steps per epoch: 30914
Saving a checkpoint every 61828 steps


In [8]:
HOURS_TO_PREDICT = 6
NUM_STEPS = HOURS_TO_PREDICT * 60 // TIME_STEP_SIZE

fh = np.arange(1, NUM_STEPS + 1)

In [9]:
ttm_forecaster.fit(y=y_train, X=X_train, fh=fh)

  torch.utils._pytree._register_pytree_node(


config.json: 0.00B [00:00, ?B/s]

context_length / num_patches == patch_length == patch_stride
Provided configuration:
- context_length: 216
- num_patches: 8
- patch_length: 64
- patch_stride: 64
Configuration has been automatically updated to:
- context_length: 216
- num_patches: 8
- patch_length: 27
- patch_stride: 27
  warn(msg)
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.24M [00:00<?, ?B/s]

Some weights of TinyTimeMixerForPrediction were not initialized from the model checkpoint at ibm/TTM and are newly initialized because the shapes did not match:
- backbone.encoder.patcher.weight: found shape torch.Size([192, 64]) in the checkpoint and torch.Size([192, 27]) in the model instantiated
- head.base_forecast_block.bias: found shape torch.Size([96]) in the checkpoint and torch.Size([72]) in the model instantiated
- head.base_forecast_block.weight: found shape torch.Size([96, 1024]) in the checkpoint and torch.Size([72, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AssertionError: All series must has the same index

In [1]:
import transformers

transformers.__version__

'4.33.3'