# Allepo Data

In [1]:
from src.data.datasets.aleppo import AleppoDataLoader
import pandas as pd

file_path = "../data_downloads/aleppo_processed.csv"
keep_columns = ["p_num", "date", "bgl"]
aleppo = AleppoDataLoader(file_path=file_path, keep_columns=keep_columns)

train_data = aleppo.train_data
test_data = aleppo.validation_data

In [26]:
train_data.isna().sum()

p_num              0
bg-0:00            0
day_start_shift    0
datetime           0
dtype: int64

In [3]:
test_data

Unnamed: 0_level_0,p_num,bg-0:00,day_start_shift,datetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-12-30 07:01:25,2,4.56,2015-12-30,2015-12-30 07:01:25
2015-12-30 07:06:25,2,4.50,2015-12-30,2015-12-30 07:06:25
2015-12-30 07:11:25,2,4.56,2015-12-30,2015-12-30 07:11:25
2015-12-30 07:16:25,2,4.78,2015-12-30,2015-12-30 07:16:25
2015-12-30 07:21:25,2,4.89,2015-12-30,2015-12-30 07:21:25
...,...,...,...,...
2015-11-30 06:38:08,293,9.06,2015-11-30,2015-11-30 06:38:08
2015-11-30 06:43:08,293,9.06,2015-11-30,2015-11-30 06:43:08
2015-11-30 06:48:08,293,9.00,2015-11-30,2015-11-30 06:48:08
2015-11-30 06:53:08,293,8.89,2015-11-30,2015-11-30 06:53:08


In [2]:
from tqdm.notebook import tqdm

patients_by_timestep: dict[int, list[int]] = {}
train_df_by_timestep: dict[int, pd.DataFrame] = {}
train_5min = None
train_15min = None

unique_patients = aleppo.train_data["p_num"].unique()
# Process each patient
for patient in tqdm(
    unique_patients, total=len(unique_patients), desc="Processing patients"
):
    train_patient = aleppo.train_data[aleppo.train_data["p_num"] == patient]
    test_patient = aleppo.validation_data[aleppo.validation_data["p_num"] == patient]

    # use the first 2 rows to figure out the interval
    time_step = (
        train_patient["datetime"].iloc[1] - train_patient["datetime"].iloc[0]
    ).components.minutes

    # Add to appropriate list based on time step
    if time_step not in patients_by_timestep:
        patients_by_timestep[time_step] = []
    patients_by_timestep[time_step].append(patient)

    if time_step not in train_df_by_timestep:
        train_df_by_timestep[time_step] = pd.concat([train_patient])
    else:
        train_df_by_timestep[time_step] = pd.concat(
            [train_df_by_timestep[time_step], train_patient]
        )

for timestep, patients in patients_by_timestep.items():
    print(f"{timestep}-minute patients: {patients}")

Processing patients:   0%|          | 0/226 [00:00<?, ?it/s]

0-minute patients: [2, 3, 8, 14, 17, 19, 22, 29, 31, 32, 40, 48, 49, 52, 54, 58, 61, 69, 70, 74, 76, 80, 98, 105, 108, 112, 118, 123, 124, 127, 132, 134, 140, 145, 149, 158, 163, 166, 167, 168, 177, 179, 181, 184, 186, 189, 197, 203, 209, 215, 218, 223, 224, 226, 239, 243, 244, 252, 260, 265, 267, 291, 292]
5-minute patients: [5, 7, 15, 16, 18, 20, 23, 24, 37, 38, 39, 41, 46, 55, 60, 67, 68, 72, 77, 79, 82, 95, 111, 116, 119, 121, 135, 137, 139, 141, 146, 155, 162, 164, 172, 176, 183, 185, 187, 193, 200, 201, 205, 213, 219, 220, 227, 228, 229, 235, 245, 249, 251, 263, 269, 271, 277, 281, 283, 285]
9-minute patients: [9, 234]
2-minute patients: [10, 109, 175, 211, 233, 256, 276, 284]
35-minute patients: [11]
12-minute patients: [21, 86]
1-minute patients: [26, 27, 42, 45, 53, 62, 73, 87, 97, 102, 103, 113, 138, 148, 156, 190, 214, 217, 236, 248, 273]
46-minute patients: [30]
19-minute patients: [33, 136, 157, 210, 274]
41-minute patients: [35, 101]
25-minute patients: [36, 65, 264]
4-mi

In [3]:
from sktime.transformations.series.impute import Imputer
import pandas as pd
import numpy as np


def impute_missing_values(
    df,
    columns,
    bg_method="linear",
    hr_method="linear",
    step_method="constant",
    cal_method="constant",
) -> pd.DataFrame:
    """Imputes missing values in specified columns of a dataframe using different methods based on the data type.

    Args:
        df (pd.DataFrame): Input dataframe containing missing values
        columns (list): List of column names to impute missing values for
        bg_method (str, optional): Imputation method for blood glucose data.
            Valid values: 'linear', 'nearest'. Defaults to "linear".
        hr_method (str, optional): Imputation method for heart rate data.
            Valid values: 'linear', 'nearest'. Defaults to "linear".
        step_method (str, optional): Imputation method for step count data.
            Valid values: 'constant'.
        cal_method (str, optional): Imputation method for calorie data.
            Valid values: 'constant'.

    Returns:
        pd.DataFrame: Copy of input dataframe with missing values imputed using appropriate methods for each data type
    """
    df_imputed = df.copy()
    transform = None

    for col in columns:
        if col in df.columns:
            if "bg" in col.lower():
                transform = Imputer(method=bg_method)
            elif "hr" in col.lower():
                # Use linear or nearest neighbor interpolation for heart rate
                # TODO: Need more research on this
                transform = Imputer(method=hr_method)
            elif "step" in col.lower():
                # Use constant imputation with 0 for steps
                transform = Imputer(method=step_method, value=0)
            elif "cals" in col.lower():
                # Use constant imputation with minimum value for calories
                min_val = df[col].min()
                transform = Imputer(method=cal_method, value=min_val)

            if transform is not None:
                df_imputed[col] = transform.fit_transform(df[col].to_frame())

    return df_imputed

In [4]:
def prepare_uniform_timeseries(df, interval_minutes=5, resolve_duplicates="drop"):
    """
    Preprocess time series so each patient has identical, regularly spaced timestamps.
    
    Parameters:
        df (pd.DataFrame): MultiIndex ['p_num', 'date'].
        interval_minutes (int): Interval in minutes for interpolation.
        resolve_duplicates (str): 'drop' (default) or 'mean' to resolve timestamp duplicates.
    
    Returns:
        pd.DataFrame: Uniform time series for each patient.
    """
    df = df.sort_index()

    if not isinstance(df.index, pd.MultiIndex) or df.index.names != ["p_num", "date"]:
        raise ValueError("DataFrame must have a MultiIndex with levels ['p_num', 'date']")

    resampled = []

    for p_num, group in df.groupby(level=0):
        group = group.droplevel(0).sort_index()

        # Handle duplicates
        if group.index.duplicated().any():
            if resolve_duplicates == "drop":
                group = group[~group.index.duplicated(keep='first')]
            elif resolve_duplicates == "mean":
                group = group.groupby(group.index).mean()
            else:
                raise ValueError("Invalid resolve_duplicates option: choose 'drop' or 'mean'")

        new_index = pd.date_range(
            start=group.index.min(),
            end=group.index.max(),
            freq=f"{interval_minutes}min"
        )

        group.index = pd.DatetimeIndex(group.index)

        group = group.reindex(new_index)
        # group = group.interpolate(method='linear')
        group["bg-0:00"] = group["bg-0:00"].interpolate(method='linear', limit=12, limit_direction='both')
        group['p_num'] = p_num
        group = group.set_index('p_num', append=True).swaplevel()
        resampled.append(group)

    df_uniform = pd.concat(resampled).sort_index()

    # Trim all patients to shortest available length
    lengths = df_uniform.groupby(level=0).size()
    min_length = lengths.min()

    trimmed = []
    for p_num, group in df_uniform.groupby(level=0):
        trimmed.append(group.iloc[:min_length])

    df_final = pd.concat(trimmed).sort_index()
    return df_final


In [5]:
def split_patients_by_gap(df: pd.DataFrame):
    df_reset = df.reset_index()
    df_reset.sort_values(by=['p_num', 'date'], inplace=True)

    # Calculate time delta per patient
    df_reset['time_delta'] = df_reset.groupby('p_num')['date'].diff()
    gap_threshold = pd.Timedelta(days=2)

    # Create a new group every time a gap exceeds the threshold
    df_reset['new_group'] = (
        df_reset['time_delta'] > gap_threshold
    ).astype(int)

    # Cumulative sum to create unique group IDs within each patient
    df_reset['split_id'] = df_reset.groupby('p_num')['new_group'].cumsum()



In [6]:
timestep = 5
df = train_df_by_timestep[timestep].reset_index()
df = df.set_index(["p_num", "date"])

# Apply the function
df = df[["bg-0:00"]]
df_processed = prepare_uniform_timeseries(df, interval_minutes=5, resolve_duplicates="mean")
# y_train, X_train = reduce_features(df)
# df

In [49]:
df_processed.reset_index()[df_processed.reset_index()["p_num"] == 5]

Unnamed: 0,p_num,level_1,bg-0:00
0,5,2014-12-15 19:49:00,8.00
1,5,2014-12-15 19:54:00,7.83
2,5,2014-12-15 19:59:00,7.56
3,5,2014-12-15 20:04:00,7.28
4,5,2014-12-15 20:09:00,7.17
...,...,...,...
58091,5,2015-07-05 12:44:00,
58092,5,2015-07-05 12:49:00,
58093,5,2015-07-05 12:54:00,
58094,5,2015-07-05 12:59:00,


In [7]:
import plotly.express as px
import polars as pl
# fyi this is before cleaning

# df = pl.DataFrame(y_train.reset_index()).filter(pl.col("p_num") == 5).to_pandas()
tmp = train_data.reset_index()
tmp = tmp[tmp["p_num"] == 5]
fig = px.line(tmp, title="Training Data", x="date", y="bg-0:00")
fig.show()

In [8]:
import plotly.express as px
import polars as pl
# fyi this is before cleaning

# df = pl.DataFrame(y_train.reset_index()).filter(pl.col("p_num") == 5).to_pandas()
tmp = df_processed.reset_index()
tmp = tmp[tmp["p_num"] == 5]
fig = px.line(tmp, title="Training Data", x="level_1", y="bg-0:00")
fig.show()

In [None]:
import polars as pl

# Step 0: Prepare data (reset index and drop rows where "bg-0:00" is missing)
df_clean = pl.DataFrame(df_processed.reset_index()).drop_nulls(subset=["bg-0:00"])

# Step 1: Find patients with >= 1000 entries
patient_counts = (
    df_clean
    .group_by("p_num")
    .agg(pl.count())
    .filter(pl.col("count") >= 1000)
    .select("p_num")
)

# Step 2: Filter original cleaned df to only those patients
df_filtered = df_clean.filter(pl.col("p_num").is_in(patient_counts["p_num"]))

result = (
    df_filtered
    .with_row_index("original_index")  # keep original order info
    .sort(["p_num", "original_index"])  # ensure sorting by patient and original order
    .group_by("p_num")
    .head(1000)  # first 1000 rows per patient
    .drop("original_index")
)

result.group_by("p_num").agg(pl.col("bg-0:00").count())


`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)


`is_in` with a collection of the same datatype is ambiguous and deprecated.
Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.



p_num,bg-0:00
i64,u32
5,1000
7,1000
23,1000
24,1000
37,1000
…,…
249,1000
269,1000
271,1000
277,1000


In [10]:
result_with_enumerated_index = (
    result
    .rename({"level_1": "date"})
    .sort(["p_num", "date"])  # ensure sorted by patient and datetime
    .with_columns(
        pl.arange(0, pl.count()).over("p_num").alias("count_index")
    )
    .drop("date")
)

y_train = (
    result_with_enumerated_index
    .to_pandas()
    .set_index(["p_num", "count_index"])
)

y_train


`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)



Unnamed: 0_level_0,Unnamed: 1_level_0,bg-0:00
p_num,count_index,Unnamed: 2_level_1
5,0,8.00
5,1,7.83
5,2,7.56
5,3,7.28
5,4,7.17
...,...,...
281,995,9.50
281,996,9.44
281,997,9.44
281,998,9.61


In [40]:
from sktime.forecasting.ttm import TinyTimeMixerForecaster
import pandas as pd

# loss_callback = LossPlottingCallback()

current_time = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")

training_samples = len(y_train)
batch_size = 128_000
steps_per_epoch = training_samples // batch_size

TIME_STEP_SIZE = timestep
SAVE_EVERY_EPOCH = 2  # How many epochs to save

# interval = "05mins" if use_5min else "15mins"
interval = "05mins"
dir_path = f"../../src/models/ttm/{interval}/{current_time}"

ttm_forecaster = TinyTimeMixerForecaster(
    config={
        "context_length": (60 // TIME_STEP_SIZE) * 18,  # 18 hours of context length
        "prediction_length": (60 // TIME_STEP_SIZE) * 6,  # 6 hours of prediction length
    },
    training_args={
        "num_train_epochs": 2,
        "output_dir": dir_path,
        "use_cpu": True,
        "per_device_train_batch_size": batch_size,
        "save_steps": steps_per_epoch * SAVE_EVERY_EPOCH,
        # "callbacks": [],  # Add the callback here, why is this not working?
    },
)

print("selected interval: ", interval)
print(f"Training samples: {training_samples}")
print(f"Batch size: {batch_size}")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Saving a checkpoint every {steps_per_epoch * SAVE_EVERY_EPOCH} steps")

selected interval:  05mins
Training samples: 34000
Batch size: 128000
Steps per epoch: 0
Saving a checkpoint every 0 steps


In [41]:
HOURS_TO_PREDICT = 6
NUM_STEPS = HOURS_TO_PREDICT * 60 // TIME_STEP_SIZE

fh = np.arange(1, NUM_STEPS + 1)

In [42]:
from sktime.forecasting.chronos import ChronosForecaster
chronos_forecaster = ChronosForecaster("amazon/chronos-t5-tiny")
ttm_forecaster.fit(y=y_train, fh=fh)


Invalid configuration detected. The provided values do not satisfy the required condition:
context_length / num_patches == patch_length == patch_stride
Provided configuration:
- context_length: 216
- num_patches: 8
- patch_length: 64
- patch_stride: 64
Configuration has been automatically updated to:
- context_length: 216
- num_patches: 8
- patch_length: 27
- patch_stride: 27

Some weights of TinyTimeMixerForPrediction were not initialized from the model checkpoint at ibm/TTM and are newly initialized because the shapes did not match:
- backbone.encoder.patcher.weight: found shape torch.Size([192, 64]) in the checkpoint and torch.Size([192, 27]) in the model instantiated
- head.base_forecast_block.bias: found shape torch.Size([96]) in the checkpoint and torch.Size([72]) in the model instantiated
- head.base_forecast_block.weight: found shape torch.Size([96, 1024]) in the checkpoint and torch.Size([72, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stre

KEYS IS:
backbone.encoder.patcher.weight
{'missing_keys': [], 'unexpected_keys': [], 'mismatched_keys': ['backbone.encoder.patcher.weight', 'head.base_forecast_block.bias', 'head.base_forecast_block.weight'], 'error_msgs': []}
KEYS IS:
head.base_forecast_block.bias
{'missing_keys': [], 'unexpected_keys': [], 'mismatched_keys': ['backbone.encoder.patcher.weight', 'head.base_forecast_block.bias', 'head.base_forecast_block.weight'], 'error_msgs': []}
KEYS IS:
head.base_forecast_block.weight
{'missing_keys': [], 'unexpected_keys': [], 'mismatched_keys': ['backbone.encoder.patcher.weight', 'head.base_forecast_block.bias', 'head.base_forecast_block.weight'], 'error_msgs': []}


Step,Training Loss


In [None]:
chronos_forecaster.predict()

## Predicting on test

In [15]:
test_data.head()

Unnamed: 0_level_0,p_num,bg-0:00,day_start_shift,datetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-12-30 07:01:25,2,4.56,2015-12-30,2015-12-30 07:01:25
2015-12-30 07:06:25,2,4.5,2015-12-30,2015-12-30 07:06:25
2015-12-30 07:11:25,2,4.56,2015-12-30,2015-12-30 07:11:25
2015-12-30 07:16:25,2,4.78,2015-12-30,2015-12-30 07:16:25
2015-12-30 07:21:25,2,4.89,2015-12-30,2015-12-30 07:21:25


In [None]:
y_test = test_data[test_data["p_num"] == 2].reset_index()
prior = y_test[["bg-0:00"]]
predictions = chronos_forecaster.predict(fh=fh, y=prior).rename({"bg-0:00": "y_pred"}, axis=1)


We recommend keeping prediction length <= 64. The quality of longer predictions may degrade since the model is not optimized for it. 



In [35]:
predictions.reset_index()

Unnamed: 0,index,y_pred
0,5616,4.746104
1,5617,4.788670
2,5618,4.767387
3,5619,4.788670
4,5620,4.746104
...,...,...
67,5683,4.672828
68,5684,4.693323
69,5685,4.672828
70,5686,4.693323


In [52]:
y_train[-1000:]

Unnamed: 0_level_0,Unnamed: 1_level_0,bg-0:00
p_num,count_index,Unnamed: 2_level_1
281,0,10.56
281,1,10.67
281,2,10.83
281,3,11.17
281,4,11.94
281,...,...
281,995,9.50
281,996,9.44
281,997,9.44
281,998,9.61


In [58]:
import plotly.express as px
combined_df = pd.concat([y_train[-1000:], predictions], axis=1).reset_index()[-1000:]
# combined_df
fig = px.line(combined_df, x="count_index", y=["bg-0:00", "y_pred"],
            #   hover_data={"date": "|%B %d, %Y"},
              title='bg over index')
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()

In [72]:
y_test = test_data[test_data["p_num"] == 2].reset_index()
prior = y_test[["bg-0:00"]]
new_baby = ChronosForecaster("amazon/chronos-t5-tiny")
predictions = new_baby.predict(fh=fh, y=y_train[-700:-300]).rename({"bg-0:00": "y_pred"}, axis=1)


We recommend keeping prediction length <= 64. The quality of longer predictions may degrade since the model is not optimized for it. 



In [73]:
padded_predictions = pd.DataFrame({"y_pred": [None] * 400})
padded_preds = pd.concat([padded_predictions, predictions], axis=0).reset_index()
padded_preds


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Unnamed: 0,index,y_pred
0,0,
1,1,
2,2,
3,3,
4,4,
...,...,...
467,"(281, 767)",10.840604
468,"(281, 768)",10.796892
469,"(281, 769)",10.753180
470,"(281, 770)",10.709467


In [77]:
combined_df

Unnamed: 0,level_0,p_num,count_index,bg-0:00,index,y_pred
0,0,281.0,300.0,16.94,0,
1,1,281.0,301.0,17.39,1,
2,2,281.0,302.0,17.78,2,
3,3,281.0,303.0,18.17,3,
4,4,281.0,304.0,18.50,4,
...,...,...,...,...,...,...
467,467,,,,"(281, 767)",10.840604
468,468,,,,"(281, 768)",10.796892
469,469,,,,"(281, 769)",10.753180
470,470,,,,"(281, 770)",10.709467


In [78]:
import plotly.express as px
combined_df = pd.concat([y_train[-700:-228].reset_index(), padded_preds], axis=1).reset_index()
# combined_df
fig = px.line(combined_df, x="count_index", y=["bg-0:00", "y_pred"],
            #   hover_data={"date": "|%B %d, %Y"},
              title='bg over index')
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()

# Visualizing y_train

# Benchmarking on BrisT1D

In [1]:
import sktime

sktime.__version__

'0.38.4'

In [2]:
import transformers

transformers.__version__

'4.54.0'