In [1]:
from src.data.data_loader import load_data
from src.data.data_cleaner import clean_data
import pandas as pd

In [2]:
df = clean_data(data=load_data(), data_source_name="kaggle_brisT1D")

print(df.head())

  return pd.read_csv(file_path, usecols=keep_columns)


      id p_num      time  bg-0:00  insulin-0:00  carbs-0:00    hr-0:00  \
0  p01_0   p01  06:10:00     15.1        0.0417    48.01897  79.335216   
1  p01_1   p01  06:25:00     14.4        0.0417    48.01897  79.335216   
2  p01_2   p01  06:40:00     13.9        0.0417    48.01897  79.335216   
3  p01_3   p01  06:55:00     13.8        0.0417    48.01897  79.335216   
4  p01_4   p01  07:10:00     13.4        0.0417    48.01897  79.335216   

   steps-0:00  cals-0:00  
0   53.052685    9.36896  
1   53.052685    9.36896  
2   53.052685    9.36896  
3   53.052685    9.36896  
4   53.052685    9.36896  




In [3]:
TARGET_COL = [col for col in df.columns if col.startswith("bg-")][0]
TIME_COL = [col for col in df.columns if col.startswith("time")][0]

In [4]:
# endogenous variables are the ones we predict; they are influenced by past values of themselves.
# exogenous variables are external factors that impact the prediction but are not predicted.
# in our case, "bg" is the target (endogenous), while the other metrics are exogenous variables.

EXOGENOUS_PREFIXES = ["activity", "cals", "insulin", "steps", "carbs", "hr"]
EXOGENOUS_COLS = [
    col
    for col in df.columns
    if any([col.startswith(prefix) for prefix in EXOGENOUS_PREFIXES])
]
print(EXOGENOUS_COLS)
print(df[EXOGENOUS_COLS].dtypes)

['insulin-0:00', 'carbs-0:00', 'hr-0:00', 'steps-0:00', 'cals-0:00']
insulin-0:00    float64
carbs-0:00      float64
hr-0:00         float64
steps-0:00      float64
cals-0:00       float64
dtype: object


In [5]:
df = df.dropna(
    subset=[TARGET_COL]
)  # might not be necessary given clean_data() but just in case

# NOT SURE ABOUT THIS IMPUTING
df[EXOGENOUS_COLS] = df[EXOGENOUS_COLS].fillna(0.0)  # fill missing values with 0

In [6]:
from typing import List, Tuple

five_minute_patients: List[Tuple[str, pd.DataFrame]] = []
fifteen_minute_patients: List[Tuple[str, pd.DataFrame]] = []

df[TIME_COL] = pd.to_timedelta(df[TIME_COL])

patient_dfs = df.groupby("p_num")

for p_num, patient_df in patient_dfs:
    time_difference = patient_df.iloc[1][TIME_COL] - patient_df.iloc[0][TIME_COL]

    if time_difference == pd.Timedelta(minutes=5):
        five_minute_patients.append((p_num, patient_df))
    elif time_difference == pd.Timedelta(minutes=15):
        fifteen_minute_patients.append((p_num, patient_df))

In [7]:
# - The "time" column in the data contains only hours and minutes, with no date information.
# - Since the dataset spans multiple days, identical times (e.g., "06:10") can appear on different days.
# - This prevents us from using time directly as an index because it would create duplicate entries.
# - Instead, we use the row number as the index to maintain uniqueness.
# - If we are sure that the data is contiguous and sequential, we could construct a custom time index
#   that accounts for the multi-day nature of the dataset.

p1_num, p1_df = five_minute_patients[0]
p1_df = p1_df.reset_index(drop=True)
p1_df = p1_df.set_index(p1_df.index)

In [8]:
from sktime.forecasting.naive import NaiveForecaster
from sktime.split import ExpandingSlidingWindowSplitter
from sktime.forecasting.model_selection import ForecastingGridSearchCV
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.transformations.series.impute import Imputer
import numpy as np

y = p1_df[TARGET_COL]
X = p1_df[EXOGENOUS_COLS]

pipe = TransformedTargetForecaster(
    steps=[("imputer", Imputer()), ("forecaster", NaiveForecaster())]
)

cv = ExpandingSlidingWindowSplitter(
    fh=np.arange(96),  # forecasting horizon of 96 indices i.e. 5 * 96 mins = 8 hours
    step_length=12,  # shift forward by 12 indices (1 hour) when sliding (after max window size is reached)
    initial_window=12,  # start with a train window of size 12 (1 hour)
    max_expanding_window_length=96,  # maximum window size of 96 (8 hours)
)

gscv = ForecastingGridSearchCV(
    forecaster=pipe,
    refit=True,  # refit the best model on the whole data
    param_grid=[
        {
            "forecaster": [NaiveForecaster(sp=12)],
            "forecaster__strategy": ["drift", "last"],
        }
    ],
    cv=cv,
)
gscv.fit(y=y, X=X)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [9]:
print(gscv.best_score_)
print(gscv.best_params_)
print(gscv.best_forecaster_)

# use forecasting methods on best forecaster
# since refit=True in ForecastingGridSearchCV, we can use predict directly
# best model is already refitted on the whole data

best_forecaster: TransformedTargetForecaster = gscv.best_forecaster_
res_predict = best_forecaster.predict(X=y, fh=np.arange(96))
res_predict_interval = best_forecaster.predict_interval(fh=np.arange(96))
print(res_predict, res_predict_interval)

gscv.cv_results_

0.33073468206443507
{'forecaster': NaiveForecaster(sp=12), 'forecaster__strategy': 'last'}
TransformedTargetForecaster(steps=[('imputer', Imputer()),
                                   ('forecaster', NaiveForecaster(sp=12))])
       bg-0:00
25871     10.5
25872     10.9
25873     10.8
25874     10.8
25875     11.0
...        ...
25962     11.6
25963     11.7
25964     12.4
25965     12.8
25966     12.5

[96 rows x 1 columns]         bg-0:00           
            0.9           
          lower      upper
25871  6.403964  14.596036
25872  6.803964  14.996036
25873  6.703964  14.896036
25874  6.703964  14.896036
25875  6.903964  15.096036
...         ...        ...
25962  0.014661  23.185339
25963  0.114661  23.285339
25964  0.814661  23.985339
25965  1.214661  24.385339
25966  0.914661  24.085339

[96 rows x 2 columns]


Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
0,0.436888,0.012089,0.014346,"{'forecaster': NaiveForecaster(sp=12, strategy...",2.0
1,0.330735,0.011847,0.019488,"{'forecaster': NaiveForecaster(sp=12), 'foreca...",1.0
