In [1]:
from src.data.data_loader import load_data
from src.data.data_transforms import create_time_diff_cols
from sktime.transformations.series.impute import Imputer

# Load the dataset (cached copy) and pass it straight through
# create_time_diff_cols; the displayed frame below then carries a `time_diff` column.
df = create_time_diff_cols(load_data(use_cached=True))

### Use patient 1

In [2]:
# Keep only the rows whose participant id is "p01".
patient_1 = df.loc[df["p_num"] == "p01"]
# Remember the un-trimmed length and echo it once.
full_len = len(patient_1)
print(full_len)
# Display full dataframe
patient_1

8459


Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00,cob,carb_availability,insulin_availability,iob,time_diff
0,p01_0,p01,06:10:00,15.1,0.0417,,,,,,0.0,0.0,0.000000,0.000000,NaT
1,p01_1,p01,06:25:00,14.4,0.0417,,,,,,0.0,0.0,0.000646,0.041261,0 days 00:15:00
2,p01_2,p01,06:40:00,13.9,0.0417,,,,,,0.0,0.0,0.002091,0.080352,0 days 00:15:00
3,p01_3,p01,06:55:00,13.8,0.0417,,,,,,0.0,0.0,0.003964,0.115980,0 days 00:15:00
4,p01_4,p01,07:10:00,13.4,0.0417,,,,,,0.0,0.0,0.005981,0.147553,0 days 00:15:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8454,p01_8454,p01,22:50:00,6.2,0.0333,,76.8,70.0,5.00,,0.0,0.0,0.012917,0.214648,0 days 00:15:00
8455,p01_8455,p01,23:05:00,5.8,0.0167,,72.2,13.0,5.00,,0.0,0.0,0.012917,0.214648,0 days 00:15:00
8456,p01_8456,p01,23:20:00,5.4,0.0167,,73.4,18.0,5.10,,0.0,0.0,0.012659,0.198222,0 days 00:15:00
8457,p01_8457,p01,23:35:00,4.8,0.0167,,64.7,0.0,7.89,,0.0,0.0,0.012084,0.182661,0 days 00:15:00


### Reduce the size of the df

In [12]:
# We only tune the hyperparams on the first `row_to_load` timepoints of patient 1,
# since we're just testing. The full patient 1 series has 8459 rows.
row_to_load = 1000
cols = ["bg-0:00", "iob", "cob"]

# Slice `patient_1`, not the multi-patient `df`: slicing `df` only yields
# patient 1 rows as long as that patient happens to come first and has at
# least `row_to_load` rows.
patient_1_trimmed = patient_1.iloc[:row_to_load][cols]
patient_1_trimmed

Unnamed: 0,bg-0:00,iob,cob
0,15.1,0.000000,0.0
1,14.4,0.041261,0.0
2,13.9,0.080352,0.0
3,13.8,0.115980,0.0
4,13.4,0.147553,0.0
...,...,...,...
995,7.2,3.231677,0.0
996,6.7,2.862434,0.0
997,6.2,2.485640,0.0
998,5.1,2.126249,0.0


### Handle missing values 

In [13]:
from sktime.benchmarking.forecasting import ForecastingBenchmark
from sktime.split import ExpandingWindowSplitter
from sktime.performance_metrics.forecasting import MeanSquaredError


def load_diabetes():
    """Dataset loader handed to the forecasting benchmark.

    Works on a copy of ``patient_1_trimmed`` and fills gaps in the
    ``bg-0:00`` column with an ``Imputer(method="linear")``.

    Returns:
        A ``(y, X)`` tuple of DataFrames: ``y`` holds the imputed ``bg-0:00``
        column, ``X`` holds the ``iob`` and ``cob`` columns (left untouched).
    """
    target_col = "bg-0:00"
    # Copy first so the notebook-level frame is never modified by the loader.
    frame = patient_1_trimmed.copy()
    imputer = Imputer(method="linear")
    frame[target_col] = imputer.fit_transform(frame[[target_col]])
    target = frame[[target_col]]
    exogenous = frame[["iob", "cob"]]
    return (target, exogenous)


# Materialise the data once for interactive inspection in the notebook.
y_diabetes, X_diabetes = load_diabetes()

### Benchmark
With our current setup for testing, our dataset is `patient_1_trimmed`, which only has 1000 rows (`row_to_load`) with 2 X features: iob and cob.

In [14]:
import numpy as np
from src.tuning.benchmark import generate_estimators_from_param_grid
from datetime import datetime
import os

benchmark = ForecastingBenchmark()

# Forecast 12 * 6 = 72 steps ahead of each cutoff.
# The horizon must start at 1: a relative horizon of 0 refers to the cutoff
# itself (the last training point), which puts a training observation into
# every test window and is rejected by forecasters that cannot predict
# in-sample.
n_forecast_steps = 12 * 6
cv_splitter = ExpandingWindowSplitter(
    initial_window=360,
    step_length=72,
    fh=np.arange(1, n_forecast_steps + 1),
)

# PinballLoss is giving us some issues, so only RMSE is scored for now
scorers = [MeanSquaredError(square_root=True)]

# error_score="raise" surfaces estimator failures instead of recording a score for them.
benchmark.add_task(
    dataset_loader=load_diabetes,
    cv_splitter=cv_splitter,
    scorers=scorers,
    error_score="raise",
)

## Change your yaml path here

In [19]:
import pandas as pd

## Change your yaml path here
yaml_path = "../../src/tuning/configs/3_structural_15min.yaml"

# Register every (estimator, estimator_id) pair generated from the yaml config.
estimators = generate_estimators_from_param_grid(yaml_path)
for estimator, estimator_id in estimators:
    benchmark.add_estimator(estimator=estimator, estimator_id=estimator_id)

# Needs to add the datetime to make the file name unique
# so when you run benchmark.run, it doesn't see the file as cached result and not rerun
current_time = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
# Config file name without directory or extension (works for any separator/suffix).
yaml_name = os.path.splitext(os.path.basename(yaml_path))[0]

# NOTE(review): `timestamp` is not used in this cell; kept in case a later cell reads it.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"./results/param_tests/{current_time}_results_{yaml_name}.csv"
# Make sure the output directory exists so a fresh checkout can write results.
os.makedirs(os.path.dirname(results_file), exist_ok=True)
benchmark.run(results_file)

# If there is a file created, then it is probably good enough
if not os.path.exists(results_file):
    raise FileNotFoundError(f"Results file {results_file} was not created")

Training 1 ARDL models with different parameters
