In [10]:
from src.data.data_loader import load_data, get_train_validation_split
from src.tuning.benchmark import impute_missing_values

df = load_data(use_cached=True)
df, _ = get_train_validation_split(df)

### Use patient 1

In [None]:
patient_1 = df[df["p_num"] == "p01"]

### Reduce the size of the df

In [80]:
# We'll just tune the hyperparams on the first 100 timepoints for patient 1 since we're just testing
# full is 8459
row_to_load = -1
y_feature = ["bg-0:00"]
x_features = [
    "hr-0:00",
    "steps-0:00",
    "cals-0:00",
    "cob",
    "carb_availability",
    "insulin_availability",
    "iob",
]
p_df = df.iloc[:row_to_load][x_features + y_feature]

# Impute with default methods
p_df = impute_missing_values(p_df, columns=x_features)
p_df = impute_missing_values(p_df, columns=y_feature)

### Handle missing values 

In [81]:
from sktime.benchmarking.forecasting import ForecastingBenchmark
from sktime.split import ExpandingWindowSplitter
from sktime.performance_metrics.forecasting import MeanSquaredError


def load_diabetes():
    return (p_df[y_feature], p_df[x_features])


y_diabetes, X_diabetes = load_diabetes()
print(len(X_diabetes))

177023


### Benchmark
With out current setup for testing, our dataset is `patient_1_trimmed`, which only has 200 rows with 2 X features: iob and cob.

In [82]:
import numpy as np
from src.tuning.benchmark import generate_estimators_from_param_grid
from datetime import datetime

benchmark = ForecastingBenchmark(
    backend="loky",  # Use parallel processing
    backend_params={"n_jobs": -1},  # Use all available CPU cores
)

# for 5-min interval patietns use 12 * 24 * 3
# for 15-min interval patients use 4 * 24 * 3
cv_splitter = ExpandingWindowSplitter(
    initial_window=4 * 24 * 3,
    step_length=4 * 24 * 3,
    fh=np.arange(1, 12 * 6 + 1),
)

# PinballLoss is giving us some issues
# TODO: Figure out why
scorers = [MeanSquaredError(square_root=True)]

benchmark.add_task(
    dataset_loader=load_diabetes,
    cv_splitter=cv_splitter,
    scorers=scorers,
    error_score="raise",
)

## Change your yaml path here

In [83]:
import pandas as pd
import os

## Change your yaml path here
yaml_path = "../../src/tuning/configs/1_exponential_smooth_15min.yaml"

estimators = generate_estimators_from_param_grid(yaml_path)
for estimator, estimator_id in estimators:
    benchmark.add_estimator(estimator=estimator, estimator_id=estimator_id)

# Needs to add the datetime to make the file name unique
# so when you run benchmark.run, it doesn't see the file as cached result and not rerun
current_time = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
yaml_name = yaml_path.split("/")[-1].replace(".yaml", "")

os.makedirs("./results/param_tests", exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"./results/param_tests/{current_time}_results_{yaml_name}.csv"
benchmark.run(results_file)

# If there is a file created, then it is probably good enough
if not os.path.exists(results_file):
    raise FileNotFoundError(f"Results file {results_file} was not created")
else:
    print(f"Passed: Results file {results_file} was created")

Training 1 StatsForecastAutoCES models with different parameters


Exception: no model able to be fitted