In [213]:
from src.data.data_loader import load_data
from src.data.data_transforms import create_time_diff_cols
from sktime.transformations.series.impute import Imputer

df = load_data(use_cached=True)
df = create_time_diff_cols(df)

### Use patient 1

In [None]:
patient_1 = df[df["p_num"] == "p02"]
print(len(patient_1))
full_len = len(patient_1)
# Display full dataframe
patient_1

25872
Longest streaks of constant values:
bg-0:00: 24
insulin-0:00: 68
carbs-0:00: 1
hr-0:00: 3
steps-0:00: 1
cals-0:00: 1
time_diff: 286
cob: 442
carb_availability: 442
insulin_availability: 34
iob: 34


Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00,time_diff,cob,carb_availability,insulin_availability,iob
8459,p02_0,p02,06:05:00,6.7,0.0510,,,,,,NaT,0.000000,0.000000,0.000000,0.000000
8460,p02_1,p02,06:10:00,6.5,0.0567,,,,,,0 days 00:05:00,0.000000,0.000000,0.000090,0.050991
8461,p02_2,p02,06:15:00,6.5,0.0583,,,,,,0 days 00:05:00,0.000000,0.000000,0.000506,0.107539
8462,p02_3,p02,06:20:00,6.5,0.0573,,,,,,0 days 00:05:00,0.000000,0.000000,0.001345,0.165285
8463,p02_4,p02,06:25:00,6.5,0.0580,,,,,,0 days 00:05:00,0.000000,0.000000,0.002607,0.221334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34326,p02_25867,p02,23:35:00,11.7,0.1554,,,,,,0 days 00:05:00,11.151410,8.398847,0.347359,4.594672
34327,p02_25868,p02,23:40:00,12.4,0.1847,,,,,,0 days 00:05:00,10.136391,7.711120,0.336521,4.498041
34328,p02_25869,p02,23:45:00,12.8,0.2652,,,,,,0 days 00:05:00,9.205092,7.068196,0.326310,4.438032
34329,p02_25870,p02,23:50:00,12.5,0.2609,,,,,,0 days 00:05:00,8.351953,6.469143,0.317093,4.465326


### Reduce the size of the df

In [226]:
# We'll just tune the hyperparams on the first 100 timepoints for patient 1 since we're just testing
# full is 8459
row_to_load = 200
cols = ["bg-0:00", "iob", "cob"]

patient_1_trimmed = df.iloc[:row_to_load][cols]
patient_1_trimmed

Unnamed: 0,bg-0:00,iob,cob
0,15.1,0.000000,0.0
1,14.4,0.041261,0.0
2,13.9,0.080352,0.0
3,13.8,0.115980,0.0
4,13.4,0.147553,0.0
...,...,...,...
195,,0.353248,0.0
196,,0.358908,0.0
197,,0.364168,0.0
198,8.6,0.366116,0.0


In [231]:
# Calculate longest streak of same values for each column
streaks = {}
for col in patient_1.columns:
    # Skip non-numeric columns
    if patient_1[col].dtype in ["object", "datetime64[ns]"]:
        continue

    # Get boolean mask of where values are equal to previous value
    is_same = patient_1[col] == patient_1[col].shift()

    # Count consecutive True values
    streak_lengths = []
    current_streak = 0
    for same in is_same:
        if same:
            current_streak += 1
        else:
            if current_streak > 0:
                streak_lengths.append(current_streak)
            current_streak = 0
    if current_streak > 0:
        streak_lengths.append(current_streak)

    # Store max streak length
    streaks[col] = max(streak_lengths) if streak_lengths else 0

print("Longest streaks of constant values:")
for col, streak in streaks.items():
    print(f"{col}: {streak}")

Longest streaks of constant values:
bg-0:00: 24
insulin-0:00: 68
carbs-0:00: 1
hr-0:00: 3
steps-0:00: 1
cals-0:00: 1
time_diff: 286
cob: 442
carb_availability: 442
insulin_availability: 34
iob: 34


### Handle missing values 

In [227]:
from sktime.benchmarking.forecasting import ForecastingBenchmark
from sktime.split import ExpandingWindowSplitter
from sktime.performance_metrics.forecasting import MeanSquaredError


def load_diabetes():
    transform = Imputer(method="linear")
    y = patient_1_trimmed.copy()
    y["bg-0:00"] = transform.fit_transform(y[["bg-0:00"]])
    return (y[["bg-0:00"]], y[["iob"]])


y_diabetes, X_diabetes = load_diabetes()

### Benchmark

In [229]:
from sktime.forecasting.arima import ARIMA

benchmark = ForecastingBenchmark()
cv_splitter = ExpandingWindowSplitter(
    initial_window=3,
    step_length=1,
    fh=1,
)
scorers = [MeanSquaredError(square_root=True)]

benchmark.add_task(
    dataset_loader=load_diabetes,
    cv_splitter=cv_splitter,
    scorers=scorers,
    error_score="raise",
)


estimator = ARIMA()
estimator_id = estimator.__class__.__name__


benchmark.add_estimator(
    estimator=estimator,
    estimator_id=estimator_id,
)

benchmark.run(f"./{estimator_id}_results.csv")



Unnamed: 0,validation_id,model_id,runtime_secs,MeanSquaredError_fold_0_test,MeanSquaredError_fold_1_test,MeanSquaredError_fold_2_test,MeanSquaredError_fold_3_test,MeanSquaredError_fold_4_test,MeanSquaredError_fold_5_test,MeanSquaredError_fold_6_test,...,MeanSquaredError_fold_189_test,MeanSquaredError_fold_190_test,MeanSquaredError_fold_191_test,MeanSquaredError_fold_192_test,MeanSquaredError_fold_193_test,MeanSquaredError_fold_194_test,MeanSquaredError_fold_195_test,MeanSquaredError_fold_196_test,MeanSquaredError_mean,MeanSquaredError_std
0,[dataset=load_diabetes]_[cv_splitter=Expanding...,ARIMA,11.381214,16.43331,0.030131,0.280194,3.000205,0.331808,1.994155,0.959163,...,0.256663,0.66998,0.317997,0.335558,0.350019,0.364351,0.377911,0.527987,1.452037,7.13087
