# Forecasting Optuna Search CV 
## Modulus Set 3

**Notebook Goal**
- A modeling pipeline that optimizes the hyperparameters of the sktime forecasters that have the [capability:pred_int tag](https://www.sktime.net/en/stable/examples/01b_forecasting_proba.html)
- This notebook will focus on the forecasters where `i mod 4 = 3`, where `i` is the index of the forecaster in the registry table in the above link.
- The work will be based on this documentation: [ForecastingOptunaSearchCV](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_selection.ForecastingOptunaSearchCV.html)

In [1]:
from sktime.registry import all_estimators
from src.data.data_loader import load_data
from src.data.data_cleaner import clean_data, perform_train_test_split

In [2]:
# Load the autoreload extension
%load_ext autoreload
# Reload all modules automatically before executing code
%autoreload 2


In [3]:
# Load the dataset through the project loader (called with its default arguments).
df = load_data()
# Get the list of forecasters with prediction interval capability.
# The result is a DataFrame; its `name` and `object` columns are used further
# down to look models up by name and to instantiate them.
models = all_estimators(
    "forecaster", filter_tags={"capability:pred_int": True}, as_dataframe=True
)

In [4]:
df = clean_data(df) 
print(df.columns)

Index(['id', 'p_num', 'time', 'bg-0:00', 'insulin-0:00', 'carbs-0:00',
       'hr-0:00', 'steps-0:00', 'cals-0:00', 'activity-0:00'],
      dtype='object')


In [5]:
# Keep every 4th registry row starting at position 3, i.e. the rows whose
# position i satisfies i mod 4 == 3 (this notebook's share of the registry).
filtered_models = models.iloc[3::4]


## Examples of each patient's data:

### Patient 1

In [6]:
df[df['p_num']=='p01'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
0,p01_0,p01,06:10:00,15.1,0.0417,,,,,
1,p01_1,p01,06:25:00,14.4,0.0417,,,,,
2,p01_2,p01,06:40:00,13.9,0.0417,,,,,
3,p01_3,p01,06:55:00,13.8,0.0417,,,,,
4,p01_4,p01,07:10:00,13.4,0.0417,,,,,


In [7]:
df[df['p_num']=='p01'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,7716.0,8459.0,149.0,8017.0,5895.0,8384.0
mean,8.917146,0.130766,24.469799,77.285007,76.119932,9.029529
std,4.130287,0.695182,13.245885,16.679275,130.683944,6.572246
min,2.2,0.0,4.0,46.6,0.0,4.8
25%,5.6,0.0292,15.0,62.6,0.0,4.8
50%,8.2,0.0417,20.0,76.0,18.0,5.495
75%,11.6,0.0583,30.0,88.1,89.0,10.98
max,27.8,11.7417,80.0,172.4,741.0,53.0


### Patient 2

In [8]:
df[df['p_num']=='p02'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
8459,p02_0,p02,06:05:00,6.7,0.051,,,,,
8460,p02_1,p02,06:10:00,6.5,0.0567,,,,,
8461,p02_2,p02,06:15:00,6.5,0.0583,,,,,
8462,p02_3,p02,06:20:00,6.5,0.0573,,,,,
8463,p02_4,p02,06:25:00,6.5,0.058,,,,,


In [9]:
df[df['p_num']=='p02'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,25757.0,25872.0,329.0,5594.0,4155.0,7494.0
mean,9.336584,0.180665,53.337386,83.364587,132.211793,4.684996
std,2.928298,0.690833,42.650078,17.968259,178.699497,6.94197
min,2.2,0.0,10.0,40.0,1.0,0.03
25%,7.2,0.05,32.0,70.0,31.0,1.66
50%,8.8,0.08025,50.0,80.0,75.0,3.0
75%,11.0,0.1,60.0,93.1,154.0,5.4
max,22.2,18.0833,660.0,185.3,1359.0,116.1


### Patient 3

In [10]:
df[df['p_num']=='p03'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
34331,p03_0,p03,06:05:00,7.6,0.0833,,,,,
34332,p03_1,p03,06:10:00,7.3,0.11,,,,,
34333,p03_2,p03,06:15:00,7.2,0.1252,,,,,
34334,p03_3,p03,06:20:00,7.2,0.1188,,,,,
34335,p03_4,p03,06:25:00,7.1,0.1167,,,,,


In [11]:
df[df['p_num']=='p03'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,25930.0,26028.0,306.0,23329.0,15188.0,24649.0
mean,8.588785,0.270292,82.163399,75.818029,21.644785,8.422431
std,3.144037,1.19446,64.072291,16.81088,65.700752,7.020719
min,2.2,0.0,20.0,47.2,0.0,1.12
25%,6.3,0.0833,60.0,63.0,0.0,5.6
50%,7.9,0.1322,80.0,73.6,0.0,5.84
75%,10.4,0.1917,90.0,83.9,11.0,7.3
max,22.2,46.311,852.0,156.8,582.0,65.85


### Patient 4

In [12]:
df[df['p_num']=='p04'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
60359,p04_0,p04,06:05:00,6.7,0.0706,,,,,
60360,p04_1,p04,06:10:00,6.6,0.0901,,,,,
60361,p04_2,p04,06:15:00,6.2,0.0756,,,,,
60362,p04_3,p04,06:20:00,6.0,0.0482,,,,,
60363,p04_4,p04,06:25:00,5.7,0.0205,,,,,


In [13]:
df[df['p_num']=='p04'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,24532.0,24686.0,462.0,12535.0,9210.0,21898.0
mean,7.765555,0.151833,29.181818,82.841245,46.061889,6.455725
std,2.246324,0.685633,25.883048,16.680994,94.340476,4.235032
min,2.2,0.0,6.0,50.7,0.0,0.91
25%,6.2,0.0455,15.0,70.5,0.0,4.55
50%,7.4,0.0625,24.0,82.0,12.0,4.55
75%,9.0,0.0708,35.0,92.4,46.0,6.18
max,18.4,42.78,332.0,164.4,673.0,42.51


### Patient 5

In [14]:
df[df['p_num']=='p05'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
85045,p05_0,p05,06:05:00,3.6,0.0729,,,,,
85046,p05_1,p05,06:20:00,3.7,0.0729,,,,,
85047,p05_2,p05,06:35:00,3.8,0.0729,,,,,
85048,p05_3,p05,06:50:00,3.9,0.0729,,,,,
85049,p05_4,p05,07:05:00,4.0,0.0729,,,,,


In [15]:
df[df['p_num']=='p05'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,7591.0,8288.0,141.0,6911.0,3489.0,6365.0
mean,8.142485,0.152484,25.765957,74.409044,38.283749,6.855614
std,3.128485,0.568694,13.195583,15.284031,84.71088,4.479553
min,2.2,0.0,2.0,43.7,0.0,4.7
25%,5.7,0.0667,16.0,62.9,0.0,4.75
50%,7.8,0.0729,25.0,74.6,6.0,4.85
75%,10.1,0.0792,35.0,84.2,38.0,6.67
max,22.2,8.1542,85.0,153.7,660.0,45.38


### Patient 6

In [16]:
df[df['p_num']=='p06'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
93333,p06_0,p06,10:25:00,11.4,0.0833,,,,,
93334,p06_1,p06,10:40:00,11.3,0.0833,,,,,
93335,p06_2,p06,10:55:00,10.9,0.0833,,,,,
93336,p06_3,p06,11:10:00,10.4,0.0833,,,,,
93337,p06_4,p06,11:25:00,10.4,0.0833,,,,,


In [17]:
df[df['p_num']=='p06'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,7845.0,8383.0,80.0,6746.0,5325.0,7984.0
mean,8.953499,0.226139,58.075,72.878106,62.233615,11.470765
std,3.806012,1.020254,24.132427,17.816539,114.96793,8.611394
min,2.9,0.0,7.0,39.0,0.0,6.15
25%,6.2,0.0833,40.0,57.4,0.0,6.15
50%,8.1,0.0833,54.5,72.4,8.0,7.52
75%,10.9,0.0875,76.0,85.1,67.0,13.19
max,27.8,14.1833,130.0,152.8,627.0,68.03


### Patient 10

In [18]:
df[df['p_num']=='p10'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
101716,p10_0,p10,06:05:00,5.1,0.0,,,,,
101717,p10_1,p10,06:10:00,5.4,0.0167,,,,,
101718,p10_2,p10,06:15:00,5.2,0.0696,,,,,
101719,p10_3,p10,06:20:00,5.1,0.0614,,,,,
101720,p10_4,p10,06:25:00,4.9,0.0262,,,,,


In [19]:
df[df['p_num']=='p10'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,25324.0,25454.0,356.0,23576.0,13595.0,19260.0
mean,6.373914,0.096138,51.106742,78.180493,60.403384,10.890315
std,1.579262,0.385269,27.14551,17.951637,128.808696,8.570162
min,2.2,0.0,10.0,47.4,0.0,1.41
25%,5.3,0.0147,30.0,62.9,0.0,5.85
50%,6.0,0.05545,50.0,77.5,8.0,7.28
75%,7.2,0.073075,65.0,88.5,50.0,11.99
max,15.9,9.0833,170.0,184.2,783.0,71.77


### Patient 11

Approx 2038 hours of data (24466*5min)/60min, -> 85 days.
- 5 minute interval

In [20]:
df[df['p_num']=='p11'].head()

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
127170,p11_0,p11,06:05:00,9.3,,,,,,
127171,p11_1,p11,06:10:00,9.2,,,,,,
127172,p11_2,p11,06:15:00,9.1,,,,,,
127173,p11_3,p11,06:20:00,9.1,,,,,,
127174,p11_4,p11,06:25:00,9.2,,,,,,


In [21]:
df[df['p_num']=='p11'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,24466.0,15180.0,188.0,15360.0,10007.0,23165.0
mean,9.376024,0.166984,38.425532,77.786335,56.05686,8.511119
std,2.882044,0.528628,17.121765,17.152982,112.22407,6.015125
min,2.2,0.0,3.0,49.7,0.0,1.19
25%,7.3,0.075,25.0,63.3,0.0,5.95
50%,9.2,0.1083,38.0,76.0,7.0,5.95
75%,11.3,0.1083,50.0,87.8,53.0,7.62
max,21.6,12.4167,95.0,164.8,660.0,66.39


### Patient 12
Approx 2097 hours of data (25167*5min)/60min, -> 87 days.
- 5 minute interval

In [22]:
df[df['p_num']=='p12'].iloc[0:12*6]

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
151725,p12_0,p12,10:25:00,4.1,0.0,,63.9,,,
151726,p12_1,p12,10:30:00,4.2,0.0,,89.3,,,
151727,p12_2,p12,10:35:00,4.3,0.0,,112.6,,,
151728,p12_3,p12,10:40:00,4.5,0.0,,105.5,,,
151729,p12_4,p12,10:45:00,4.1,0.0,,106.0,,,
...,...,...,...,...,...,...,...,...,...,...
151792,p12_67,p12,16:15:00,7.3,0.0,,118.0,,,
151793,p12_68,p12,16:20:00,7.1,0.0,,116.7,,,
151794,p12_69,p12,16:25:00,6.7,0.0,,114.3,,,
151795,p12_70,p12,16:30:00,6.4,0.0,,114.4,,,


In [23]:
df[df['p_num']=='p12'].describe()

Unnamed: 0,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00
count,25167.0,25299.0,572.0,23822.0,15076.0,22568.0
mean,7.852477,0.375746,53.353147,86.077705,49.680817,14.458452
std,2.828864,1.871322,34.212447,19.243867,70.360571,10.994446
min,2.8,-0.3078,1.0,49.7,0.0,1.76
25%,6.0,0.0,25.0,68.4,0.0,6.75
50%,7.2,0.075,40.0,84.3,18.0,8.1
75%,9.0,0.225,75.0,103.3,77.0,20.69
max,22.2,25.35,143.0,136.6,567.0,57.88


In [8]:
df_test = load_data(dataset_type="test")
df_test[df_test['p_num']=='p02'].head()
# https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_selection.ForecastingGridSearchCV.html

Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
244,p02_25872,p02,00:35:00,8.8,8.6,8.9,9.5,10.2,10.5,10.5,...,,,,,,,,,,
245,p02_25873,p02,09:50:00,9.2,9.3,8.9,8.3,7.9,7.7,7.4,...,,,,,,,,,,
246,p02_25874,p02,06:05:00,11.0,12.8,14.8,16.8,17.8,18.4,18.7,...,,,,,,,,,,
247,p02_25875,p02,03:15:00,10.9,10.2,9.5,9.2,8.9,8.5,8.1,...,,,,,,,,,,
248,p02_25876,p02,09:15:00,10.0,9.2,8.7,8.3,7.9,7.6,7.4,...,,,,,,,,,,


In [9]:
df_test["p_num"].unique()   

array(['p01', 'p02', 'p04', 'p05', 'p06', 'p10', 'p11', 'p12', 'p15',
       'p16', 'p18', 'p19', 'p21', 'p22', 'p24'], dtype=object)

In [10]:
df["p_num"].unique()   
# 6 patients for 5min model: 2,3,4,10,11,12
five_min_patients = ['p02', 'p03', 'p04', 'p10', 'p11', 'p12']

In [11]:
all_estimators("metric", as_dataframe=True)

Unnamed: 0,name,object
0,AUCalibration,<class 'sktime.performance_metrics.forecasting...
1,CRPS,<class 'sktime.performance_metrics.forecasting...
2,ConstraintViolation,<class 'sktime.performance_metrics.forecasting...
3,EmpiricalCoverage,<class 'sktime.performance_metrics.forecasting...
4,GeometricMeanAbsoluteError,<class 'sktime.performance_metrics.forecasting...
5,GeometricMeanRelativeAbsoluteError,<class 'sktime.performance_metrics.forecasting...
6,GeometricMeanRelativeSquaredError,<class 'sktime.performance_metrics.forecasting...
7,GeometricMeanSquaredError,<class 'sktime.performance_metrics.forecasting...
8,IntervalWidth,<class 'sktime.performance_metrics.forecasting...
9,LogLoss,<class 'sktime.performance_metrics.forecasting...


## 0: ForecastingGridSearchCV Example
https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_selection.ForecastingGridSearchCV.html


In [28]:
from sktime.forecasting.model_selection import ForecastingGridSearchCV
from sktime.split import ExpandingSlidingWindowSplitter
from sktime.forecasting.naive import NaiveForecaster
from sktime.performance_metrics.forecasting import MeanSquaredError
# Target series: patient p02's `bg-0:00` column with missing values dropped.
# NOTE(review): dropna() keeps the original row labels, so the index has gaps
# where readings were missing — confirm the splitter/forecaster tolerate that.
y = df[df["p_num"]== "p02"]["bg-0:00"].dropna()
# Forecast 1 to 6 steps ahead.
fh = [1,2,3,4,5,6]
# CV splitter: 12-observation initial window, advanced 12 observations per split,
# with the expanding window capped at 24*12 observations.
cv = ExpandingSlidingWindowSplitter(fh=fh, initial_window=12, step_length=12, max_expanding_window_length=24*12)
forecaster = NaiveForecaster()
# Single-parameter grid: compare the three naive strategies.
param_grid = {"strategy" : ["last", "mean", "drift"]}
# Candidates are scored with MeanSquaredError(square_root=True), i.e. RMSE.
gscv = ForecastingGridSearchCV(
    forecaster=forecaster,
    param_grid=param_grid,
    cv=cv,
    scoring=MeanSquaredError(square_root=True),
    )
gscv.fit(y)
y_pred = gscv.predict(fh)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [29]:
gscv.cv_results_

Unnamed: 0,mean_test_MeanSquaredError,mean_fit_time,mean_pred_time,params,rank_test_MeanSquaredError
0,0.840197,0.000974,0.004139,{'strategy': 'last'},1.0
1,2.3333,0.000982,0.001197,{'strategy': 'mean'},3.0
2,0.847771,0.001006,0.001418,{'strategy': 'drift'},2.0


In [30]:
from sktime.datasets import load_shampoo_sales
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.naive import NaiveForecaster
from sktime.split import ExpandingWindowSplitter
from sktime.forecasting.model_selection import ForecastingGridSearchCV
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.forecasting.theta import ThetaForecaster
from sktime.transformations.series.impute import Imputer
from sktime.performance_metrics.forecasting import MeanSquaredError

# Same target as the previous example: patient p02's `bg-0:00` without missing values.
y = df[df["p_num"]== "p02"]["bg-0:00"].dropna()
# Two-step pipeline; the grid below swaps the "forecaster" step for other models.
pipe = TransformedTargetForecaster(steps=[
    ("imputer", Imputer()),
    ("forecaster", NaiveForecaster())])
cv = ExpandingWindowSplitter(
    initial_window=24,
    step_length=12,
    fh=[1,2,3])
# param_grid is a list of three independent grids, one per forecaster family:
# each replaces the "forecaster" step and tunes that step's (and the imputer's) parameters.
# NOTE(review): the output recorded for this cell shows the fit failing for
# ThetaForecaster(sp=12) and the cell ending in a TypeError — the results of this
# cell should not be relied on until that is resolved.
gscv = ForecastingGridSearchCV(
    forecaster=pipe,
    param_grid=[{
        "forecaster": [NaiveForecaster(sp=12)],
        "forecaster__strategy": ["drift", "last", "mean"],
    },
    {
        "imputer__method": ["mean", "drift"],
        "forecaster": [ThetaForecaster(sp=12)],
    },
    {
        "imputer__method": ["mean", "median"],
        "forecaster": [ExponentialSmoothing(sp=12)],
        "forecaster__trend": ["add", "mul"],
    },
    ],
    cv=cv,
    scoring=MeanSquaredError(square_root=True)
)  
gscv.fit(y)  
y_pred = gscv.predict(fh=[1,2,3])  

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
                In evaluate, fitting of forecaster TransformedTargetForecaster failed,
                you can set error_score='raise' in evaluate to see
                the exception message.
                Fit failed for the 61-th data split, on training data y_train with
                cutoff <NA>, and len(y_train)=756.
                The score will be set to nan.
                Failed forecaster with parameters: TransformedTargetForecaster(steps=[('imputer', Imputer()),
                                   ('forecaster', ThetaForecaster(sp=12))]).
                
  ret = [fun(x, meta=me

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NAType'

In [12]:
import pandas as pd
example_prediction = df_test[df_test['id'] == 'p02_25874']


In [33]:
example_prediction

Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
246,p02_25874,p02,06:05:00,11.0,12.8,14.8,16.8,17.8,18.4,18.7,...,,,,,,,,,,


In [14]:
# Hour prefixes of the blood-glucose columns: 'bg-0:' ... 'bg-5:'.
bg_value_vars = ['bg-'+str(hour)+":" for hour in range(0, 6, 1)]
# One column-name prefix per variable present in the wide test frame.
var_strs = ['bg-','insulin-', 'carbs-', 'hr-', 'steps-', 'cals-', 'activity-']
# For every prefix, the 72 column names '<prefix>H:MM' from 0:00 up to 5:55 in
# 5-minute steps. `:02d` zero-pads the minutes, and building the names in a
# comprehension keeps loop temporaries (previously `time`, `hour`, `var`, ...)
# out of the notebook's global namespace.
all_value_var_lists = [
    [f"{prefix}{hour}:{minute:02d}" for hour in range(6) for minute in range(0, 60, 5)]
    for prefix in var_strs
]

all_value_var_lists[1]

['insulin-0:00',
 'insulin-0:05',
 'insulin-0:10',
 'insulin-0:15',
 'insulin-0:20',
 'insulin-0:25',
 'insulin-0:30',
 'insulin-0:35',
 'insulin-0:40',
 'insulin-0:45',
 'insulin-0:50',
 'insulin-0:55',
 'insulin-1:00',
 'insulin-1:05',
 'insulin-1:10',
 'insulin-1:15',
 'insulin-1:20',
 'insulin-1:25',
 'insulin-1:30',
 'insulin-1:35',
 'insulin-1:40',
 'insulin-1:45',
 'insulin-1:50',
 'insulin-1:55',
 'insulin-2:00',
 'insulin-2:05',
 'insulin-2:10',
 'insulin-2:15',
 'insulin-2:20',
 'insulin-2:25',
 'insulin-2:30',
 'insulin-2:35',
 'insulin-2:40',
 'insulin-2:45',
 'insulin-2:50',
 'insulin-2:55',
 'insulin-3:00',
 'insulin-3:05',
 'insulin-3:10',
 'insulin-3:15',
 'insulin-3:20',
 'insulin-3:25',
 'insulin-3:30',
 'insulin-3:35',
 'insulin-3:40',
 'insulin-3:45',
 'insulin-3:50',
 'insulin-3:55',
 'insulin-4:00',
 'insulin-4:05',
 'insulin-4:10',
 'insulin-4:15',
 'insulin-4:20',
 'insulin-4:25',
 'insulin-4:30',
 'insulin-4:35',
 'insulin-4:40',
 'insulin-4:45',
 'insulin-4:50

In [16]:
# Reshape the selected wide test row into one long frame per variable.
df_list = []    
for val_var in all_value_var_lists:
    # One output row per column in val_var; id/p_num/time are repeated on every row.
    temp_df = pd.melt(example_prediction, id_vars=['id','p_num','time'], value_vars=val_var)
    # val_var[0] is '<prefix>0:00'; dropping its last 4 characters leaves the prefix
    # (e.g. 'bg-'), so the melted columns become '<prefix>time' and '<prefix>value'.
    temp_df = temp_df.rename(columns={"variable": val_var[0][:-4]+"time", "value": val_var[0][:-4]+"value"})   
    df_list.append(temp_df)



In [17]:
df_list[0]

Unnamed: 0,id,p_num,time,bg-time,bg-value
0,p02_25874,p02,06:05:00,bg-0:00,7.9
1,p02_25874,p02,06:05:00,bg-0:05,8.1
2,p02_25874,p02,06:05:00,bg-0:10,8.3
3,p02_25874,p02,06:05:00,bg-0:15,8.4
4,p02_25874,p02,06:05:00,bg-0:20,8.6
...,...,...,...,...,...
67,p02_25874,p02,06:05:00,bg-5:35,17.8
68,p02_25874,p02,06:05:00,bg-5:40,16.8
69,p02_25874,p02,06:05:00,bg-5:45,14.8
70,p02_25874,p02,06:05:00,bg-5:50,12.8


In [47]:
bg_df = df_list[0]
insulin_df = df_list[1]
carbs_df = df_list[2]
hr_df = df_list[3]
steps_df = df_list[4]
cals_df = df_list[5]
activity_df = df_list[6]



In [58]:
# Side-by-side assembly: bg_df contributes all of its columns, every other
# variable only its last column (its '<prefix>value' column).
# NOTE(review): axis=1 concat aligns rows on the index, so this presumably relies
# on every melted frame having the same 0..n-1 index in the same lag order — confirm.
new_df = pd.concat([bg_df, insulin_df.iloc[:,-1:], carbs_df.iloc[:,-1:], hr_df.iloc[:,-1:], steps_df.iloc[:,-1:], cals_df.iloc[:,-1:], activity_df.iloc[:,-1:]], axis=1)

In [59]:
new_df

Unnamed: 0,id,p_num,time,bg-time,bg-value,insulin-value,carbs-value,hr-value,steps-value,cals-value,activity-value
0,p02_25874,p02,06:05:00,bg-0:00,7.9,0.0533,,,,,
1,p02_25874,p02,06:05:00,bg-0:05,8.1,0.0500,,,,,
2,p02_25874,p02,06:05:00,bg-0:10,8.3,0.0500,,,,,
3,p02_25874,p02,06:05:00,bg-0:15,8.4,0.0748,,,,,
4,p02_25874,p02,06:05:00,bg-0:20,8.6,0.0988,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
67,p02_25874,p02,06:05:00,bg-5:35,17.8,0.0500,,,,,
68,p02_25874,p02,06:05:00,bg-5:40,16.8,0.0902,,,,,
69,p02_25874,p02,06:05:00,bg-5:45,14.8,6.1609,65.0,,,,
70,p02_25874,p02,06:05:00,bg-5:50,12.8,0.2509,,,,,


In [45]:
bg_df

Unnamed: 0,id,p_num,time,bg-time,bg-value
0,p02_25874,p02,06:05:00,bg-0:00,7.9
1,p02_25874,p02,06:05:00,bg-0:05,8.1
2,p02_25874,p02,06:05:00,bg-0:10,8.3
3,p02_25874,p02,06:05:00,bg-0:15,8.4
4,p02_25874,p02,06:05:00,bg-0:20,8.6
...,...,...,...,...,...
67,p02_25874,p02,06:05:00,bg-5:35,17.8
68,p02_25874,p02,06:05:00,bg-5:40,16.8
69,p02_25874,p02,06:05:00,bg-5:45,14.8
70,p02_25874,p02,06:05:00,bg-5:50,12.8


Unnamed: 0,insulin-value
0,0.0533
1,0.0500
2,0.0500
3,0.0748
4,0.0988
...,...
67,0.0500
68,0.0902
69,6.1609
70,0.2509


In [None]:
# NOTE(review): pandas DataFrames expose no `concat` method (concatenation is the
# module-level `pd.concat`), so this line presumably raises AttributeError if run.
# The cell has no execution count and the output recorded under it does not match
# this statement — confirm the intent or delete the cell.
new_df.concat(bg_df)

Unnamed: 0,id,p_num,time,bg-time,bg-value,insulin-time,insulin-value
0,p02_25874,p02,06:05:00,bg-0:00,7.9,insulin-0:00,0.0533
1,p02_25874,p02,06:05:00,bg-0:00,7.9,insulin-0:05,0.0500
2,p02_25874,p02,06:05:00,bg-0:00,7.9,insulin-0:10,0.0500
3,p02_25874,p02,06:05:00,bg-0:00,7.9,insulin-0:15,0.0748
4,p02_25874,p02,06:05:00,bg-0:00,7.9,insulin-0:20,0.0988
...,...,...,...,...,...,...,...
5179,p02_25874,p02,06:05:00,bg-5:55,11.0,insulin-5:35,0.0500
5180,p02_25874,p02,06:05:00,bg-5:55,11.0,insulin-5:40,0.0902
5181,p02_25874,p02,06:05:00,bg-5:55,11.0,insulin-5:45,6.1609
5182,p02_25874,p02,06:05:00,bg-5:55,11.0,insulin-5:50,0.2509


In [1]:
new_df = pd.merge(new_df, carbs_df, on=['id','p_num','time'], how='left')


NameError: name 'new_df' is not defined

In [39]:
new_df = pd.merge(new_df, hr_df, on=['id','p_num','time'], how='left')

In [40]:
new_df = pd.merge(new_df, steps_df, on=['id','p_num','time'], how='left')

In [None]:
new_df = pd.merge(new_df, cals_df, on=['id','p_num','time'], how='left')

In [None]:
new_df = pd.merge(new_df, activity_df, on=['id','p_num','time'], how='left')

In [96]:
gscv.best_forecaster_

In [71]:
from sktime.split import temporal_train_test_split
# Column to forecast.
target_col = "bg-0:00"
# Columns removed from the exogenous frame X: the target itself plus the identifier/time columns.
drop_columns = ["bg-0:00","id", "p_num", "time"]
# Fraction of the data passed to the splitter as the test portion.
test_size = 0.2
#y_train, y_test, X_train, X_test = perform_train_test_split(df, target_co5l="bg+1:00")
# NOTE(review): the split is applied to the whole frame, i.e. all patients'
# rows in their stored order, not to one patient's series — confirm this is intended.
y_train, y_test, X_train, X_test = temporal_train_test_split(y=df[target_col], X=df.drop(columns=drop_columns), test_size=test_size)

In [72]:
X_train.head(30)

Unnamed: 0,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
0,0.0417,,,,,
1,0.0417,,,,,
2,0.0417,,,,,
3,0.0417,,,,,
4,0.0417,,,,,
5,0.0417,,,,,
6,0.0417,20.0,,,,
7,0.0417,,,,,
8,0.0583,,,,,
9,0.0583,,,,,


In [73]:
y_train.head()

0    15.1
1    14.4
2    13.9
3    13.8
4    13.4
Name: bg-0:00, dtype: float64

### Load Search Space

Load the search space configurations (in the YAML file)

In [74]:
import yaml
from sktime.forecasting.model_selection import ForecastingGridSearchCV
# import optuna
# import optuna.distributions
from sktime.split import ExpandingWindowSplitter
from sktime.forecasting.base import ForecastingHorizon
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError
from sklearn.utils.discovery import all_estimators as sklearn_all_estimators

# (name, class) pairs of sklearn estimators; get_sklearn_estimator scans this
# list to resolve a model name from the YAML file to its class.
sklearn_estimators = sklearn_all_estimators()

# Define forecasting horizon
# Relative horizon: steps 1 to 8 ahead.
fh_relative = ForecastingHorizon([1, 2, 3, 4, 5, 6, 7, 8], is_relative=True)
# Cross-validation strategy
# Expanding window whose first training window is half of y_train.
# NOTE(review): step_length=1 advances one observation per split, which over the
# remaining half of y_train presumably means a very large number of refits for
# every grid candidate — confirm this cost is intended.
cv_relative = ExpandingWindowSplitter(
    initial_window=int(len(y_train) * 0.5), step_length=1, fh=fh_relative
)

In [75]:
def get_sklearn_estimator(model_name: str):
    """
    Resolve an sklearn estimator name to its class.

    Scans the module-level ``sklearn_estimators`` list of (name, class) pairs
    and returns the class of the first pair whose name equals ``model_name``.

    Args:
        model_name: the name of the model (eg: 'ARDRegression')
    Returns:
        class for the model
    Raises:
        ValueError: if no pair in ``sklearn_estimators`` carries that name
    """
    # Lazy scan: stops at the first matching entry, like a loop with an early return.
    matching_classes = (
        entry[1] for entry in sklearn_estimators if entry[0] == model_name
    )
    try:
        return next(matching_classes)
    except StopIteration:
        raise ValueError("No model for " + model_name + " in sklearn!") from None

In [76]:
def map_yaml_to_optuna(param_dict):
    """
    Split one model's YAML search-space entry into searchable and fixed parameters.

    Every ``param -> details`` pair of ``param_dict`` is routed as follows:

    * ``details`` is not a dict: copied unchanged into the fixed parameters.
    * ``param`` ends with ``__param``: if ``details["type"]`` is
      ``"sk_model_param"``, the sklearn estimator named by
      ``details["model_name"]`` is instantiated with ``details["hyperparams"]``
      (missing or empty means no arguments) and stored in the fixed parameters
      under ``param`` without the ``__param`` suffix. Any other type under such
      a key produces no entry.
    * ``details`` has a ``"type"`` key:
        - ``"optuna_param"``: skipped, produces no entry.
        - ``"int"`` / ``"float"``: ``[low, high]`` goes into the search space.
        - ``"categorical"``: ``details["values"]`` goes into the search space.
        - ``"sk_categorical"``: ``details["values"]`` maps sklearn estimator
          names to keyword arguments; the instantiated estimators form the
          candidate list in the search space.
        - ``"model_name"``: the class registered under ``details["model_name"]``
          in the module-level ``models`` DataFrame is instantiated with
          ``details["hyperparameters"]`` (missing or empty means no arguments)
          and stored in the fixed parameters. Unknown names are reported with
          ``print`` and otherwise ignored.
        - ``"list"``: ``details["values"]`` is copied into the fixed
          parameters; list items whose second element names a model in
          ``models`` become ``(item[0], <model instance>, item[2])`` tuples.
        - any other type: produces no entry.
    * ``details`` is a dict without a ``"type"`` key: processed recursively
      (see the review note in the code).

    Args:
        param_dict: the yaml file contents for one model. See search_space.yaml for an example
    Returns:
        A tuple ``(optuna_params, default_params)``: the search space (parameter
        name -> candidate values) and the fixed constructor arguments.
    Raises:
        KeyError: if a typed entry lacks a key its type requires (e.g. ``low``).
        ValueError: propagated from get_sklearn_estimator for unknown sklearn names.
    """
    optuna_params = {}
    default_params = {}

    for param, details in param_dict.items():
        # if it's not a dictionary, add the value directly
        if not isinstance(details, dict):
            default_params[param] = details
            continue

        if param.endswith("__param"):
            if details["type"] == "sk_model_param":
                estimator_cls = get_sklearn_estimator(details["model_name"])
                # A missing or empty `hyperparams:` entry means "use the estimator defaults".
                hypers = details.get("hyperparams") or {}
                default_params[param.replace("__param", "")] = estimator_cls(**hypers)
            continue

        if "type" not in details:
            # If it's another dictionary, process it recursively.
            # NOTE(review): the recursive call returns a (search space, fixed params)
            # tuple and the whole tuple is stored as this parameter's candidates,
            # which is unlikely to be what a search expects. Left unchanged because
            # the intended layout of nested sections is not visible here — confirm
            # against search_space.yaml before relying on nested entries.
            optuna_params[param] = map_yaml_to_optuna(details)
            continue

        kind = details["type"]
        if kind == "optuna_param":
            continue

        if kind in ("int", "float"):
            optuna_params[param] = [details["low"], details["high"]]
        elif kind == "categorical":
            optuna_params[param] = details["values"]
        elif kind == "sk_categorical":
            optuna_params[param] = [
                get_sklearn_estimator(name)(**(kwargs if kwargs else {}))
                for name, kwargs in details["values"].items()
            ]
        elif kind == "model_name":
            # The type is a model name (eg: Forecaster arg for ConformalIntervals):
            # instantiate that model with its params.
            model_name = details.get("model_name")
            if model_name and model_name in models["name"].values:
                model_cls = models[models["name"] == model_name]["object"].iloc[0]
                # `hyperparameters:` left empty in YAML loads as None; `**None` would
                # raise, so treat it like the absent key.
                hypers = details.get("hyperparameters") or {}
                default_params[param] = model_cls(**hypers)
            else:
                print(
                    f"Model name '{model_name}' not found in models DataFrame."
                )
        elif kind == "list":
            processed_list = []
            for item in details["values"]:
                if isinstance(item, list) and item[1] in models["name"].values:
                    # [label, model name, extra] -> (label, model instance, extra)
                    model_class = models.set_index("name").loc[item[1], "object"]
                    processed_list.append((item[0], model_class(), item[2]))
                else:
                    processed_list.append(item)
            default_params[param] = processed_list

    return optuna_params, default_params

In [77]:
def perform_grid_search(param_space):
    """
    Tune each model in the module-level ``filtered_models`` dataframe with
    ForecastingGridSearchCV over its YAML-defined search space.

    Models without an entry in ``param_space``, models that cannot be
    instantiated and models whose search fails are reported with ``print`` and
    left out of the result.

    Args:
        param_space: a dictionary of search space. Keys as model values, and values contain a dictionary of the search space. See search_space.yaml for example
    Returns:
        A dictionary. The keys are the model names. The values are as follows:
        {
            "best_forecaster": instance of best one for the model,
            "best_params": best params for the model,
            "best_score": best score for the model,
        }
    """
    best_forecasters = {}

    for model_name, model_class in zip(
        filtered_models["name"], filtered_models["object"]
    ):
        # Only models with a search space in the YAML file can be tuned.
        if model_name not in param_space:
            print(f"No hyperparameter space defined for {model_name}. Skipping.")
            continue

        # Split the YAML entry into the grid and the fixed constructor arguments.
        model_space = param_space[model_name]
        grid, default_params = map_yaml_to_optuna(model_space)
        horizon = model_space.get("fh", {}).get("value", [])

        # Build the forecaster from its fixed arguments.
        try:
            forecaster = model_class(**default_params)
        except Exception as e:
            print(f"Failed to initialize {model_name}: {e}")
            continue

        # Configure the grid search, scored with non-symmetric MAPE.
        searcher = ForecastingGridSearchCV(
            forecaster=forecaster,
            param_grid=grid,
            cv=cv_relative,
            scoring=MeanAbsolutePercentageError(symmetric=False),
        )

        print(f"Running search for {model_name}...")
        try:
            # An empty horizon in the YAML means "pass no fh to fit".
            searcher.fit(y_train, X_train, fh=horizon if len(horizon) > 0 else None)
        except Exception as e:
            print(f"Failed to fit {model_name}: {e}")
            continue

        # Record the winner for this model.
        best_forecasters[model_name] = dict(
            best_forecaster=searcher.best_forecaster_,
            best_params=searcher.best_params_,
            best_score=searcher.best_score_,
        )

    return best_forecasters

In [78]:
filtered_models = models[models["name"] == "AutoETS"]
# filtered_models.head()
# models

## Tuning Run Cell

In [None]:
# Read the search-space definitions from the YAML file next to the notebook.
with open("search_space.yaml", "r") as file:
    param_space = yaml.safe_load(file)

# print(param_space['models'].keys())
# The per-model search spaces live under the top-level "models" key.
# NOTE(review): the output recorded under this cell shows a search running, but
# the cell contains no call to perform_grid_search — presumably the call was
# removed after execution; confirm and restore it so the notebook re-runs cleanly.
param_space = param_space["models"]


Running search for AutoETS...


  self.__init__(**params)
                In evaluate, fitting of forecaster AutoETS failed,
                you can set error_score='raise' in evaluate to see
                the exception message.
                Fit failed for the 0-th data split, on training data y_train with
                cutoff <NA>, and len(y_train)=70809.
                The score will be set to nan.
                Failed forecaster with parameters: AutoETS(seasonal='none', sp=7).
                
  ret = [fun(x, meta=meta) for x in iter]
  self.__init__(**params)
  new_object = klass(**new_object_params)
  self.__init__(**params)
  self._fit_forecaster(y, X)
  new_object = klass(**new_object_params)
  self.__init__(**params)
  self._fit_forecaster(y, X)
  self.__init__(**params)
  new_object = klass(**new_object_params)
  self.__init__(**params)
  self._fit_forecaster(y, X)


Failed to fit AutoETS: int() argument must be a string, a bytes-like object or a real number, not 'NAType'


In [82]:
y_train, X_train

(0         15.1
 1         14.4
 2         13.9
 3         13.8
 4         13.4
           ... 
 141614    14.0
 141615    13.1
 141616    12.7
 141617    12.4
 141618    12.2
 Name: bg-0:00, Length: 141619, dtype: float64,
         insulin-0:00  carbs-0:00  hr-0:00  steps-0:00  cals-0:00 activity-0:00
 0             0.0417         NaN      NaN         NaN        NaN           NaN
 1             0.0417         NaN      NaN         NaN        NaN           NaN
 2             0.0417         NaN      NaN         NaN        NaN           NaN
 3             0.0417         NaN      NaN         NaN        NaN           NaN
 4             0.0417         NaN      NaN         NaN        NaN           NaN
 ...              ...         ...      ...         ...        ...           ...
 141614        0.2188         NaN      NaN         NaN       5.95           NaN
 141615        0.2188         NaN      NaN         NaN       6.67           NaN
 141616        0.2188         NaN      NaN         NaN  

In [80]:
res

{}