
# Probabilistic Forecasting with `sktime`: Distribution Forecast

## Notebook Goal: 
* Provide an example workflow of a distribution forecast with BGL data using sktime.
* Demonstrate how to use Optuna for a single forecasting type.

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Load

In [15]:
from src.data.data_loader import load_data
import pandas as pd

# Load the data with specified columns
df = load_data(data_source_name="kaggle_brisT1D", dataset_type="train")

  return pd.read_csv(file_path, usecols=keep_columns)


In [29]:
def keep_time_zero_columns(
    df, additional_cols: list = ["id", "p_num", "time", "bg+1:00"]
):
    """
    Provide a list of columns to keep that contain '-0:00' in their names

    Args:
        df (pd.DataFrame): Input DataFrame
        additional_cols (list): List of columns to keep in addition to the ones with '-0:00' in their names

    Returns:
        list: list of column names
    """
    time_zero_cols = [col for col in df.columns if "-0:00" in col]
    additional_cols += time_zero_cols
    print(f"Keeping {additional_cols} columns")
    return additional_cols


keep_columns_train = keep_time_zero_columns(df)
keep_columns_test = keep_columns_train.copy()
keep_columns_test.remove("bg+1:00")
print("train columns: ", keep_columns_train)
print("test columns: ", keep_columns_test)

Keeping ['id', 'p_num', 'time', 'bg+1:00', 'bg-0:00', 'insulin-0:00', 'carbs-0:00', 'hr-0:00', 'steps-0:00', 'cals-0:00', 'activity-0:00'] columns
train columns:  ['id', 'p_num', 'time', 'bg+1:00', 'bg-0:00', 'insulin-0:00', 'carbs-0:00', 'hr-0:00', 'steps-0:00', 'cals-0:00', 'activity-0:00']
test columns:  ['id', 'p_num', 'time', 'bg-0:00', 'insulin-0:00', 'carbs-0:00', 'hr-0:00', 'steps-0:00', 'cals-0:00', 'activity-0:00']


In [34]:
train_df = load_data(
    data_source_name="kaggle_brisT1D",
    dataset_type="train",
    keep_columns=keep_columns_train,
)
test_df = load_data(
    data_source_name="kaggle_brisT1D",
    dataset_type="test",
    keep_columns=keep_columns_test,
)

  return pd.read_csv(file_path, usecols=keep_columns)


In [37]:
train_df[8450:8460]

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00,bg+1:00
8450,p01_8450,p01,21:50:00,6.3,0.0333,,72.0,0.0,19.05,,6.2
8451,p01_8451,p01,22:05:00,6.6,0.0333,,68.8,0.0,5.87,,5.8
8452,p01_8452,p01,22:20:00,6.8,0.0333,,72.5,0.0,5.4,,5.4
8453,p01_8453,p01,22:35:00,6.5,0.0333,,68.6,18.0,4.9,,4.8
8454,p01_8454,p01,22:50:00,6.2,0.0333,,76.8,70.0,5.0,,4.9
8455,p01_8455,p01,23:05:00,5.8,0.0167,,72.2,13.0,5.0,,5.5
8456,p01_8456,p01,23:20:00,5.4,0.0167,,73.4,18.0,5.1,,6.6
8457,p01_8457,p01,23:35:00,4.8,0.0167,,64.7,0.0,7.89,,7.7
8458,p01_8458,p01,23:50:00,4.9,0.0167,,61.2,,10.78,,8.2
8459,p,02_0,p02,6.7,0.051,,,,,,7.1


In [36]:
test_df

Unnamed: 0,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00
0,p01_8459,p01,06:45:00,9.6,0.0417,,54.7,,6.06,
1,p01_8460,p01,11:25:00,4.6,0.0583,,118.7,532.0,32.43,Walk
2,p01_8461,p01,14:45:00,8.0,0.0500,,76.5,69.0,10.16,
3,p01_8462,p01,04:30:00,9.9,0.0583,,59.9,0.0,4.80,
4,p01_8463,p01,04:20:00,5.3,0.0000,,61.0,,4.80,
...,...,...,...,...,...,...,...,...,...,...
3639,p24_256,p24,06:40:00,6.3,0.0726,,62.6,,4.23,
3640,p24_257,p24,12:30:00,10.4,0.0646,,77.3,0.0,5.31,
3641,p24_258,p24,03:45:00,6.9,0.0262,,,,4.15,
3642,p24_259,p24,06:10:00,8.7,0.0750,,71.5,,4.15,


In [12]:
def add_date_index(df):
    """Add implied date index based on time rollovers."""

    def calculate_date_index(group):
        # Sort by time
        group = group.sort_values("time")

        # Initialize date counter
        date_idx = 0
        date_indices = []
        prev_time = None

        for curr_time in group["time"]:
            if prev_time is not None:
                # Check for day rollover (previous time > current time)
                if (
                    prev_time.hour > curr_time.hour
                    and prev_time.hour >= 23
                    and curr_time.hour <= 1
                ):
                    date_idx += 1
            date_indices.append(date_idx)
            prev_time = curr_time

        return pd.Series(date_indices, index=group.index)

    # Apply function to each patient group
    df["date_idx"] = (
        df.groupby("p_num").apply(calculate_date_index).reset_index(level=0, drop=True)
    )

    return df


# Example usage:
df2 = add_date_index(df)
df2

AttributeError: 'str' object has no attribute 'hour'

In [11]:
# Specify the columns to keep
id_vars = ["time"]
feature_cols = ["bg", "insulin", "carbs", "hr", "steps", "cals", "activity"]


features = {}
for feature in feature_cols:
    features[feature] = [col for col in df.columns if feature in col]


def process_row(row, features):
    first_feature = True
    row_df = None
    row_df_temp = pd.DataFrame(row).T

    for feature, feature_cols in features.items():
        melted = row_df_temp.melt(
            id_vars=id_vars,
            value_vars=feature_cols,
            var_name="time_offset",
            value_name=feature,
        )

        if first_feature:
            row_df = melted
            first_feature = False
            melted["time_offset"] = melted["time_offset"].str.replace(f"{feature}", "")
        else:
            row_df[feature] = melted[feature]

    return row_df


def process_patient(patient_df: pd.DataFrame):
    # For each patient, make each row in a dataframe
    rows = []
    print("There are ", len(patient_df), " rows in this patient")
    for id, row in patient_df.iterrows():
        print("Processing row ", id)
        rows.append(process_row(row, features))

        if id == 200:
            break

    return rows


def process_patients(df: pd.DataFrame):
    patients = {}
    # Process each patient
    for p_num, patient_df in df.groupby("p_num"):
        # Key is the patient id
        patient_df.drop(columns=["id", "p_num"], inplace=True)
        patients[p_num] = process_patient(patient_df)

        # Only process the first patient
        break

    return patients


# TODO:
# 1. Remove if (id == 200):
# 2. Remove the break
# 3. Find a way to improve this
# 4. process_patients should take patient number as an argument
# 5. We can retrieve + (which is the predicted value) in other functions
processed_patients = process_patients(df)
processed_patients["p01"][0]

There are  8459  rows in this patient
Processing row  0
Processing row  1
Processing row  2
Processing row  3
Processing row  4
Processing row  5
Processing row  6
Processing row  7
Processing row  8
Processing row  9
Processing row  10
Processing row  11
Processing row  12
Processing row  13
Processing row  14
Processing row  15
Processing row  16
Processing row  17
Processing row  18
Processing row  19
Processing row  20
Processing row  21
Processing row  22
Processing row  23
Processing row  24
Processing row  25
Processing row  26
Processing row  27
Processing row  28
Processing row  29
Processing row  30
Processing row  31
Processing row  32
Processing row  33
Processing row  34
Processing row  35
Processing row  36
Processing row  37
Processing row  38
Processing row  39
Processing row  40
Processing row  41
Processing row  42
Processing row  43
Processing row  44
Processing row  45
Processing row  46
Processing row  47
Processing row  48
Processing row  49
Processing row  50
Pro

Unnamed: 0,time,time_offset,bg,insulin,carbs,hr,steps,cals,activity
0,06:10:00,-5:55,,0.0083,,,,,
1,06:10:00,-5:50,,0.0083,,,,,
2,06:10:00,-5:45,9.6,0.0083,,,,,
3,06:10:00,-5:40,,0.0083,,,,,
4,06:10:00,-5:35,,0.0083,,,,,
...,...,...,...,...,...,...,...,...,...
68,06:10:00,-0:15,16.2,0.0583,,,,,
69,06:10:00,-0:10,,0.0583,,,,,
70,06:10:00,-0:05,,0.0417,,,,,
71,06:10:00,-0:00,15.1,0.0417,,,,,


In [5]:
from sktime.forecasting.arima import ARIMA

# step 1: data specification
y = df["bg+1:00"]
# step 2: specifying forecasting horizon
fh = [1, 2, 3]
# step 3: specifying the forecasting algorithm
forecaster = ARIMA()
# step 4: fitting the forecaster
forecaster.fit(y, fh=[1, 2, 3])
# step 5: querying predictions
y_pred = forecaster.predict()

# for probabilistic forecasting:
#   call a probabilistic forecasting method after or instead of step 5
y_pred_int = forecaster.predict_interval(coverage=0.9)
y_pred_int

: 

In [7]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)