In [1]:
# --- Notebook configuration ---------------------------------------------------
# Signal columns listed for standardisation.
FEATURES_TO_SCALE = [
    "ABP_BaroIndex",
    "ABP_HRVstats_RMSSD",
    "ABP_HRVstats_SDSD",
    "ABP_HRVpsd_LF",
    "ABP_HRVpsd_HF",
    "ABP_HRVpsd_LF_to_HF",
    "ABP_HRVpsd_TP",
    "ABP_FundAmp",
    "HR",
    "ICP",
    "ETCO2",
    "Prx",
]
# Full model input: the signal columns above followed by the 'hour' and 'day' columns.
FEATURES = [*FEATURES_TO_SCALE, "hour", "day"]
# Target column(s); kept as a list so it can be concatenated with FEATURES.
TO_PREDICT = ["ABP"]
# Seed used for the shuffled K-fold splitter further down.
SEED = 42
# Fraction of rows to hold out for evaluation.
VALIDATION_SIZE = 0.2
# scikit-learn scorer name handed to cross_val_score.
SCORING = "neg_mean_absolute_percentage_error"

In [2]:
pip install --upgrade numpy

Note: you may need to restart the kernel to use updated packages.


## Imports

In [3]:
import pandas as pd 
from pathlib import Path 
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import (train_test_split, cross_val_score, KFold, GridSearchCV)

In [4]:
# Folder that holds the input CSV files (one frame is later built per file).
data_folder = Path("data_to_learn")
# Sort the matches: glob yields entries in filesystem order, which is not
# guaranteed to be stable across machines. A fixed order keeps the later
# concatenation (and therefore the seeded split) reproducible.
all_csv = sorted(data_folder.glob("*.csv"))

In [5]:
# Build a {file stem -> DataFrame} mapping, one entry per CSV file.
# Each frame is indexed by its 'DateTime' column and trimmed to the model
# inputs plus the target column(s).
data_frames = {}
wanted_columns = FEATURES + TO_PREDICT
for csv_path in tqdm(all_csv):
  patient_frame = pd.read_csv(csv_path, sep=",", decimal=".").set_index('DateTime')
  data_frames[csv_path.stem] = patient_frame[wanted_columns]

100%|██████████| 3/3 [00:00<00:00, 26.62it/s]


---

## Scaling

Do I need to scale `TO_PREDICT`?

In [6]:
# Standardise each patient's signal columns and store the result back in
# data_frames. StandardScaler.fit_transform returns a new array, so the
# result must be assigned or it is simply thrown away.
# Only FEATURES_TO_SCALE is transformed; 'hour', 'day' and the target column(s)
# are carried over unchanged.
# NOTE(review): the scaler is fitted per patient on all rows, before the
# train/test split - confirm this is acceptable for the evaluation.
for patient, patient_df in data_frames.items():
  scaled_df = patient_df.copy()  # copy so the loaded frame is not modified in place
  scaled_df[FEATURES_TO_SCALE] = StandardScaler().fit_transform(patient_df[FEATURES_TO_SCALE])
  data_frames[patient] = scaled_df

---

## Data concatenation

In [7]:
# Stack every per-file frame row-wise into a single learning table,
# then display its column labels as the cell output.
patient_frames = list(data_frames.values())
learning_data = pd.concat(patient_frames, axis=0)
learning_data.columns

Index(['ABP_BaroIndex', 'ABP_HRVstats_RMSSD', 'ABP_HRVstats_SDSD',
       'ABP_HRVpsd_LF', 'ABP_HRVpsd_HF', 'ABP_HRVpsd_LF_to_HF',
       'ABP_HRVpsd_TP', 'ABP_FundAmp', 'HR', 'ICP', 'ETCO2', 'Prx', 'hour',
       'day', 'ABP'],
      dtype='object')

---

## Train/test split

In [8]:
# Split the learning table column-wise: model inputs vs. target.
# TO_PREDICT is a list, so y is a (single-column) DataFrame rather than a Series.
X = learning_data.loc[:, FEATURES]
y = learning_data.loc[:, TO_PREDICT]

In [9]:
# Hold out a fraction of the rows for testing. The size and seed come from the
# configuration constants instead of repeating the literals 0.2 and 42, so
# changing the config actually changes the split.
# NOTE(review): train_test_split shuffles rows by default, so the temporal
# order inside X_train / y_train is lost - confirm this is intended for the
# time-series models fitted on y_train later.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=VALIDATION_SIZE, random_state=SEED
)

## Grid Selection

In [11]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.tsa.statespace.sarimax import SARIMAX
from arch import arch_model

In [13]:
# Candidate models keyed by a short name; insertion order matches the order in
# which they are later iterated.
# NOTE(review): SARIMAX(...) and arch_model(...) are bound to y_train at
# construction time and are not scikit-learn estimators - confirm how they are
# meant to be evaluated alongside the two regressors.
models = dict(
    xgb=XGBRegressor(),
    knn=KNeighborsRegressor(),
    sarimax=SARIMAX(y_train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 1440)),
    garch=arch_model(y_train, vol='Garch', p=1, o=0, q=1),
)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [None]:
# Cross-validate every scikit-learn compatible model with the same shuffled
# 10-fold splitter and print the mean and standard deviation of its SCORING values.
# The splitter is built once: with a fixed integer seed it yields the same
# folds for every model.
kfold = KFold(n_splits=10, random_state=SEED, shuffle=True)

for name, model in models.items():
  # cross_val_score clones the estimator, which requires get_params().
  # Objects without it (e.g. the statsmodels / arch models) would raise a
  # TypeError and abort the whole loop, so report and skip them instead.
  if not hasattr(model, "get_params"):
    print(f"{name}: skipped (not a scikit-learn estimator, cannot be used with cross_val_score)")
    continue
  cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=SCORING)
  print(f"{name}: {cv_results.mean()} ({cv_results.std()})")