In [None]:
# Jupyter notebook Specific imports
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Imports injecting into namespace
from tqdm.auto import tqdm
tqdm.pandas()

# import sys
# sys.path.append('../../')

In [None]:
# General imports
import os
import json
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from getpass import getpass
import argparse

from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError

from lightsaber import constants as C
import lightsaber.data_utils.utils as du
from lightsaber.data_utils.pt_dataset import (filter_preprocessor)
from lightsaber.data_utils import sk_dataloader as skd
from lightsaber.trainers import sk_trainer as skr

from sklearn.ensemble import HistGradientBoostingClassifier

# from pytorch_lightning import Trainer
# import torch as T
# from torch import nn
# from lightsaber.data_utils import pt_dataset as ptd
# from lightsaber.trainers import pt_trainer as ptr
# from lightsaber.model_lib.pt_sota_models import rnn

In [None]:
import logging

# Calling getLogger() without a name returns the root logger.
log = logging.getLogger()

In [None]:
# Directory that is substituted for `{DATA_DIR}` in the experiment config below.
# data_dir = Path(getpass())  # enter or REPLACE with your data path containing the mimic files
data_dir = Path('./data')

# Stop early, with a readable message, when the data directory is missing.
assert data_dir.is_dir(), f"data directory not found: {data_dir.resolve()}"

# The config file is a str.format template. read_text() opens, reads and closes
# the file, so no file handle is left dangling in the kernel.
# NOTE(review): du._Loader is defined inside the project; confirm it is a safe
# loader before pointing this at a config file you do not control.
expt_conf = du.yaml.load(
    Path('./ihm_expt_config.yml').read_text().format(DATA_DIR=data_dir),
    Loader=du._Loader,
)

# Model Training

### Reading data and applying a pre-processor

In [None]:
# Value forwarded to SKDataLoader's `flatten` argument
# (presumably how each patient's time series is aggregated -- confirm in SKDataLoader).
flatten = 'sum'

# One scaler instance, created once and handed to the filter below.
preprocessor = StandardScaler()

# Filter for the training split, built with refit=True
# (presumably fits the scaler on this split -- confirm in filter_preprocessor).
train_filter = [
    filter_preprocessor(
        cols=expt_conf['numerical'],
        preprocessor=preprocessor,
        refit=True,
    ),
]

# Split-specific file locations; everything else comes from the shared config keys.
train_conf = expt_conf['train']
train_dataloader = skd.SKDataLoader(
    tgt_file=train_conf['tgt_file'],
    feat_file=train_conf['feat_file'],
    idx_col=expt_conf['idx_cols'],
    tgt_col=expt_conf['tgt_col'],
    feat_columns=expt_conf['feat_cols'],
    time_order_col=expt_conf['time_order_col'],
    category_map=expt_conf['category_map'],
    filter=train_filter,
    fill_value=expt_conf['normal_values'],
    flatten=flatten,
)
print(train_dataloader.shape, len(train_dataloader))

In [None]:
# For other datasets use fitted preprocessors
# Filter for the non-training splits: same columns and the same `preprocessor`
# object, but with refit=False
# (presumably reuses what was fitted on the training data -- confirm in filter_preprocessor).
fitted_filter = [
    filter_preprocessor(
        cols=expt_conf['numerical'],
        preprocessor=preprocessor,
        refit=False,
    ),
]

In [None]:
# Validation split: same loader settings as the other splits, with the
# non-refitting filter and the 'val' file locations.
val_conf = expt_conf['val']
val_dataloader = skd.SKDataLoader(
    tgt_file=val_conf['tgt_file'],
    feat_file=val_conf['feat_file'],
    idx_col=expt_conf['idx_cols'],
    tgt_col=expt_conf['tgt_col'],
    feat_columns=expt_conf['feat_cols'],
    time_order_col=expt_conf['time_order_col'],
    category_map=expt_conf['category_map'],
    filter=fitted_filter,
    fill_value=expt_conf['normal_values'],
    flatten=flatten,
)

print(val_dataloader.shape, len(val_dataloader))

In [None]:
# Test split: same loader settings as the other splits, with the
# non-refitting filter and the 'test' file locations.
test_conf = expt_conf['test']
test_dataloader = skd.SKDataLoader(
    tgt_file=test_conf['tgt_file'],
    feat_file=test_conf['feat_file'],
    idx_col=expt_conf['idx_cols'],
    tgt_col=expt_conf['tgt_col'],
    feat_columns=expt_conf['feat_cols'],
    time_order_col=expt_conf['time_order_col'],
    category_map=expt_conf['category_map'],
    filter=fitted_filter,
    fill_value=expt_conf['normal_values'],
    flatten=flatten,
)

print(test_dataloader.shape, len(test_dataloader))

## Single Run

In [None]:
model_name = 'HistGBT'

# Hyper-parameters for the single (non-tuned) run.
# random_state is pinned so that re-running the notebook gives comparable
# results: per the scikit-learn documentation it controls the binning
# subsample and the train/validation split used when early stopping is on.
hparams = argparse.Namespace(learning_rate=0.01,
                             max_iter=100,
                             l2_regularization=0.01,
                             random_state=42,
                             )

# Build the estimator directly from hparams so the model and the values
# handed to SKModel cannot drift apart.
base_model = HistGradientBoostingClassifier(**vars(hparams))

wrapped_model = skr.SKModel(base_model, hparams, name=model_name)

In [None]:
# Name of the MLflow experiment this run is recorded under
# (plain string: the original f-string had no placeholders).
mlflow_conf = dict(experiment_name='classifier_ihm')
# Objects passed to the trainer as `artifacts`: the scaler given to the data filters.
artifacts = dict(preprocessor=preprocessor)
# Forwarded to the trainer as extra keyword arguments; tune=False marks the single run.
experiment_tags = dict(model=model_name,
                       tune=False)

# The trainer returns the run id, the metrics, and labels / predictions /
# probabilities for the validation and test splits, in that order.
(run_id, metrics,
 val_y, val_yhat, val_pred_proba,
 test_y, test_yhat, test_pred_proba) = skr.run_training_with_mlflow(
    mlflow_conf,
    wrapped_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    artifacts=artifacts,
    **experiment_tags,
)

print(f"MLFlow Experiment: {mlflow_conf['experiment_name']} \t | Run ID: {run_id}")
print(metrics)

## Hyper-parameter search

In [None]:
model_name = 'HistGBT'

# Starting hyper-parameters for the estimator.
# random_state is pinned so that the candidates in the search are compared on
# equal footing: per the scikit-learn documentation it controls the binning
# subsample and the train/validation split used when early stopping is on.
hparams = argparse.Namespace(learning_rate=0.01,
                             max_iter=100,
                             l2_regularization=0.01,
                             random_state=42,
                             )

# Candidate values per hyper-parameter, passed to the trainer as `h_search`.
h_search = dict(
    learning_rate=[0.01, 0.02, 0.1],
    max_iter=[50, 100],
)

base_model = HistGradientBoostingClassifier(**vars(hparams))

wrapped_model = skr.SKModel(base_model, hparams, name=model_name)

In [None]:
# Name of the MLflow experiment this run is recorded under
# (plain string: the original f-string had no placeholders).
mlflow_conf = dict(experiment_name='classifier_ihm')
# Objects passed to the trainer as `artifacts`: the scaler given to the data filters.
artifacts = dict(preprocessor=preprocessor)
# Forwarded to the trainer as extra keyword arguments; tune=True marks the search run.
experiment_tags = dict(model=model_name,
                       tune=True)

# Same call as a single run, plus `h_search` with the candidate values.
# NOTE: this rebinds run_id, metrics and the val_* / test_* names.
(run_id, metrics,
 val_y, val_yhat, val_pred_proba,
 test_y, test_yhat, test_pred_proba) = skr.run_training_with_mlflow(
    mlflow_conf,
    wrapped_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    artifacts=artifacts,
    h_search=h_search,
    **experiment_tags,
)

print(f"MLFlow Experiment: {mlflow_conf['experiment_name']} \t | Run ID: {run_id}")
print(metrics)

# Inference on a single patient

In [None]:
# Identifier of the patient to score
# (presumably has to match an entry served by test_dataloader -- TODO confirm).
patient_id = "10011_episode1_timeseries.csv"

# Kept as a bare final expression so the notebook displays the returned value.
wrapped_model.predict_patient(patient_id, test_dataloader)