In [None]:
# Jupyter notebook Specific imports
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# NOTE(review): with no category/module given, this hides every warning raised
# in the kernel from here on (including deprecation warnings from libraries).
warnings.filterwarnings('ignore')

In [None]:
# Imports injecting into namespace
from tqdm.auto import tqdm
# Registers tqdm's `progress_apply`-style methods on pandas objects.
tqdm.pandas()

# Disabled path tweak -- presumably for importing from a checkout two
# directories up instead of an installed package; confirm before re-enabling.
# import sys
# sys.path.append('../../')

In [None]:
# General imports
import os
import json
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from getpass import getpass
import argparse

from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError

import torch as T
from torch import nn
from pytorch_lightning import Trainer

from lightsaber import constants as C
import lightsaber.data_utils.utils as du
from lightsaber.data_utils import pt_dataset as ptd
from lightsaber.trainers import pt_trainer as ptr

from lightsaber.model_lib.pt_sota_models import rnn

In [None]:
import logging
# No name is passed, so this is the root logger.
log = logging.getLogger()

In [None]:
# data_dir = Path(getpass())  # enter or REPLACE with your data path containing the mimic files
data_dir = Path('./data')

# Fail early, and say which path was checked, if the data directory is missing.
assert data_dir.is_dir(), f"data directory not found: {data_dir}"

# The experiment config is a YAML template: its `{DATA_DIR}` placeholder is
# substituted with `data_dir` before the text is parsed.
# `Path.read_text()` opens, reads and closes the file, so no handle is leaked
# (the previous bare `open(...).read()` never closed it).
expt_conf = du.yaml.load(Path('./ihm_expt_config.yml').read_text().format(DATA_DIR=data_dir),
                         Loader=du._Loader)

# Model Training

### Reading data along with usage of pre-processor

In [None]:
# A single scaler instance is created here and handed to the training filter
# with `refit=True` (presumably so it is fitted on the training split -- confirm
# against `ptd.filter_preprocessor`).
preprocessor = StandardScaler()

# Filters for the training data: scale the numerical columns, then fill missing
# values using the configured `normal_values`.
train_filter = [
    ptd.filter_preprocessor(
        cols=expt_conf['numerical'],
        preprocessor=preprocessor,
        refit=True,
    ),
    ptd.filter_fillna(
        fill_value=expt_conf['normal_values'],
        time_order_col=expt_conf['time_order_col'],
    ),
]

# Transform that drops the time-ordering column(s) named in the config.
transform = ptd.transform_drop_cols(cols_to_drop=expt_conf['time_order_col'])

In [None]:
# Training split: file locations come from the `train` section of the config,
# column roles from its top-level keys; uses the training filters.
train_dataset = ptd.BaseDataset(
    tgt_file=expt_conf['train']['tgt_file'],
    feat_file=expt_conf['train']['feat_file'],
    idx_col=expt_conf['idx_cols'],
    tgt_col=expt_conf['tgt_col'],
    feat_columns=expt_conf['feat_cols'],
    time_order_col=expt_conf['time_order_col'],
    category_map=expt_conf['category_map'],
    transform=transform,
    filter=train_filter,
)
# print(train_dataset.data.head())
print(train_dataset.shape, len(train_dataset))

In [None]:
# For other datasets use fitted preprocessors:
# same `preprocessor` object as the training filter, but with `refit=False`.
fitted_filter = [
    ptd.filter_preprocessor(
        cols=expt_conf['numerical'],
        preprocessor=preprocessor,
        refit=False,
    ),
    ptd.filter_fillna(
        fill_value=expt_conf['normal_values'],
        time_order_col=expt_conf['time_order_col'],
    ),
]

In [None]:
# Validation split: file locations come from the `val` section of the config;
# uses the filters built with `refit=False`.
val_dataset = ptd.BaseDataset(
    tgt_file=expt_conf['val']['tgt_file'],
    feat_file=expt_conf['val']['feat_file'],
    idx_col=expt_conf['idx_cols'],
    tgt_col=expt_conf['tgt_col'],
    feat_columns=expt_conf['feat_cols'],
    time_order_col=expt_conf['time_order_col'],
    category_map=expt_conf['category_map'],
    transform=transform,
    filter=fitted_filter,
)
print(val_dataset.shape, len(val_dataset))

In [None]:
# Test split: file locations come from the `test` section of the config;
# uses the filters built with `refit=False`.
test_dataset = ptd.BaseDataset(
    tgt_file=expt_conf['test']['tgt_file'],
    feat_file=expt_conf['test']['feat_file'],
    idx_col=expt_conf['idx_cols'],
    tgt_col=expt_conf['tgt_col'],
    feat_columns=expt_conf['feat_cols'],
    time_order_col=expt_conf['time_order_col'],
    category_map=expt_conf['category_map'],
    transform=transform,
    filter=fitted_filter,
)
print(test_dataset.shape, len(test_dataset))

In [None]:
# For most models you need to change only this part
input_dim, target_dim = train_dataset.shape
output_dim = 2

# Class weights from the first target column of the training data:
#   weight(label) = count(most frequent label) / (count(label) + epsilon)
# The result is ordered by label value, so the tensor position follows the
# sorted labels (assumes labels line up with class indices -- TODO confirm).
weight_labels = train_dataset.target.iloc[:, 0].value_counts()
weight_labels = (weight_labels.max() / (weight_labels + 1e-7)).sort_index()

weights = T.FloatTensor(weight_labels.values).to(train_dataset.device)
print(weights)

## Single Run

In [None]:
# For most models you need to change only this part
hparams = argparse.Namespace(
    gpus=[0],
    lr=0.01,
    max_epochs=100,
    batch_size=32,
    hidden_dim=32,
    rnn_class='LSTM',
    n_layers=2,
    dropout=0.1,
    recurrent_dropout=0.1,
    bidirectional=False,
)

# Replace the name given above with the entry stored under that key in
# `C.PYTORCH_CLASS_DICT`.
hparams.rnn_class = C.PYTORCH_CLASS_DICT[hparams.rnn_class]

base_model = rnn.RNNClassifier(
    input_dim, output_dim,
    hidden_dim=hparams.hidden_dim,
    rnn_class=hparams.rnn_class,
    n_layers=hparams.n_layers,
    dropout=hparams.dropout,
    recurrent_dropout=hparams.recurrent_dropout,
    bidirectional=hparams.bidirectional,
)

# Cross-entropy loss using the per-class `weights` tensor.
criterion = nn.CrossEntropyLoss(weight=weights)

# Optional explicit optimizer / scheduler (left disabled; the matching
# `optimizer=` / `scheduler=` arguments below are disabled as well):
# optimizer = T.optim.Adam(base_model.parameters(),
#                          lr=hparams.lr,
#                          weight_decay=1e-5  # standard value)
#                          )
# scheduler = T.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

# Creating the wrapped model
wrapped_model = ptr.PyModel(
    hparams, base_model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,    # None
    test_dataset=test_dataset,  # test_dataset
    # optimizer=optimizer,
    loss_func=criterion,
    # scheduler=scheduler,
    collate_fn=ptd.collate_fn,
)

In [None]:
# Training
# Trainer switches, kept as separate names because `overfit_pct` is reused
# when the training run is launched.
overfit_pct = 0
fast_dev_run = False      # set to True when debugging
terminate_on_nan = True
auto_lr_find = False

trainer = Trainer(
    max_epochs=hparams.max_epochs,
    gpus=hparams.gpus,
    default_root_dir=os.path.join('./out/', 'classifier_ihm'),
    terminate_on_nan=terminate_on_nan,
    auto_lr_find=auto_lr_find,
    overfit_pct=overfit_pct,
    fast_dev_run=fast_dev_run,
)

In [None]:
# MLflow bookkeeping for this run: experiment name, objects passed as
# artifacts, and the tags describing the model.
mlflow_conf = {'experiment_name': 'classifier_ihm'}
artifacts = {
    'preprocessor': preprocessor,
    'weight_labels': weight_labels,
}
experiment_tags = {
    'model': 'RNNClassifier',
    'input_dim': input_dim,
    'output_dim': output_dim,
}

# Launch training; the call returns the MLflow run id, the metrics, and the
# (y, yhat, pred_proba) triples for the validation and test sets.
(
    run_id, metrics,
    val_y, val_yhat, val_pred_proba,
    test_y, test_yhat, test_pred_proba,
) = ptr.run_training_with_mlflow(
    mlflow_conf,
    trainer,
    wrapped_model,
    overfit_pct=overfit_pct,
    artifacts=artifacts,
    **experiment_tags
)

print(f"MLFlow Experiment: {mlflow_conf['experiment_name']} \t | Run ID: {run_id}")
print(metrics)

# Model Registration

This block shows how to register a model for use in subsequent steps. Given a `run_id`, this block can be run independently of the training steps above.

Steps:

- a saved model (along with hyper-params and weights) is retrieved using `run_id`
- model is initialized using the weights
- model is logged to mlflow under registered model name

In [None]:
# `run_id` is the value returned by the training run above; everything in the
# registration cells below is looked up from it.
print(f"Registering model for run: {run_id}")

In [None]:
# Reading things from mlflow
# Model coders can create functions to repeat this - part of model init
import ast
# NOTE(review): `six` and `torch` are no longer referenced in this cell; the
# imports are kept in case other cells rely on them.
import six
import torch
from lightsaber.trainers import helper
# data_dir = Path(getpass())  # enter or REPLACE with your data path containing the mimic files
data_dir = Path('./data')

assert data_dir.is_dir(), f"data directory not found: {data_dir}"

# Re-load the experiment config so this section does not depend on the
# training cells. `Path.read_text()` closes the file after reading (the
# previous bare `open(...).read()` leaked the handle).
expt_conf = du.yaml.load(Path('./ihm_expt_config.yml').read_text().format(DATA_DIR=data_dir),
                         Loader=du._Loader)
mlflow_conf = dict(experiment_name='classifier_ihm')
registered_model_name = 'classifier_ihm_rnn_v0'

# Look up the run and the location of its stored `weight_labels` artifact.
mlflow_setup = helper.setup_mlflow(**mlflow_conf)
run_data = helper.fetch_mlflow_run(run_id,
                                   mlflow_uri=mlflow_setup['mlflow_uri'],
                                   artifacts_prefix=['artifact/weight_labels']
                                  )

weight_labels_path = helper.get_artifact_path(run_data['artifact_paths'][0],
                                              artifact_uri=run_data['info'].artifact_uri)
# SECURITY: unpickling can execute arbitrary code -- only load artifacts from
# an MLflow store you trust.
# The context manager closes the artifact file once it has been read.
with open(weight_labels_path, 'rb') as weight_labels_file:
    weight_labels = pickle.load(weight_labels_file)

weights = T.FloatTensor(weight_labels.values)

# Logged params are parsed back with `ast.literal_eval`; anything that is not
# a Python literal is deliberately kept as the raw value (best effort).
_hparams = run_data['params']
# check this for each model
hparams = dict()
for k, v in _hparams.items():
    try:
        val = ast.literal_eval(v)
    except Exception:
        val = v
    hparams.setdefault(k, val)
hparams = argparse.Namespace(**hparams)

# `rnn_class` is taken from the text between the first pair of single quotes
# in the logged value (presumably a class repr such as "<class 'pkg.Cls'>" --
# confirm against what the training run logs) and imported again.
hparams.rnn_class = helper.import_model_class(hparams.rnn_class.split("'")[1])

In [None]:
# Recreate models
# Input/output sizes are read from the run's tags; the remaining settings
# come from the hyper-parameters restored from the run.
base_model = rnn.RNNClassifier(
    input_dim=int(run_data['tags']['input_dim']),
    output_dim=int(run_data['tags']['output_dim']),
    hidden_dim=hparams.hidden_dim,
    rnn_class=hparams.rnn_class,
    n_layers=hparams.n_layers,
    dropout=hparams.dropout,
    recurrent_dropout=hparams.recurrent_dropout,
    bidirectional=hparams.bidirectional,
)
criterion = nn.CrossEntropyLoss(weight=weights)

# Creating the wrapped model -- no datasets are attached at this stage.
wrapped_model = ptr.PyModel(
    hparams, base_model,
    train_dataset=None,
    val_dataset=None,   # None
    test_dataset=None,  # test_dataset
    cal_dataset=None,
    loss_func=criterion,
    collate_fn=ptd.collate_fn,
)
print('model ready for logging')

In [None]:
# Register model
# Logs the wrapped model under `registered_model_name`, passing along the test
# files and the absolute path of the experiment config.
ptr.register_model_with_mlflow(
    run_id, mlflow_conf, wrapped_model,
    registered_model_name=registered_model_name,
    test_feat_file=expt_conf['test']['feat_file'],
    test_tgt_file=expt_conf['test']['tgt_file'],
    config=os.path.abspath('./ihm_expt_config.yml'),
)

# Inference on a single patient

In [None]:
# NOTE(review): this cell relies on state from two earlier sections --
# `wrapped_model` (last assigned in the model-registration section) and
# `test_dataset` (built in the data-loading cells) -- so both must have run.
patient_id = '10011_episode1_timeseries.csv'
wrapped_model.predict_patient(patient_id, test_dataset)