# Models dummy tests
---

Testing the models defined in the project's classes, including the embedding layers and the handling of time intervals, on dummy datasets.

## Importing the necessary packages

In [None]:
import pandas as pd                        # Pandas to load the data initially
# import modin.pandas as pd                  # Optimized distributed version of Pandas
import numpy as np                         # Mathematical operations package, allowing also for missing values representation
import torch                               # PyTorch for tensor and deep learning operations
import data_utils as du                    # Data science and machine learning relevant methods
import os                                  # os handles directory/workspace changes

In [None]:
du.random_seed

In [None]:
du.set_random_seed(42)

In [None]:
du.random_seed

In [None]:
du.set_pandas_library(lib='pandas')

In [None]:
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Change to scripts directory
os.chdir('../../scripts')

In [None]:
import Models                              # Script with all the machine learning model classes

In [None]:
# Change to parent directory (presumably "eICU-mortality-prediction")
os.chdir('..')

## Initializing variables

Data that we'll be using:

In [None]:
# Dummy time series data. The column order matches the names given when the
# dataframe is created from this array:
# subject_id, ts, Var0, Var1, Var2, Var3, Var4, label.
# Rows that share the same subject_id and ts only differ in the categorical
# Var4 value.
# NOTE: as the rows mix integers, `np.nan` and strings, NumPy stores every
# value as a string (the missing value becomes the string 'nan'); the column
# dtypes are fixed afterwards, on the dataframe.
dmy_data = np.array([[0, 0, 23, 284, 70, 5, np.nan, 0],
                     [0, 1, 23, 284, 70, 5, 'b', 0],
                     [0, 2, 24, 270, 73, 5, 'b', 0],
                     [0, 3, 22, 290, 71, 5, 'a', 0],
                     [0, 3, 22, 290, 71, 5, 'b', 0],
                     [0, 4, 20, 288, 65, 4, 'a', 1],
                     [0, 4, 20, 288, 65, 4, 'b', 1],
                     [0, 5, 21, 297, 64, 4, 'a', 1],
                     [0, 5, 21, 297, 64, 4, 'b', 1],
                     [0, 5, 21, 297, 64, 4, 'c', 1],
                     [1, 0, 25, 300, 76, 5, 'a', 0],
                     [1, 1, 19, 283, 70, 5, 'c', 0],
                     [1, 2, 19, 306, 59, 5, 'a', 1],
                     [1, 2, 19, 306, 59, 5, 'c', 1],
                     [1, 3, 18, 298, 55, 3, 'c', 1],
                     [2, 0, 20, 250, 70, 5, 'c', 0],
                     [2, 1, 20, 254, 68, 4, 'a', 1],
                     [2, 1, 20, 254, 68, 4, 'c', 1],
                     [2, 2, 19, 244, 70, 3, 'a', 1],
                     [3, 0, 27, 264, 78, 4, 'b', 0],
                     [3, 1, 22, 293, 67, 4, 'b', 1],
                     [4, 0, 28, 290, 73, 5, 'b', 0],
                     [4, 1, 29, 288, 75, 5, 'b', 0],
                     [4, 2, 28, 289, 75, 5, 'b', 0],
                     [4, 5, 26, 290, 62, 5, 'b', 0],
                     [4, 6, 25, 285, 63, 4, 'b', 0],
                     [4, 12, 23, 280, 58, 4, 'b', 0],
                     [4, 12, 23, 280, 58, 4, 'c', 0],
                     [4, 14, 21, 282, 59, 3, 'a', 0],
                     [4, 14, 21, 282, 59, 3, 'b', 0],
                     [4, 14, 21, 282, 59, 3, 'c', 0],
                     [4, 15, 22, 277, 56, 2, 'a', 1],
                     [4, 16, 20, 270, 53, 2, 'a', 1],])

In [None]:
dmy_data

In [None]:
dmy_df = pd.DataFrame(dmy_data, columns=['subject_id', 'ts', 'Var0', 'Var1', 'Var2', 'Var3', 'Var4', 'label'])
dmy_df

In [None]:
dmy_df.dtypes

Fix the columns dtypes:

In [None]:
# Intended dtype of each column, listed in the dataframe's column order
column_dtypes = {'subject_id': int, 'ts': int,
                 'Var0': int, 'Var1': int, 'Var2': int, 'Var3': int,
                 'Var4': str, 'label': int}
# Cast the columns one at a time, overwriting each one in the same dataframe
for column_name, column_dtype in column_dtypes.items():
    dmy_df[column_name] = dmy_df[column_name].astype(column_dtype)

In [None]:
dmy_df.dtypes

In [None]:
# Columns that aren't used by the model to predict the label
non_feature_cols = ('subject_id', 'ts', 'label')
# List of used features: every remaining column, in the dataframe's order
dmy_cols = [col for col in dmy_df.columns if col not in non_feature_cols]

In [None]:
dmy_cols

In [None]:
dmy_df.index

In [None]:
dmy_df['subject_id'] == 0

In [None]:
dmy_df.index[dmy_df['subject_id'] == 4]

In [None]:
dmy_df.iloc[dmy_df.index[dmy_df['subject_id'] == 4]]

## Preparing the dataset

### Encoding categories

Converting the categorical feature `Var4` into one-hot encoded columns, so that it can be used by the neural networks and by the embedding layers.

~Encode each row's categorical value:~

One hot encode the categorical feature:

In [None]:
# dmy_df['Var4'], enum_dict = du.embedding.enum_categorical_feature(dmy_df, feature='Var4',
#                                                                   nan_value=0, forbidden_digit=0)
# dmy_df

In [None]:
%%time
x1 = pd.get_dummies(dmy_df, columns=['Var4'])
x1.head()

In [None]:
x1.dtypes

In [None]:
%%time
x2 = pd.get_dummies(dmy_df, columns=['Var4'], sparse=True)
x2.head()

In [None]:
x2.dtypes

In [None]:
x2.values

In [None]:
dmy_df, ohe_columns = du.data_processing.one_hot_encoding_dataframe(dmy_df, columns='Var4', 
                                                                    join_rows=False, 
                                                                    get_new_column_names=True, 
                                                                    inplace=True)
dmy_df

In [None]:
ohe_columns

### Joining the rows that have the same identifiers

In [None]:
dmy_df = du.embedding.join_repeated_rows(dmy_df, id_columns=['subject_id', 'ts'])
dmy_df

In [None]:
dmy_df.info(memory_usage='deep')

### Normalizing the features

In [None]:
dmy_df.describe().transpose()

In [None]:
dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
                                                           see_progress=False, get_stats=True)
dmy_norm_df

In [None]:
# dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
#                                                            categ_columns=['Var4'], see_progress=False,
#                                                            get_stats=True)
# dmy_norm_df

In [None]:
# dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
#                                                            columns_to_normalize=False,
#                                                            columns_to_normalize_categ=('Var4', ['Var0', 'Var1', 'Var2', 'Var3']), 
#                                                            see_progress=False, get_stats=True)
# dmy_norm_df

In [None]:
# dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
#                                                            columns_to_normalize=False,
#                                                            columns_to_normalize_categ=('Var4', 'Var0'), 
#                                                            see_progress=False, get_stats=True)
# dmy_norm_df

In [None]:
# Gather the normalization statistics in a single dictionary, in the format
# {feature: {'mean': ..., 'std': ...}}, following the order of `mean`
stats = {feature: {'mean': feature_mean, 'std': std[feature]}
         for feature, feature_mean in mean.items()}
stats

In [None]:
dmy_norm_df.describe().transpose()

### Padding

Pad the data so that all sequences have the same length (so that it can be converted to a PyTorch tensor).

In [None]:
padding_value = 999999

In [None]:
seq_len_dict = du.padding.get_sequence_length_dict(dmy_norm_df, id_column='subject_id', ts_column='ts')
seq_len_dict

In [None]:
data = du.padding.dataframe_to_padded_tensor(dmy_norm_df, seq_len_dict=seq_len_dict,
                                             id_column='subject_id', padding_value=padding_value)
data

In [None]:
data.shape

In [None]:
data[0]

In [None]:
data_perm = data.permute(1, 0, 2)
data_perm

In [None]:
data_perm.shape

In [None]:
data_perm[0]

### Dataset object

In [None]:
dataset = du.datasets.Time_Series_Dataset(dmy_norm_df, data)

### Separating into train and validation sets

Since this notebook is only for experimentation purposes, with a very small dummy dataset, we'll not be using a test set.

Training parameters:

In [None]:
batch_size = 32                                 # Number of patients in a mini batch
n_epochs = 100                                  # Number of epochs
lr = 0.001                                      # Learning rate

Separation in train and validation sets:

In [None]:
# Get the train and validation sets data loaders, which will allow loading batches
# (the third output is discarded, as no test set is used in this notebook)
# NOTE(review): the data loaders are created with a hard-coded `batch_size=4`, while
# the `batch_size` variable (32) is what gets passed to the training function later
# on - confirm which of the two values is intended
train_dataloader, val_dataloader, _ = du.machine_learning.create_train_sets(dataset, test_train_ratio=0, 
                                                                            validation_ratio=0.25,
                                                                            batch_size=4, get_indeces=False)

In [None]:
next(iter(train_dataloader))[0]

In [None]:
next(iter(val_dataloader))[0]

In [None]:
dataset.__len__()

## Models testing

### Vanilla LSTM



#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# NOTE: `n_inputs` counts every column of the dataframe, not just the model's inputs;
# 3 columns are subtracted from it when the model is instantiated
# (presumably `subject_id`, `ts` and `label` - TODO confirm)
n_inputs = len(dmy_norm_df.columns)           # Number of columns in the dataframe
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 2                                  # Number of LSTM layers
p_dropout = 0.2                               # Probability of dropout

Instantiating the model:

In [None]:
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout)
model

#### Training the model

In [None]:
next(model.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False)

In [None]:
next(model.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test', 
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

### LSTM with embedding layers

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# NOTE: `n_inputs` counts every column of the dataframe, not just the model's inputs;
# 3 columns are subtracted from it when the model is instantiated
# (presumably `subject_id`, `ts` and `label` - TODO confirm)
n_inputs = len(dmy_norm_df.columns)           # Number of columns in the dataframe
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 2                                  # Number of LSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded (the one hot encoded columns)
embed_features.sort()                         # Keep the indices in ascending order
embedding_dim = 2                             # Number of outputs of the embedding layer

Instantiating the model:

In [None]:
embed_features

In [None]:
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout,
                           embed_features=embed_features, embedding_dim=embedding_dim)
model

In [None]:
model.n_embeddings

#### Training the model

In [None]:
next(model.lstm.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
# model = du.deep_learning.train(model, train_dataloader_df, val_dataloader_df, seq_len_dict=seq_len_dict,
#                                batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
#                                padding_value=padding_value, do_test=False, log_comet_ml=False,
#                                already_embedded=True)

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False,
                               already_embedded=False)

In [None]:
next(model.lstm.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test', 
                                                   already_embedded=False,
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

### LSTM with embedding layers and time interval handling

#### Adding the time difference feature

In [None]:
# Difference between each row's timestamp and the previous one of the same subject;
# the first row of each subject has no previous timestamp, so its `delta_ts` is
# missing (NaN)
dmy_df['delta_ts'] = dmy_df.groupby('subject_id').ts.diff()
dmy_df

#### Normalizing the features

In [None]:
dmy_df.describe().transpose()

In [None]:
dmy_df.dtypes

In [None]:
dmy_norm_df = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
                                                see_progress=False)
dmy_norm_df

In [None]:
dmy_norm_df.describe().transpose()

#### Imputation

Replace the missing time difference values with the mean, which is zero as the features have already been normalized.

In [None]:
dmy_norm_df = du.data_processing.missing_values_imputation(dmy_norm_df, method='zero')
dmy_norm_df

#### Padding

Pad the data so that all sequences have the same length (so that it can be converted to a PyTorch tensor).

In [None]:
padding_value = 999999

In [None]:
seq_len_dict = du.padding.get_sequence_length_dict(dmy_norm_df, id_column='subject_id', ts_column='ts')
seq_len_dict

In [None]:
data = du.padding.dataframe_to_padded_tensor(dmy_norm_df, seq_len_dict=seq_len_dict,
                                             id_column='subject_id', padding_value=padding_value)
data

#### Dataset object

In [None]:
dataset = du.datasets.Time_Series_Dataset(dmy_norm_df, data)

#### Separating into train and validation sets

Since this notebook is only for experimentation purposes, with a very small dummy dataset, we'll not be using a test set.

Training parameters:

In [None]:
batch_size = 32                                 # Number of patients in a mini batch
n_epochs = 100                                  # Number of epochs
lr = 0.001                                      # Learning rate

Separation in train and validation sets:

In [None]:
# Get the train and validation sets data loaders, which will allow loading batches
# (the third output is discarded, as no test set is used in this notebook)
# NOTE(review): the data loaders are created with a hard-coded `batch_size=4`, while
# the `batch_size` variable (32) is what gets passed to the training function later
# on - confirm which of the two values is intended
train_dataloader, val_dataloader, _ = du.machine_learning.create_train_sets(dataset, test_train_ratio=0, 
                                                                            validation_ratio=0.25,
                                                                            batch_size=4, get_indeces=False)

In [None]:
next(iter(train_dataloader))[0]

In [None]:
next(iter(val_dataloader))[0]

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# NOTE: `n_inputs` counts every column of the dataframe, not just the model's inputs;
# 3 columns are subtracted from it when the model is instantiated
# (presumably `subject_id`, `ts` and `label` - TODO confirm)
n_inputs = len(dmy_norm_df.columns)           # Number of columns in the dataframe
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 2                                  # Number of LSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded (the one hot encoded columns)
embed_features.sort()                         # Keep the indices in ascending order
embedding_dim = 2                             # Number of outputs of the embedding layer

Instantiating the model:

In [None]:
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout,
                           embed_features=embed_features, embedding_dim=embedding_dim)
model

#### Training the model

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False)

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test', 
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

### T-LSTM

Implementation of the [_Patient Subtyping via Time-Aware LSTM Networks_](http://biometrics.cse.msu.edu/Publications/MachineLearning/Baytasetal_PatientSubtypingViaTimeAwareLSTMNetworks.pdf) paper.

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# NOTE: `n_inputs` counts every column of the dataframe, not just the model's inputs;
# 4 columns are subtracted from it when the T-LSTM model is instantiated
# (presumably `subject_id`, `ts`, `label` and `delta_ts` - TODO confirm)
n_inputs = len(dmy_norm_df.columns)           # Number of columns in the dataframe
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_rnn_layers = 4                              # Number of TLSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded (the one hot encoded columns)
embed_features.sort()                         # Keep the indices in ascending order
embedding_dim = 2                             # Number of outputs of the embedding layer
# delta_ts_col = du.search_explore.find_col_idx(dmy_norm_df, 'delta_ts')   # Number of the delta_ts column
elapsed_time = 'small'                                                   # Indicates if the elapsed time between events is small or long; influences how to discount elapsed time

In [None]:
n_inputs

In [None]:
dmy_norm_df.columns

In [None]:
embed_features

Instantiating the model:

In [None]:
model = Models.TLSTM(n_inputs-4, n_hidden, n_outputs, n_rnn_layers, p_dropout,
                     embed_features=embed_features, embedding_dim=embedding_dim, 
                     elapsed_time=elapsed_time)
model

In [None]:
model.rnn_layers[0].cell.input_size

In [None]:
model.rnn_layers[0].cell.hidden_size

In [None]:
model.rnn_layers[0].cell.weight_ih.shape

In [None]:
model.rnn_layers[0].cell.delta_ts_col

In [None]:
model.rnn_layers[1].cell.delta_ts_col

#### Training the model

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False,
                               is_custom=True)

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test', 
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

### MF1-LSTM

Implementation of the [_Predicting healthcare trajectories from medical records: A deep learning approach_](https://doi.org/10.1016/j.jbi.2017.04.001) paper, time decay version.

### MF2-LSTM

Implementation of the [_Predicting healthcare trajectories from medical records: A deep learning approach_](https://doi.org/10.1016/j.jbi.2017.04.001) paper, parametric time version.

### Deep Care with parametric time

Implementation of the [_Predicting healthcare trajectories from medical records: A deep learning approach_](https://doi.org/10.1016/j.jbi.2017.04.001) paper, full parametric time version.