# Models dummy tests

Testing the models defined in the project's classes, including the embedding layers and the handling of time intervals, on dummy datasets.

## Importing the necessary packages

In [None]:
import dask.dataframe as dd                # Dask to handle big data in dataframes
import pandas as pd                        # Pandas to load the data initially
from dask.distributed import Client        # Dask scheduler
import numpy as np                         # Mathematical operations package, allowing also for missing values representation
import torch                               # PyTorch for tensor and deep learning operations
import data_utils as du                    # Data science and machine learning relevant methods
import os                                  # os handles directory/workspace changes

In [None]:
# Display the random seed currently stored in data_utils, before it is set in the next cell
du.random_seed

In [None]:
# Set data_utils' random seed to a fixed value (presumably so that the notebook's results are reproducible)
du.set_random_seed(42)

In [None]:
# Display the random seed again, to check the effect of the `set_random_seed` call
du.random_seed

In [None]:
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Change to scripts directory, where the project modules imported in the next cell
# (`Time_Series_Dataset` and `Models`) are expected to be found
os.chdir('../../scripts')

In [None]:
from Time_Series_Dataset import Time_Series_Dataset # Dataset class that helps fetching batches of data
import Models                              # Script with all the machine learning model classes

In [None]:
# Change to parent directory (presumably "eICU-mortality-prediction"), so that relative
# paths used later in the notebook (e.g. `model_path='models/'`) are resolved from there
os.chdir('..')

## Initializing variables

Data that we'll be using:

In [None]:
# Each row is one observation, with the fields
# [subject_id, ts, Var0, Var1, Var2, Var3, Var4, label]
# NOTE: Because numbers, strings and `np.nan` are mixed in the same array, NumPy stores
# every value as a string (the missing value becomes 'nan'); the dtypes are fixed later,
# once the data is loaded into a pandas dataframe.
dmy_rows = [
    # Subject 0
    [0, 0, 23, 284, 70, 5, np.nan, 0],
    [0, 1, 23, 284, 70, 5, 'b', 0],
    [0, 2, 24, 270, 73, 5, 'b', 0],
    [0, 3, 22, 290, 71, 5, 'a', 0],
    [0, 3, 22, 290, 71, 5, 'b', 0],
    [0, 4, 20, 288, 65, 4, 'a', 1],
    [0, 4, 20, 288, 65, 4, 'b', 1],
    [0, 5, 21, 297, 64, 4, 'a', 1],
    [0, 5, 21, 297, 64, 4, 'b', 1],
    [0, 5, 21, 297, 64, 4, 'c', 1],
    # Subject 1
    [1, 0, 25, 300, 76, 5, 'a', 0],
    [1, 1, 19, 283, 70, 5, 'c', 0],
    [1, 2, 19, 306, 59, 5, 'a', 1],
    [1, 2, 19, 306, 59, 5, 'c', 1],
    [1, 3, 18, 298, 55, 3, 'c', 1],
    # Subject 2
    [2, 0, 20, 250, 70, 5, 'c', 0],
    [2, 1, 20, 254, 68, 4, 'a', 1],
    [2, 1, 20, 254, 68, 4, 'c', 1],
    [2, 2, 19, 244, 70, 3, 'a', 1],
    # Subject 3
    [3, 0, 27, 264, 78, 4, 'b', 0],
    [3, 1, 22, 293, 67, 4, 'b', 1],
]
dmy_data = np.array(dmy_rows)

In [None]:
# Display the raw dummy array
dmy_data

In [None]:
# Column names, in the same order as the fields of each row in `dmy_data`
dmy_col_names = ['subject_id', 'ts', 'Var0', 'Var1', 'Var2', 'Var3', 'Var4', 'label']
# Load the dummy array into a pandas dataframe
dmy_df = pd.DataFrame(data=dmy_data, columns=dmy_col_names)
dmy_df

In [None]:
# Check the dtype that each column got when the dataframe was created
dmy_df.dtypes

Fix the columns' dtypes:

In [None]:
# Every column except the categorical `Var4` is converted to integers
for int_col in ['subject_id', 'ts', 'Var0', 'Var1', 'Var2', 'Var3', 'label']:
    dmy_df[int_col] = dmy_df[int_col].astype(int)
# `Var4` holds the categories' names, so it's converted to strings
dmy_df['Var4'] = dmy_df['Var4'].astype(str)

In [None]:
# Confirm the columns' dtypes after the conversions
dmy_df.dtypes

In [None]:
# Columns that aren't used by the model to predict the label
non_feature_cols = ['subject_id', 'ts', 'label']

# List of used features: every column of the dataframe, except the ones above
dmy_cols = [col for col in dmy_df.columns if col not in non_feature_cols]

In [None]:
# Display the list of features that remained
dmy_cols

## Preparing the dataset

### Encoding categories

Converting the categorical feature `Var4` into a numeric format, so that it can be used by the neural networks and by embedding layers.

Encode each row's categorical value:

In [None]:
# Get the encoded version of `Var4`, along with the dictionary returned by the encoder
# (presumably the mapping between categories and their numeric codes — TODO confirm)
encoded_var4, enum_dict = du.embedding.enum_categorical_feature(dmy_df, feature='Var4')
# Replace the original column with its encoded version
dmy_df['Var4'] = encoded_var4
dmy_df

In [None]:
# Display the dictionary returned by `enum_categorical_feature`
enum_dict

Join the rows and their categories:

In [None]:
# Join the `Var4` categories of the rows that share the same `subject_id` and `ts`
# NOTE(review): behavior inferred from the function's name and arguments — confirm against data_utils
dmy_df = du.embedding.join_categorical_enum(dmy_df, cat_feat='Var4', id_columns=['subject_id', 'ts'])
dmy_df

### Normalizing the features

In [None]:
# Summary statistics before normalization, transposed so that each described column is a row
dmy_df.describe().T

In [None]:
# Check which columns the dataframe has once its index is reset
# (this is the dataframe that is passed to the normalization below)
dmy_df.reset_index().columns

In [None]:
# Reset the index before normalizing
dmy_flat_df = dmy_df.reset_index()
# Normalize the data, indicating which columns are identifiers and which one is
# meant for the embedding layer (presumably these are left untouched — TODO confirm)
dmy_norm_df = du.data_processing.normalize_data(
    dmy_flat_df,
    id_columns=['subject_id', 'ts'],
    embed_columns=['Var4'],
    see_progress=False,
)
dmy_norm_df

In [None]:
# Summary statistics after normalization, transposed so that each described column is a row
dmy_norm_df.describe().T

### Padding

Pad the data so that all sequences have the same length (so that it can be converted to a PyTorch tensor).

In [None]:
# Value used to fill the padded positions; the same value is passed to the padding,
# training and inference calls below
# NOTE(review): presumably chosen to be far from any real (normalized) feature value — confirm
padding_value = 999999

In [None]:
# Get the sequence lengths dictionary, using `subject_id` as the sequence identifier and `ts` as the timestamp
# (presumably maps each subject to the length of its sequence — TODO confirm against data_utils)
seq_len_dict = du.padding.get_sequence_length_dict(dmy_norm_df, id_column='subject_id', ts_column='ts')
seq_len_dict

In [None]:
# Convert the normalized dataframe into a padded tensor, where the sequences
# are filled with `padding_value` (presumably up to the longest sequence's length — TODO confirm)
data = du.padding.dataframe_to_padded_tensor(dmy_norm_df, seq_len_dict=seq_len_dict,
                                             id_column='subject_id', padding_value=padding_value)
data

### Dataset object

In [None]:
# Create the dataset object from the padded tensor and the normalized dataframe
# (according to its import comment, this class helps fetching batches of data)
dataset = Time_Series_Dataset(data, dmy_norm_df)

### Separating into train and validation sets

Since this notebook is only for experimentation purposes, with a very small dummy dataset, we'll not be using a test set.

Training parameters:

In [None]:
# Hyperparameters that are passed to the training calls below
batch_size = 32                                 # Number of patients in a mini batch
n_epochs = 50                                   # Number of epochs
lr = 0.001                                      # Learning rate

Splitting the data into train and validation sets:

In [None]:
# Get the train and validation sets data loaders, which will allow loading batches
train_dataloader, val_dataloader, _ = du.machine_learning.create_train_sets(dataset, test_train_ratio=0, 
                                                                            validation_ratio=0.25,
                                                                            batch_size=4, get_indeces=False)

In [None]:
# Fetch the first batch of the train data loader and display its first element
# (presumably the features tensor — TODO confirm)
next(iter(train_dataloader))[0]

In [None]:
# Fetch the first batch of the validation data loader and display its first element
# (presumably the features tensor — TODO confirm)
next(iter(val_dataloader))[0]

## Models testing

### Vanilla LSTM

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
n_inputs = len(dmy_norm_df.columns)           # Number of columns in the dataframe (3 are subtracted when instantiating the model)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 1                                  # Number of LSTM layers
p_dropout = 0.2                               # Probability of dropout

Instantiating the model:

In [None]:
# Instantiate the model; 3 columns are subtracted from the number of inputs
# (presumably `subject_id`, `ts` and `label`, which aren't used as features — TODO confirm)
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout)
model

#### Training the model

In [None]:
# Display the model's first parameter tensor before training, to compare with its values after training
next(model.parameters())

In [None]:
# Train the model on the train data loader, with the validation data loader also provided;
# testing and Comet ML logging are turned off (`do_test=False`, `log_comet_ml=False`)
# NOTE(review): `model_path='models/'` is presumably where model files are saved — confirm
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, model_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False)

In [None]:
# Display the model's first parameter tensor after training, to compare with its values before training
next(model.parameters())

#### Testing the model

In [None]:
# Indices of the identifier columns, which are indicated as columns to remove
id_col_idxs = [du.search_explore.find_col_idx(dmy_norm_df, id_col)
               for id_col in ['subject_id', 'ts']]
# Run the model on the validation data loader, requesting the loss, accuracy and AUC metrics
output, metrics = du.deep_learning.model_inference(
    model, seq_len_dict,
    dataloader=val_dataloader,
    metrics=['loss', 'accuracy', 'AUC'],
    padding_value=padding_value,
    output_rounded=False,
    set_name='test',
    cols_to_remove=id_col_idxs,
)
output

In [None]:
# Display the metrics returned by the inference call
metrics

### LSTM with embedding layers

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
n_inputs = len(dmy_norm_df.columns)           # Number of columns in the dataframe (3 are subtracted when instantiating the model)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 1                                  # Number of LSTM layers
p_dropout = 0.2                               # Probability of dropout

Instantiating the model:

In [None]:
# Number of entries in the encoding dictionary, plus one
# (presumably a reference for the `num_embeddings` argument used below — TODO confirm)
len(enum_dict) + 1

In [None]:
# Instantiate the same model class as before, now with an embedding layer for `Var4`,
# which is identified by its column index in the normalized dataframe
# NOTE(review): `num_embeddings=5` is hard-coded; check that it matches the number of
# encoded categories in `enum_dict`
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout,
                           embed_features=du.search_explore.find_col_idx(dmy_norm_df, 'Var4'), num_embeddings=5,
                           embedding_dim=2)
model

#### Training the model

In [None]:
# Display the model's first parameter tensor before training, to compare with its values after training
next(model.parameters())

In [None]:
# Train the model with the embedding layer, using the same settings as in the previous model;
# testing and Comet ML logging are turned off (`do_test=False`, `log_comet_ml=False`)
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, model_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False)

In [None]:
# Display the model's first parameter tensor after training, to compare with its values before training
next(model.parameters())

#### Testing the model

In [None]:
# Indices of the identifier columns, which are indicated as columns to remove
id_col_idxs = [du.search_explore.find_col_idx(dmy_norm_df, id_col)
               for id_col in ['subject_id', 'ts']]
# Run the model on the validation data loader, requesting the loss, accuracy and AUC metrics
output, metrics = du.deep_learning.model_inference(
    model, seq_len_dict,
    dataloader=val_dataloader,
    metrics=['loss', 'accuracy', 'AUC'],
    padding_value=padding_value,
    output_rounded=False,
    set_name='test',
    cols_to_remove=id_col_idxs,
)
output

In [None]:
# Display the metrics returned by the inference call
metrics

### LSTM with embedding layers and time interval handling

### TLSTM

### DeepCare

### Transformer-XL