# Models dummy tests

Testing the models defined in the project's classes, including their embedding layers and time interval handling, on dummy datasets.

## Importing the necessary packages

In [1]:
import pandas as pd                        # Pandas to load the data initially
import numpy as np                         # Mathematical operations package, allowing also for missing values representation
import torch                               # PyTorch for tensor and deep learning operations
import data_utils as du                    # Data science and machine learning relevant methods
import os                                  # os handles directory/workspace changes

In [2]:
# Inspect the random seed state held by data_utils before any seed is set
du.random_seed

('MT19937', array([2147483648, 1256770234, 2023027596,  646966712, 3955640658,
        3609286275, 3460428425, 2616305958,  298104640, 1379089345,
        3238465535, 4066412055,   20438531,  374122254, 4105707792,
        2173841797, 3097175487, 1303810596,  751552147, 3218156518,
        2659904734, 2409227131,  251342323, 2962663233, 3595250703,
        2355817816, 1429135382, 2709037071, 2896382551, 2317082265,
        1749179562, 1655546810, 4180322766, 2564358155, 1776217414,
        1268574891, 2063757078,  668487674,  161686507, 1562100172,
        1020134090,  971565978, 2604208961,  203694675, 3073636743,
        1552423754, 1603177638,  351777504, 1633410559, 1977544140,
         628210345, 1884716641,   71995294,  566383264, 1448786642,
         214027542, 1188785439, 3726604205, 3730932953, 1903233542,
        1706845640, 2412613327, 2277970351, 3565195159,  358878464,
        1383249184, 2026319583, 3053235915, 3495132103, 2846174471,
        3435798274,   23885755, 28907

In [3]:
# Set data_utils' random seed to a fixed value (presumably so that runs are reproducible)
du.set_random_seed(42)

In [4]:
# Check the seed that data_utils currently stores
du.random_seed

42

In [5]:
# Check whether data_utils is currently flagged as using modin
du.use_modin

True

In [6]:
# Tell data_utils to use plain pandas as its dataframe library
du.set_pandas_library('pandas')

In [7]:
# Check the modin flag again after selecting pandas
du.use_modin

False

In [8]:
import pixiedust                           # Debugging in Jupyter Notebook cells

Pixiedust database opened successfully


In [9]:
# Change to scripts directory
# NOTE(review): the path is relative to the current working directory, so this
# cell is not idempotent - re-running it without restarting the kernel ends up
# in a different directory
os.chdir('../../scripts')

In [10]:
from Tabular_Dataset import Tabular_Dataset # Dataset class that helps fetching batches of data
import Models                              # Script with all the machine learning model classes

In [11]:
# Change to parent directory (presumably "eICU-mortality-prediction")
# Relative paths used later in the notebook (e.g. 'models/') are resolved from here
os.chdir('..')

## Initializing variables

Data that we'll be using:

In [12]:
# Dummy dataset, one observation per row, laid out as
# (subject_id, Var0, Var1, Var2, Var3, Var4, label).
# NOTE: as the rows mix integers, a NaN and strings, NumPy stores every value
# as a string (the NaN becomes 'nan'), so the dtypes need to be fixed once the
# data is loaded into a dataframe.
dmy_rows = [
    (0, 23, 284, 70, 5, np.nan, 0),
    (91, 23, 284, 70, 5, 'b', 0),
    (92, 24, 270, 73, 5, 'b', 0),
    (93, 22, 290, 71, 5, 'a', 0),
    (93, 22, 290, 71, 5, 'b', 0),
    (94, 20, 288, 65, 4, 'a', 1),
    (94, 20, 288, 65, 4, 'b', 1),
    (95, 21, 297, 64, 4, 'a', 1),
    (95, 21, 297, 64, 4, 'b', 1),
    (95, 21, 297, 64, 4, 'c', 1),
    (10, 25, 300, 76, 5, 'a', 0),
    (11, 19, 283, 70, 5, 'c', 0),
    (12, 19, 306, 59, 5, 'a', 1),
    (12, 19, 306, 59, 5, 'c', 1),
    (13, 18, 298, 55, 3, 'c', 1),
    (20, 20, 250, 70, 5, 'c', 0),
    (21, 20, 254, 68, 4, 'a', 1),
    (21, 20, 254, 68, 4, 'c', 1),
    (22, 19, 244, 70, 3, 'a', 1),
    (30, 27, 264, 78, 4, 'b', 0),
    (31, 22, 293, 67, 4, 'b', 1),
]
dmy_data = np.array(dmy_rows)

In [13]:
# Display the raw array (every value, including the NaN, is stored as a string)
dmy_data

array([['0', '23', '284', '70', '5', 'nan', '0'],
       ['91', '23', '284', '70', '5', 'b', '0'],
       ['92', '24', '270', '73', '5', 'b', '0'],
       ['93', '22', '290', '71', '5', 'a', '0'],
       ['93', '22', '290', '71', '5', 'b', '0'],
       ['94', '20', '288', '65', '4', 'a', '1'],
       ['94', '20', '288', '65', '4', 'b', '1'],
       ['95', '21', '297', '64', '4', 'a', '1'],
       ['95', '21', '297', '64', '4', 'b', '1'],
       ['95', '21', '297', '64', '4', 'c', '1'],
       ['10', '25', '300', '76', '5', 'a', '0'],
       ['11', '19', '283', '70', '5', 'c', '0'],
       ['12', '19', '306', '59', '5', 'a', '1'],
       ['12', '19', '306', '59', '5', 'c', '1'],
       ['13', '18', '298', '55', '3', 'c', '1'],
       ['20', '20', '250', '70', '5', 'c', '0'],
       ['21', '20', '254', '68', '4', 'a', '1'],
       ['21', '20', '254', '68', '4', 'c', '1'],
       ['22', '19', '244', '70', '3', 'a', '1'],
       ['30', '27', '264', '78', '4', 'b', '0'],
       ['31', '22',

In [14]:
# Wrap the raw array in a dataframe, giving each column its name
dmy_col_names = ['subject_id', 'Var0', 'Var1', 'Var2', 'Var3', 'Var4', 'label']
dmy_df = pd.DataFrame(data=dmy_data, columns=dmy_col_names)
dmy_df

Unnamed: 0,subject_id,Var0,Var1,Var2,Var3,Var4,label
0,0,23,284,70,5,,0
1,91,23,284,70,5,b,0
2,92,24,270,73,5,b,0
3,93,22,290,71,5,a,0
4,93,22,290,71,5,b,0
5,94,20,288,65,4,a,1
6,94,20,288,65,4,b,1
7,95,21,297,64,4,a,1
8,95,21,297,64,4,b,1
9,95,21,297,64,4,c,1


In [15]:
# Check the column dtypes right after loading the string array (all of them are `object`)
dmy_df.dtypes

subject_id    object
Var0          object
Var1          object
Var2          object
Var3          object
Var4          object
label         object
dtype: object

Fix the columns' dtypes:

In [16]:
# Every column except the categorical `Var4` holds integers stored as strings
for int_col in ['subject_id', 'Var0', 'Var1', 'Var2', 'Var3', 'label']:
    dmy_df[int_col] = dmy_df[int_col].astype(int)

# `Var4` is categorical, so it is kept as a string
dmy_df['Var4'] = dmy_df['Var4'].astype(str)

In [17]:
# Confirm the dtypes after the conversion
dmy_df.dtypes

subject_id     int64
Var0           int64
Var1           int64
Var2           int64
Var3           int64
Var4          object
label          int64
dtype: object

In [18]:
# Columns that aren't used by the model to predict the label
unused_features = ('subject_id', 'label')

# List of used features: every remaining column, in dataframe order
dmy_cols = [col for col in dmy_df.columns if col not in unused_features]

In [19]:
# Display the features that remain after removing the identifier and the label
dmy_cols

['Var0', 'Var1', 'Var2', 'Var3', 'Var4']

## Preparing the dataset

### Encoding categories

Converting the categorical feature `Var4` into a numeric format, so that it can be used by the neural networks and by embedding layers.

Encode each row's categorical value:

In [20]:
# Replace each `Var4` category with an integer code; `enum_dict` receives the
# category -> code mapping returned by the helper
dmy_df['Var4'], enum_dict = du.embedding.enum_categorical_feature(dmy_df, feature='Var4')
dmy_df

Unnamed: 0,subject_id,Var0,Var1,Var2,Var3,Var4,label
0,0,23,284,70,5,0,0
1,91,23,284,70,5,2,0
2,92,24,270,73,5,2,0
3,93,22,290,71,5,3,0
4,93,22,290,71,5,2,0
5,94,20,288,65,4,3,1
6,94,20,288,65,4,2,1
7,95,21,297,64,4,3,1
8,95,21,297,64,4,2,1
9,95,21,297,64,4,4,1


In [21]:
# Display the category -> integer code mapping
enum_dict

{nan: 0, 'b': 2, 'a': 3, 'c': 4}

Join the rows and their categories:

In [22]:
# Collapse the rows that share a `subject_id` into one row; judging by the
# output, the subject's `Var4` codes are concatenated with ';' (e.g. '3;4')
# NOTE(review): this overwrites `dmy_df`, so the cell is not idempotent on re-run
dmy_df = du.embedding.join_categorical_enum(dmy_df, cat_feat='Var4', id_columns='subject_id')
dmy_df

Concatenating categorical encodings...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Joining continuous features...


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


Merging features' dataframes...
Done!


Unnamed: 0,subject_id,Var4,Var0,Var1,Var2,Var3,label
0,0,0,23,284,70,5,0
1,10,3,25,300,76,5,0
2,11,4,19,283,70,5,0
3,12,3;4,19,306,59,5,1
4,13,4,18,298,55,3,1
5,20,4,20,250,70,5,0
6,21,3;4,20,254,68,4,1
7,22,3,19,244,70,3,1
8,30,2,27,264,78,4,0
9,31,2,22,293,67,4,1


### Normalizing the features

In [23]:
# Summary statistics of the numeric columns before normalization
dmy_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,15.0,42.333333,37.884536,0.0,12.5,22.0,91.5,95.0
Var0,15.0,21.466667,2.559762,18.0,19.5,21.0,23.0,27.0
Var1,15.0,280.333333,19.437323,244.0,267.0,284.0,295.0,306.0
Var2,15.0,68.4,5.925731,55.0,66.0,70.0,70.5,78.0
Var3,15.0,4.4,0.736788,3.0,4.0,5.0,5.0,5.0
label,15.0,0.466667,0.516398,0.0,0.0,0.0,1.0,1.0


In [24]:
# Normalize the continuous features; the identifier and the columns listed in
# `categ_columns` (the encoded category and the label) are passed as columns
# to leave out of the normalization
dmy_norm_df = du.data_processing.normalize_data(dmy_df, id_columns='subject_id',
                                                categ_columns=['Var4', 'label'], see_progress=False)
dmy_norm_df

z-score normalizing columns ['Var0', 'Var1', 'Var2', 'Var3']...


Unnamed: 0,subject_id,Var4,Var0,Var1,Var2,Var3,label
0,0,0,0.599014,0.188641,0.270009,0.814345,0
1,10,3,1.380337,1.011799,1.282542,0.814345,0
2,11,4,-0.963631,0.137193,0.270009,0.814345,0
3,12,3;4,-0.963631,1.320484,-1.586302,0.814345,1
4,13,4,-1.354293,0.908904,-2.261324,-1.900138,1
5,20,4,-0.57297,-1.560572,0.270009,0.814345,0
6,21,3;4,-0.57297,-1.354782,-0.067502,-0.542897,1
7,22,3,-0.963631,-1.869256,0.270009,-1.900138,1
8,30,2,2.161659,-0.840308,1.620053,-0.542897,0
9,31,2,0.208353,0.651667,-0.236258,-0.542897,1


In [25]:
# Summary statistics after normalization, to check the normalized columns' mean and std
dmy_norm_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,15.0,42.33333,37.884536,0.0,12.5,22.0,91.5,95.0
Var0,15.0,6.587323e-16,1.0,-1.354293,-0.768301,-0.182309,0.599014,2.161659
Var1,15.0,9.473903e-16,1.0,-1.869256,-0.685966,0.188641,0.754562,1.320484
Var2,15.0,-9.621933e-16,1.0,-2.261324,-0.405013,0.270009,0.354387,1.620053
Var3,15.0,-4.884981e-16,1.0,-1.900138,-0.542897,0.814345,0.814345,0.814345
label,15.0,0.4666667,0.516398,0.0,0.0,0.0,1.0,1.0


### Converting string encodings to numeric

In [26]:
# Turn the ';'-separated string encodings of `Var4` into a single number per row
# (judging by the output, '3;4' becomes 304.0)
dmy_norm_df = du.embedding.string_encod_to_numeric(dmy_norm_df, cat_feat='Var4', inplace=True)
dmy_norm_df

Unnamed: 0,subject_id,Var4,Var0,Var1,Var2,Var3,label
0,0,0.0,0.599014,0.188641,0.270009,0.814345,0
1,10,3.0,1.380337,1.011799,1.282542,0.814345,0
2,11,4.0,-0.963631,0.137193,0.270009,0.814345,0
3,12,304.0,-0.963631,1.320484,-1.586302,0.814345,1
4,13,4.0,-1.354293,0.908904,-2.261324,-1.900138,1
5,20,4.0,-0.57297,-1.560572,0.270009,0.814345,0
6,21,304.0,-0.57297,-1.354782,-0.067502,-0.542897,1
7,22,3.0,-0.963631,-1.869256,0.270009,-1.900138,1
8,30,2.0,2.161659,-0.840308,1.620053,-0.542897,0
9,31,2.0,0.208353,0.651667,-0.236258,-0.542897,1


### Dataset object

In [27]:
# Build the dataset object from the data as a NumPy array plus the dataframe itself
# NOTE(review): presumably the dataframe is used to locate the label column - confirm in Tabular_Dataset
dataset = Tabular_Dataset(dmy_norm_df.to_numpy(), dmy_norm_df)

### Separating into train and validation sets

Since this notebook is only for experimentation purposes, with a very small dummy dataset, we'll not be using a test set.

Training parameters:

In [28]:
# NOTE(review): the data loaders created further down are built with a hard-coded
# batch_size=4 rather than this value, while this value is what gets passed to the
# training function - confirm which one is intended
batch_size = 32                                 # Number of patients in a mini batch
n_epochs = 50                                   # Number of epochs
lr = 0.001                                      # Learning rate

Separation in train and validation sets:

In [29]:
# Get the train and validation sets data loaders, which will allow loading batches
# The third returned value is discarded, as no test set is used (test_train_ratio=0)
# NOTE(review): batch_size is hard-coded to 4 here instead of using the `batch_size`
# training parameter defined earlier - confirm which one is intended
train_dataloader, val_dataloader, _ = du.machine_learning.create_train_sets(dataset, test_train_ratio=0, 
                                                                            validation_ratio=0.25,
                                                                            batch_size=4, get_indeces=False)

In [30]:
# Peek at the first element (presumably the features tensor) of one training batch
next(iter(train_dataloader))[0]

tensor([[ 9.4000e+01,  3.0200e+02, -5.7297e-01,  3.9443e-01, -5.7377e-01,
         -5.4290e-01],
        [ 9.5000e+01,  3.0204e+04, -1.8231e-01,  8.5746e-01, -7.4252e-01,
         -5.4290e-01],
        [ 3.0000e+01,  2.0000e+00,  2.1617e+00, -8.4031e-01,  1.6201e+00,
         -5.4290e-01],
        [ 1.2000e+01,  3.0400e+02, -9.6363e-01,  1.3205e+00, -1.5863e+00,
          8.1435e-01]], dtype=torch.float64)

In [31]:
# Peek at the first element (presumably the features tensor) of one validation batch
next(iter(val_dataloader))[0]

tensor([[ 9.3000e+01,  3.0200e+02,  2.0835e-01,  4.9733e-01,  4.3876e-01,
          8.1435e-01],
        [ 3.1000e+01,  2.0000e+00,  2.0835e-01,  6.5167e-01, -2.3626e-01,
         -5.4290e-01],
        [ 2.2000e+01,  3.0000e+00, -9.6363e-01, -1.8693e+00,  2.7001e-01,
         -1.9001e+00]], dtype=torch.float64)

## Models testing

### MLP

#### Creating the model

Model parameters:

In [32]:
# NOTE(review): `n_ids` is not used by any of the visible cells - confirm if it is still needed
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of unique subject IDs
n_inputs = len(dmy_norm_df.columns)           # Number of dataframe columns (includes subject_id and label)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 2                                  # Number of MLP layers
p_dropout = 0.2                               # Probability of dropout
use_batch_norm = False                        # Indicates if batch normalization is applied

Instantiating the model:

In [33]:
# `n_inputs` counts every dataframe column, so 2 is subtracted to discount the
# `subject_id` and `label` columns, which presumably aren't fed to the model as features
model = Models.MLP(n_inputs-2, n_hidden, n_outputs, n_layers, p_dropout, use_batch_norm)
model

MLP(
  (linear_layers): ModuleList(
    (0): Linear(in_features=5, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=1, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
)

#### Training the model

In [34]:
# Inspect the model's first parameter tensor before training
next(model.parameters())

Parameter containing:
tensor([[ 0.0661, -0.2088,  0.1140, -0.2060, -0.0524],
        [-0.1816,  0.2967, -0.3530, -0.2062, -0.1263],
        [-0.2689,  0.0422, -0.4417,  0.4039, -0.3799],
        [ 0.3453,  0.0744, -0.1452,  0.2764,  0.0697],
        [ 0.3613,  0.0489, -0.1410,  0.1202, -0.1213],
        [ 0.1882,  0.3993,  0.2585, -0.1955,  0.2582],
        [ 0.0800,  0.2271, -0.2726, -0.4427, -0.1728],
        [-0.3430,  0.3670,  0.1288,  0.1852,  0.1414],
        [-0.0078,  0.3500, -0.3178,  0.0282, -0.3052],
        [ 0.1379, -0.1540,  0.1370, -0.0932,  0.3709]], requires_grad=True)

In [35]:
# Train the MLP, passing 0 as `cols_to_remove` (the position of `subject_id` in the dataframe)
# NOTE(review): in the saved run this cell failed with
# "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn";
# the cause lies inside du.deep_learning.train and can't be determined from this notebook
model = du.deep_learning.train(model, train_dataloader, val_dataloader, cols_to_remove=0,
                               model_type='mlp', batch_size=batch_size, n_epochs=n_epochs, 
                               lr=lr, model_path='models/', do_test=False, log_comet_ml=False)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Inspect the first parameter tensor again, to compare it with its values before training
next(model.parameters())

#### Testing the model

In [None]:
# Run inference on the validation set, requesting the loss, accuracy and AUC metrics
# NOTE(review): `seq_len_dict` is not defined anywhere in the visible part of this
# notebook, so this cell would raise a NameError on a fresh kernel - confirm what
# should be passed here for a non-sequential MLP
output, metrics = du.deep_learning.model_inference(model, seq_len_dict, dataloader=val_dataloader, 
                                                   model_type='mlp', metrics=['loss', 'accuracy', 'AUC'],
                                                   output_rounded=False, set_name='test', 
                                                   cols_to_remove=du.search_explore.find_col_idx(dmy_norm_df, 'subject_id'))
output

In [None]:
# Display the metrics returned by the inference call
metrics

### MLP with embedding layers

#### Creating the model

Model parameters:

In [None]:
# NOTE(review): `n_ids` is not used by any of the visible cells - confirm if it is still needed
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of unique subject IDs
n_inputs = len(dmy_norm_df.columns)           # Number of dataframe columns (includes subject_id and label)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 1                                  # Number of MLP layers
p_dropout = 0.2                               # Probability of dropout
use_batch_norm = False                        # Indicates if batch normalization is applied

Instantiating the model:

In [None]:
# Number of entries in the encoding dictionary, plus one
len(enum_dict) + 1

In [None]:
# `n_inputs` counts every dataframe column, so 2 is subtracted to discount the
# `subject_id` and `label` columns, which presumably aren't fed to the model as features.
# The embedding size is derived from the encoding dictionary instead of being
# hard-coded, so that it stays in sync if the categories change
# (it evaluates to 5 for the current dummy data)
model = Models.MLP(n_inputs-2, n_hidden, n_outputs, n_layers, p_dropout, use_batch_norm,
                   embed_features=du.search_explore.find_col_idx(dmy_norm_df, 'Var4'),
                   num_embeddings=len(enum_dict) + 1,
                   embedding_dim=2)
model

#### Training the model

In [None]:
# Inspect the model's first parameter tensor before training
next(model.parameters())

In [None]:
# Train the MLP with embedding layers, passing 0 as `cols_to_remove`
# (the position of `subject_id` in the dataframe)
model = du.deep_learning.train(model, train_dataloader, val_dataloader, cols_to_remove=0,
                               model_type='mlp', batch_size=batch_size, n_epochs=n_epochs, 
                               lr=lr, model_path='models/', do_test=False, log_comet_ml=False)

In [None]:
# Inspect the first parameter tensor again, to compare it with its values before training
next(model.parameters())

#### Testing the model

In [None]:
# Run inference on the validation set, requesting the loss, accuracy and AUC metrics
# NOTE(review): `seq_len_dict` is not defined anywhere in the visible part of this
# notebook, so this cell would raise a NameError on a fresh kernel - confirm what
# should be passed here for a non-sequential MLP
output, metrics = du.deep_learning.model_inference(model, seq_len_dict, dataloader=val_dataloader, 
                                                   model_type='mlp', metrics=['loss', 'accuracy', 'AUC'],
                                                   output_rounded=False, set_name='test', 
                                                   cols_to_remove=du.search_explore.find_col_idx(dmy_norm_df, 'subject_id'))
output

In [None]:
# Display the metrics returned by the inference call
metrics

### Regularization Learning Network (RLN)

### SVM

### Decision tree

### XGBoost