# Models dummy tests
---

Testing the project-defined model classes, including their embedding layers and time interval handling, on dummy datasets.

## Importing the necessary packages

In [1]:
import comet_ml                            # Comet.ml can log training metrics, parameters, do version control and parameter optimization
import os                                  # os handles directory/workspace changes
import pandas as pd                        # Pandas to load the data initially
# import modin.pandas as pd                  # Optimized distributed version of Pandas
import numpy as np                         # Mathematical operations package, allowing also for missing values representation
import torch                               # PyTorch for tensor and deep learning operations
import plotly.graph_objs as go             # Plotly for interactive and pretty plots
import data_utils as du                    # Data science and machine learning relevant methods
from model_interpreter.model_interpreter import ModelInterpreter  # Model interpretability class
import shap                                # Model-agnostic interpretability package inspired on Shapley values

In [2]:
du.random_seed

('MT19937',
 array([2147483648, 3591511506,  574023892, 2158020808, 1830528619,
        1650946135,  940851398, 1302755870, 1495656709,  761602821,
         262900457, 3397612683, 2385523360, 3736147681, 3442740230,
          60512451, 3063499481, 1991615541, 3549958074, 1647251719,
         959996168, 3686436682,  489042740,  646843638, 3968238950,
        1062261360, 2081747606, 4233830547, 3564567078, 3289275422,
        2485877337,  194726275, 3099843011, 3848219423,  452599704,
          35102207, 1225815776, 1623452787,  379192776, 1414193010,
        1563263885, 3410291620,  927725705, 1630697658, 1273846013,
        2933701644, 2868429130, 2441921364, 1683887774, 1764351017,
          77879814, 3877176241, 3757311812, 4046020185,  376913637,
        1391882510,  234700701, 4070141241,  967735468, 2270869778,
          68103903,  208773857, 2287073443, 2466674846, 1973045786,
        3375962617, 1529789644, 3371022725, 3980440368, 1930641443,
        1218127177, 2208524059, 3229

In [3]:
du.set_random_seed(42)

In [4]:
du.random_seed

42

In [5]:
du.set_pandas_library(lib='pandas')

In [6]:
import pixiedust                           # Debugging in Jupyter Notebook cells

Pixiedust database opened successfully


In [7]:
# Change to scripts directory
os.chdir('../../scripts')

In [8]:
import Models                              # Script with all the machine learning model classes

In [9]:
# Change to parent directory (presumably "eICU-mortality-prediction")
os.chdir('..')

## Initializing variables

Comet ML settings:

In [None]:
import getpass                             # Prompts for secrets without echoing them

# Ask for the Comet ML settings interactively instead of hardcoding them in the notebook
comet_ml_project_name = input('Comet ML project name:')
comet_ml_workspace = input('Comet ML workspace:')
# The API key is read through getpass so that it is not echoed while typing
comet_ml_api_key = getpass.getpass('Comet ML API key')

Data that we'll be using:

In [10]:
# Dummy dataset, one row per observation. The eight positions of each row are
# later named subject_id, ts, Var0, Var1, Var2, Var3, Var4 and label.
# Because the rows mix numbers with strings (and one np.nan), NumPy stores every
# value as a string (the missing value becomes 'nan'); the column dtypes are
# fixed further down, once the data is in a dataframe.
# Several rows share the same (subject_id, ts) pair and differ only in the
# categorical value at position 6.
dmy_data = np.array([[0, 0, 23, 284, 70, 5, np.nan, 0],
                     [0, 1, 23, 284, 70, 5, 'b', 0],
                     [0, 2, 24, 270, 73, 5, 'b', 0],
                     [0, 3, 22, 290, 71, 5, 'a', 0],
                     [0, 3, 22, 290, 71, 5, 'b', 0],
                     [0, 4, 20, 288, 65, 4, 'a', 1],
                     [0, 4, 20, 288, 65, 4, 'b', 1],
                     [0, 5, 21, 297, 64, 4, 'a', 1],
                     [0, 5, 21, 297, 64, 4, 'b', 1],
                     [0, 5, 21, 297, 64, 4, 'c', 1],
                     [1, 0, 25, 300, 76, 5, 'a', 0],
                     [1, 1, 19, 283, 70, 5, 'c', 0],
                     [1, 2, 19, 306, 59, 5, 'a', 1],
                     [1, 2, 19, 306, 59, 5, 'c', 1],
                     [1, 3, 18, 298, 55, 3, 'c', 1],
                     [2, 0, 20, 250, 70, 5, 'c', 0],
                     [2, 1, 20, 254, 68, 4, 'a', 1],
                     [2, 1, 20, 254, 68, 4, 'c', 1],
                     [2, 2, 19, 244, 70, 3, 'a', 1],
                     [3, 0, 27, 264, 78, 4, 'b', 0],
                     [3, 1, 22, 293, 67, 4, 'b', 1],
                     [4, 0, 28, 290, 73, 5, 'b', 0],
                     [4, 1, 29, 288, 75, 5, 'b', 0],
                     [4, 2, 28, 289, 75, 5, 'b', 0],
                     [4, 5, 26, 290, 62, 5, 'b', 0],
                     [4, 6, 25, 285, 63, 4, 'b', 0],
                     [4, 12, 23, 280, 58, 4, 'b', 0],
                     [4, 12, 23, 280, 58, 4, 'c', 0],
                     [4, 14, 21, 282, 59, 3, 'a', 0],
                     [4, 14, 21, 282, 59, 3, 'b', 0],
                     [4, 14, 21, 282, 59, 3, 'c', 0],
                     [4, 15, 22, 277, 56, 2, 'a', 1],
                     [4, 16, 20, 270, 53, 2, 'a', 1],])

In [11]:
dmy_data

array([['0', '0', '23', '284', '70', '5', 'nan', '0'],
       ['0', '1', '23', '284', '70', '5', 'b', '0'],
       ['0', '2', '24', '270', '73', '5', 'b', '0'],
       ['0', '3', '22', '290', '71', '5', 'a', '0'],
       ['0', '3', '22', '290', '71', '5', 'b', '0'],
       ['0', '4', '20', '288', '65', '4', 'a', '1'],
       ['0', '4', '20', '288', '65', '4', 'b', '1'],
       ['0', '5', '21', '297', '64', '4', 'a', '1'],
       ['0', '5', '21', '297', '64', '4', 'b', '1'],
       ['0', '5', '21', '297', '64', '4', 'c', '1'],
       ['1', '0', '25', '300', '76', '5', 'a', '0'],
       ['1', '1', '19', '283', '70', '5', 'c', '0'],
       ['1', '2', '19', '306', '59', '5', 'a', '1'],
       ['1', '2', '19', '306', '59', '5', 'c', '1'],
       ['1', '3', '18', '298', '55', '3', 'c', '1'],
       ['2', '0', '20', '250', '70', '5', 'c', '0'],
       ['2', '1', '20', '254', '68', '4', 'a', '1'],
       ['2', '1', '20', '254', '68', '4', 'c', '1'],
       ['2', '2', '19', '244', '70', '3', 'a

In [12]:
dmy_df = pd.DataFrame(dmy_data, columns=['subject_id', 'ts', 'Var0', 'Var1', 'Var2', 'Var3', 'Var4', 'label'])
dmy_df

Unnamed: 0,subject_id,ts,Var0,Var1,Var2,Var3,Var4,label
0,0,0,23,284,70,5,,0
1,0,1,23,284,70,5,b,0
2,0,2,24,270,73,5,b,0
3,0,3,22,290,71,5,a,0
4,0,3,22,290,71,5,b,0
5,0,4,20,288,65,4,a,1
6,0,4,20,288,65,4,b,1
7,0,5,21,297,64,4,a,1
8,0,5,21,297,64,4,b,1
9,0,5,21,297,64,4,c,1


In [13]:
dmy_df.dtypes

subject_id    object
ts            object
Var0          object
Var1          object
Var2          object
Var3          object
Var4          object
label         object
dtype: object

Fix the columns dtypes:

In [14]:
# Every column arrives as `object`; cast them all in one pass.
# All columns are integers except `Var4`, the categorical feature, which stays a string.
dmy_df = dmy_df.astype({
    'subject_id': int,
    'ts': int,
    'Var0': int,
    'Var1': int,
    'Var2': int,
    'Var3': int,
    'Var4': str,
    'label': int,
})

In [15]:
dmy_df.dtypes

subject_id     int64
ts             int64
Var0           int64
Var1           int64
Var2           int64
Var3           int64
Var4          object
label          int64
dtype: object

In [16]:
# Columns that identify a row or hold the target, i.e. that the model doesn't use as inputs
non_feature_cols = ['subject_id', 'ts', 'label']
# Start from every column of the dataframe...
dmy_cols = dmy_df.columns.tolist()
# ...and drop the non-feature ones, leaving the list of used features
while non_feature_cols:
    dmy_cols.remove(non_feature_cols.pop(0))

In [17]:
dmy_cols

In [18]:
dmy_df.index

RangeIndex(start=0, stop=33, step=1)

In [19]:
dmy_df['subject_id'] == 0

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
Name: subject_id, dtype: bool

In [20]:
dmy_df.index[dmy_df['subject_id'] == 4]

Int64Index([21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32], dtype='int64')

In [21]:
# Select the rows of subject 4 with a boolean mask through `.loc`.
# Feeding index *labels* to the positional `.iloc` indexer only gives the right
# rows while the dataframe still has its default 0..n-1 index.
dmy_df.loc[dmy_df['subject_id'] == 4]

Unnamed: 0,subject_id,ts,Var0,Var1,Var2,Var3,Var4,label
21,4,0,28,290,73,5,b,0
22,4,1,29,288,75,5,b,0
23,4,2,28,289,75,5,b,0
24,4,5,26,290,62,5,b,0
25,4,6,25,285,63,4,b,0
26,4,12,23,280,58,4,b,0
27,4,12,23,280,58,4,c,0
28,4,14,21,282,59,3,a,0
29,4,14,21,282,59,3,b,0
30,4,14,21,282,59,3,c,0


In [22]:
# Index the dataframe by subject and timestamp (reassigning rather than mutating in place)
dmy_df = dmy_df.set_index(['subject_id', 'ts'])

In [23]:
type(dmy_df)

pandas.core.frame.DataFrame

In [24]:
dmy_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Var0,Var1,Var2,Var3,Var4,label
subject_id,ts,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,23,284,70,5,,0
0,1,23,284,70,5,b,0
0,2,24,270,73,5,b,0
0,3,22,290,71,5,a,0
0,3,22,290,71,5,b,0
0,4,20,288,65,4,a,1
0,4,20,288,65,4,b,1
0,5,21,297,64,4,a,1
0,5,21,297,64,4,b,1
0,5,21,297,64,4,c,1


In [25]:
dmy_df.index

MultiIndex([(0,  0),
            (0,  1),
            (0,  2),
            (0,  3),
            (0,  3),
            (0,  4),
            (0,  4),
            (0,  5),
            (0,  5),
            (0,  5),
            (1,  0),
            (1,  1),
            (1,  2),
            (1,  2),
            (1,  3),
            (2,  0),
            (2,  1),
            (2,  1),
            (2,  2),
            (3,  0),
            (3,  1),
            (4,  0),
            (4,  1),
            (4,  2),
            (4,  5),
            (4,  6),
            (4, 12),
            (4, 12),
            (4, 14),
            (4, 14),
            (4, 14),
            (4, 15),
            (4, 16)],
           names=['subject_id', 'ts'])

Define whether the notebook will run hyperparameter optimization on each model:

In [26]:
do_hyperparam_optim = False

## Preparing the dataset

### Encoding categories

Converting the categorical feature `Var4` into one hot encoded columns, so that it can be used by the neural networks and by embedding layers.

~~Encode each row's categorical value:~~

One hot encode the categorical feature:

In [27]:
# dmy_df['Var4'], enum_dict = du.embedding.enum_categorical_feature(dmy_df, feature='Var4',
#                                                                   nan_value=0, forbidden_digit=0)
# dmy_df

In [28]:
%%time
x1 = pd.get_dummies(dmy_df, columns=['Var4'])
x1.head()

CPU times: user 4.89 ms, sys: 1.54 ms, total: 6.44 ms
Wall time: 7.02 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,Var0,Var1,Var2,Var3,label,Var4_a,Var4_b,Var4_c,Var4_nan
subject_id,ts,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,23,284,70,5,0,0,0,0,1
0,1,23,284,70,5,0,0,1,0,0
0,2,24,270,73,5,0,0,1,0,0
0,3,22,290,71,5,0,1,0,0,0
0,3,22,290,71,5,0,0,1,0,0


In [29]:
x1.dtypes

Var0        int64
Var1        int64
Var2        int64
Var3        int64
label       int64
Var4_a      uint8
Var4_b      uint8
Var4_c      uint8
Var4_nan    uint8
dtype: object

In [30]:
%%time
x2 = pd.get_dummies(dmy_df, columns=['Var4'], sparse=True)
x2.head()

CPU times: user 7.66 ms, sys: 2.12 ms, total: 9.79 ms
Wall time: 12.8 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,Var0,Var1,Var2,Var3,label,Var4_a,Var4_b,Var4_c,Var4_nan
subject_id,ts,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,23,284,70,5,0,0,0,0,1
0,1,23,284,70,5,0,0,1,0,0
0,2,24,270,73,5,0,0,1,0,0
0,3,22,290,71,5,0,1,0,0,0
0,3,22,290,71,5,0,0,1,0,0


In [31]:
x2.dtypes

Var0                   int64
Var1                   int64
Var2                   int64
Var3                   int64
label                  int64
Var4_a      Sparse[uint8, 0]
Var4_b      Sparse[uint8, 0]
Var4_c      Sparse[uint8, 0]
Var4_nan    Sparse[uint8, 0]
dtype: object

In [32]:
x2.values

array([[23, 284, 70, 5, 0, 0, 0, 0, 1],
       [23, 284, 70, 5, 0, 0, 1, 0, 0],
       [24, 270, 73, 5, 0, 0, 1, 0, 0],
       [22, 290, 71, 5, 0, 1, 0, 0, 0],
       [22, 290, 71, 5, 0, 0, 1, 0, 0],
       [20, 288, 65, 4, 1, 1, 0, 0, 0],
       [20, 288, 65, 4, 1, 0, 1, 0, 0],
       [21, 297, 64, 4, 1, 1, 0, 0, 0],
       [21, 297, 64, 4, 1, 0, 1, 0, 0],
       [21, 297, 64, 4, 1, 0, 0, 1, 0],
       [25, 300, 76, 5, 0, 1, 0, 0, 0],
       [19, 283, 70, 5, 0, 0, 0, 1, 0],
       [19, 306, 59, 5, 1, 1, 0, 0, 0],
       [19, 306, 59, 5, 1, 0, 0, 1, 0],
       [18, 298, 55, 3, 1, 0, 0, 1, 0],
       [20, 250, 70, 5, 0, 0, 0, 1, 0],
       [20, 254, 68, 4, 1, 1, 0, 0, 0],
       [20, 254, 68, 4, 1, 0, 0, 1, 0],
       [19, 244, 70, 3, 1, 1, 0, 0, 0],
       [27, 264, 78, 4, 0, 0, 1, 0, 0],
       [22, 293, 67, 4, 1, 0, 1, 0, 0],
       [28, 290, 73, 5, 0, 0, 1, 0, 0],
       [29, 288, 75, 5, 0, 0, 1, 0, 0],
       [28, 289, 75, 5, 0, 0, 1, 0, 0],
       [26, 290, 62, 5, 0, 0, 1, 0, 0],


In [33]:
dmy_df, ohe_columns = du.data_processing.one_hot_encoding_dataframe(dmy_df, columns='Var4', 
                                                                    join_rows=False, 
                                                                    get_new_column_names=True, 
                                                                    inplace=True)
dmy_df

Cleaning the categorical columns...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Getting dummies...
Done!


Unnamed: 0_level_0,Unnamed: 1_level_0,Var0,Var1,Var2,Var3,label,Var4_a,Var4_b,Var4_c
subject_id,ts,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,23,284,70,5,0,0,0,0
0,1,23,284,70,5,0,0,1,0
0,2,24,270,73,5,0,0,1,0
0,3,22,290,71,5,0,1,0,0
0,3,22,290,71,5,0,0,1,0
0,4,20,288,65,4,1,1,0,0
0,4,20,288,65,4,1,0,1,0
0,5,21,297,64,4,1,1,0,0
0,5,21,297,64,4,1,0,1,0
0,5,21,297,64,4,1,0,0,1


In [34]:
ohe_columns

### Joining the rows that have the same identifiers

In [35]:
dmy_df = du.embedding.join_repeated_rows(dmy_df, id_columns=['subject_id', 'ts'])
dmy_df

Joining boolean features...
Joining continuous features...


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Merging features' dataframes...
Done!


Unnamed: 0,subject_id,ts,Var4_a,Var4_b,Var4_c,label,Var0,Var1,Var2,Var3
0,0,0,0,0,0,0,23,284,70,5
1,0,1,0,1,0,0,23,284,70,5
2,0,2,0,1,0,0,24,270,73,5
3,0,3,1,1,0,0,22,290,71,5
4,0,4,1,1,0,1,20,288,65,4
5,0,5,1,1,1,1,21,297,64,4
6,1,0,1,0,0,0,25,300,76,5
7,1,1,0,0,1,0,19,283,70,5
8,1,2,1,0,1,1,19,306,59,5
9,1,3,0,0,1,1,18,298,55,3


In [36]:
dmy_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 0 to 23
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   subject_id  24 non-null     int64
 1   ts          24 non-null     int64
 2   Var4_a      24 non-null     UInt8
 3   Var4_b      24 non-null     UInt8
 4   Var4_c      24 non-null     UInt8
 5   label       24 non-null     UInt8
 6   Var0        24 non-null     int64
 7   Var1        24 non-null     int64
 8   Var2        24 non-null     int64
 9   Var3        24 non-null     int64
dtypes: UInt8(4), int64(6)
memory usage: 1.5 KB


In [37]:
# Testing the merge of boolean columns
tmp_df = dmy_df.rename(columns={'Var4_a': 'Var4_x', 'Var4_b': 'Var4_y'})
tmp_df.head()

Unnamed: 0,subject_id,ts,Var4_x,Var4_y,Var4_c,label,Var0,Var1,Var2,Var3
0,0,0,0,0,0,0,23,284,70,5
1,0,1,0,1,0,0,23,284,70,5
2,0,2,0,1,0,0,24,270,73,5
3,0,3,1,1,0,0,22,290,71,5
4,0,4,1,1,0,1,20,288,65,4


In [38]:
du.data_processing.merge_columns(tmp_df, cols_to_merge='Var4')

Merging the duplicate columns...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Removing old columns...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Done!


Unnamed: 0,subject_id,ts,Var4_c,label,Var0,Var1,Var2,Var3,Var4
0,0,0,0,0,23,284,70,5,0
1,0,1,0,0,23,284,70,5,1
2,0,2,0,0,24,270,73,5,1
3,0,3,0,0,22,290,71,5,1
4,0,4,0,1,20,288,65,4,1
5,0,5,1,1,21,297,64,4,1
6,1,0,0,0,25,300,76,5,1
7,1,1,1,0,19,283,70,5,0
8,1,2,1,1,19,306,59,5,1
9,1,3,1,1,18,298,55,3,0


### Normalizing the features

In [39]:
dmy_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,24.0,2.166667,1.685402,0.0,0.75,2.0,4.0,4.0
ts,24.0,4.0,5.013027,0.0,1.0,2.0,5.0,16.0
Var4_a,24.0,0.416667,0.50361,0.0,0.0,0.0,1.0,1.0
Var4_b,24.0,0.583333,0.50361,0.0,0.0,1.0,1.0,1.0
Var4_c,24.0,0.333333,0.481543,0.0,0.0,0.0,1.0,1.0
label,24.0,0.375,0.494535,0.0,0.0,0.0,1.0,1.0
Var0,24.0,22.666667,3.212295,18.0,20.0,22.0,25.0,29.0
Var1,24.0,281.5,15.770087,244.0,275.25,284.5,290.0,306.0
Var2,24.0,66.666667,7.178874,53.0,61.25,69.0,71.5,78.0
Var3,24.0,4.208333,0.977093,2.0,4.0,4.5,5.0,5.0


In [40]:
dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
                                                           see_progress=False, get_stats=True)
dmy_norm_df

z-score normalizing columns ['Var0', 'Var1', 'Var2', 'Var3']...


Unnamed: 0,subject_id,ts,Var4_a,Var4_b,Var4_c,label,Var0,Var1,Var2,Var3
0,0,0,0,0,0,0,0.103768,0.158528,0.464325,0.810227
1,0,1,0,1,0,0,0.103768,0.158528,0.464325,0.810227
2,0,2,0,1,0,0,0.415072,-0.729229,0.882218,0.810227
3,0,3,1,1,0,0,-0.207536,0.538995,0.603623,0.810227
4,0,4,1,1,0,1,-0.830144,0.412173,-0.232163,-0.213218
5,0,5,1,1,1,1,-0.51884,0.982873,-0.37146,-0.213218
6,1,0,1,0,0,0,0.726376,1.173107,1.300111,0.810227
7,1,1,0,0,1,0,-1.141448,0.095117,0.464325,0.810227
8,1,2,1,0,1,1,-1.141448,1.553574,-1.067948,0.810227
9,1,3,0,0,1,1,-1.452751,1.046285,-1.625139,-1.236662


In [41]:
# dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
#                                                            categ_columns=['Var4'], see_progress=False,
#                                                            get_stats=True)
# dmy_norm_df

In [42]:
# dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
#                                                            columns_to_normalize=False,
#                                                            columns_to_normalize_categ=('Var4', ['Var0', 'Var1', 'Var2', 'Var3']), 
#                                                            see_progress=False, get_stats=True)
# dmy_norm_df

In [43]:
# dmy_norm_df, mean, std = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
#                                                            columns_to_normalize=False,
#                                                            columns_to_normalize_categ=('Var4', 'Var0'), 
#                                                            see_progress=False, get_stats=True)
# dmy_norm_df

In [44]:
# Gather the normalization statistics per feature: {feature: {'mean': ..., 'std': ...}}
stats = {feature: {'mean': mean[feature], 'std': std[feature]}
         for feature in mean.keys()}
stats

{'Var0': {'mean': 22.666666666666668, 'std': 3.2122952198871983},
 'Var1': {'mean': 281.5, 'std': 15.770087259552911},
 'Var2': {'mean': 66.66666666666667, 'std': 7.178873998341103},
 'Var3': {'mean': 4.208333333333333, 'std': 0.9770927002733857}}

In [45]:
dmy_norm_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,24.0,2.166667,1.685402,0.0,0.75,2.0,4.0,4.0
ts,24.0,4.0,5.013027,0.0,1.0,2.0,5.0,16.0
Var4_a,24.0,0.4166667,0.50361,0.0,0.0,0.0,1.0,1.0
Var4_b,24.0,0.5833333,0.50361,0.0,0.0,1.0,1.0,1.0
Var4_c,24.0,0.3333333,0.481543,0.0,0.0,0.0,1.0,1.0
label,24.0,0.375,0.494535,0.0,0.0,0.0,1.0,1.0
Var0,24.0,-3.700743e-16,1.0,-1.452751,-0.830144,-0.207536,0.726376,1.971591
Var1,24.0,9.251859000000001e-18,1.0,-2.37792,-0.39632,0.190234,0.538995,1.553574
Var2,24.0,-6.661338e-16,1.0,-1.903734,-0.754529,0.325028,0.673272,1.578706
Var3,24.0,2.960595e-16,1.0,-2.260106,-0.213218,0.298505,0.810227,0.810227


### Padding

Pad the data so that all sequences have the same length (so that it can be converted to a PyTorch tensor).

In [46]:
padding_value = 999999

In [47]:
seq_len_dict = du.padding.get_sequence_length_dict(dmy_norm_df, id_column='subject_id', ts_column='ts')
seq_len_dict

{0: 6, 1: 4, 2: 3, 3: 2, 4: 9}

In [48]:
data = du.padding.dataframe_to_padded_tensor(dmy_norm_df, seq_len_dict=seq_len_dict,
                                             id_column='subject_id', padding_value=padding_value)
data

tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
         [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           0.0000e+00,  1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
         [ 0.0000e+00,  2.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           0.0000e+00,  4.1507e-01, -7.2923e-01,  8.8222e-01,  8.1023e-01],
         [ 0.0000e+00,  3.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
           0.0000e+00, -2.0754e-01,  5.3900e-01,  6.0362e-01,  8.1023e-01],
         [ 0.0000e+00,  4.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00, -8.3014e-01,  4.1217e-01, -2.3216e-01, -2.1322e-01],
         [ 0.0000e+00,  5.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
           1.0000e+00, -5.1884e-01,  9.8287e-01, -3.7146e-01, -2.1322e-01],
         [ 1.0000e+06,  1.0000e+06,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           1.0000e+

In [49]:
data.shape

torch.Size([5, 9, 10])

In [50]:
data[0]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
        [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00,  1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
        [ 0.0000e+00,  2.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00,  4.1507e-01, -7.2923e-01,  8.8222e-01,  8.1023e-01],
        [ 0.0000e+00,  3.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00, -2.0754e-01,  5.3900e-01,  6.0362e-01,  8.1023e-01],
        [ 0.0000e+00,  4.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00, -8.3014e-01,  4.1217e-01, -2.3216e-01, -2.1322e-01],
        [ 0.0000e+00,  5.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
          1.0000e+00, -5.1884e-01,  9.8287e-01, -3.7146e-01, -2.1322e-01],
        [ 1.0000e+06,  1.0000e+06,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.0000e+06,  1.0000e+0

In [51]:
data_perm = data.permute(1, 0, 2)
data_perm

tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
         [ 1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  7.2638e-01,  1.1731e+00,  1.3001e+00,  8.1023e-01],
         [ 2.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
           0.0000e+00, -8.3014e-01, -1.9975e+00,  4.6433e-01,  8.1023e-01],
         [ 3.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           0.0000e+00,  1.3490e+00, -1.1097e+00,  1.5787e+00, -2.1322e-01],
         [ 4.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           0.0000e+00,  1.6603e+00,  5.3900e-01,  8.8222e-01,  8.1023e-01]],

        [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           0.0000e+00,  1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
         [ 1.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
           0.0000

In [52]:
data_perm.shape

torch.Size([9, 5, 10])

In [53]:
data_perm[0]

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.1038,  0.1585,
          0.4643,  0.8102],
        [ 1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.7264,  1.1731,
          1.3001,  0.8102],
        [ 2.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000, -0.8301, -1.9975,
          0.4643,  0.8102],
        [ 3.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.3490, -1.1097,
          1.5787, -0.2132],
        [ 4.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.6603,  0.5390,
          0.8822,  0.8102]], dtype=torch.float64)

### Dataset object

In [54]:
dataset = du.datasets.Time_Series_Dataset(dmy_norm_df, data)

### Separating into train and validation sets

Since this notebook is only for experimentation purposes, with a very small dummy dataset, we'll not be using a test set.

Training parameters:

In [55]:
batch_size = 32                                 # Number of patients in a mini batch
n_epochs = 100                                  # Number of epochs
lr = 0.001                                      # Learning rate

Separation in train and validation sets:

In [59]:
# Get the train and validation sets data loaders, which will allow loading batches.
# The mini batch size is taken from the `batch_size` training parameter instead of a
# separate hard-coded value, so the data loaders and the training call agree on it.
# No test set is requested (test_train_ratio=0), so the third returned value is discarded.
train_dataloader, val_dataloader, _ = du.machine_learning.create_train_sets(
    dataset,
    test_train_ratio=0,
    validation_ratio=0.25,
    batch_size=batch_size,
    get_indices=False,
)

DEBUG: Now inside the `create_train_sets` method. Relevant inputs:
              
train_indices is None? True
              
val_indices is None? True
              
test_indices is None? True
              
test_train_ratio = 0
              
validation_ratio = 0.25


In [60]:
next(iter(train_dataloader))[0]

tensor([[[ 4.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.6603e+00,  5.3900e-01,  8.8222e-01,  8.1023e-01],
         [ 4.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.9716e+00,  4.1217e-01,  1.1608e+00,  8.1023e-01],
         [ 4.0000e+00,  2.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.6603e+00,  4.7558e-01,  1.1608e+00,  8.1023e-01],
         [ 4.0000e+00,  5.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0377e+00,  5.3900e-01, -6.5006e-01,  8.1023e-01],
         [ 4.0000e+00,  6.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           7.2638e-01,  2.2194e-01, -5.1076e-01, -2.1322e-01],
         [ 4.0000e+00,  1.2000e+01,  0.0000e+00,  1.0000e+00,  1.0000e+00,
           1.0377e-01, -9.5117e-02, -1.2072e+00, -2.1322e-01],
         [ 4.0000e+00,  1.4000e+01,  1.0000e+00,  1.0000e+00,  1.0000e+00,
          -5.1884e-01,  3.1706e-02, -1.0679e+00, -1.2367e+00],
         [ 4.0000e+00,  1.5000e+01

In [61]:
next(iter(val_dataloader))[0]

tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
         [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0377e-01,  1.5853e-01,  4.6433e-01,  8.1023e-01],
         [ 0.0000e+00,  2.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           4.1507e-01, -7.2923e-01,  8.8222e-01,  8.1023e-01],
         [ 0.0000e+00,  3.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
          -2.0754e-01,  5.3900e-01,  6.0362e-01,  8.1023e-01],
         [ 0.0000e+00,  4.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
          -8.3014e-01,  4.1217e-01, -2.3216e-01, -2.1322e-01],
         [ 0.0000e+00,  5.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
          -5.1884e-01,  9.8287e-01, -3.7146e-01, -2.1322e-01],
         [ 1.0000e+06,  1.0000e+06,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           1.0000e+06,  1.0000e+06,  1.0000e+06,  1.0000e+06],
         [ 1.0000e+06,  1.0000e+06

In [62]:
# Number of sequences held by the dataset
len(dataset)

5

## Models testing

### Vanilla LSTM



#### Creating the model

Model parameters:

In [63]:
# Total number of sequences (one per distinct subject)
n_ids = dmy_norm_df['subject_id'].nunique()
# Number of columns in the dataframe (identifier and label columns included)
n_inputs = dmy_norm_df.shape[1]
# Number of hidden units
n_hidden = 10
# Number of outputs
n_outputs = 1
# Number of LSTM layers
n_layers = 2
# Probability of dropout
p_dropout = 0.2

Instantiating the model:

In [64]:
# `n_inputs` counts every dataframe column, so three columns that are not fed to the
# LSTM are subtracted (presumably `subject_id`, `ts` and `label` - confirm against
# the `Models.VanillaLSTM` definition), leaving 7 input features.
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout)
model

VanillaLSTM(
  (lstm): LSTM(7, 10, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=10, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (activation): Sigmoid()
  (criterion): BCEWithLogitsLoss()
)

In [65]:
model.n_outputs

1

In [66]:
model.bidir

False

In [69]:
import os
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    """Destroy the default process group, if one is currently initialized.

    Calling `destroy_process_group` without an initialized group raises, which
    easily happens in a notebook when cells are re-run; the guard makes this
    function safe to call repeatedly.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

In [70]:
setup(rank=0, world_size=1)
model_ddp = DDP(model)

In [71]:
model_ddp

DistributedDataParallel(
  (module): VanillaLSTM(
    (lstm): LSTM(7, 10, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=10, out_features=1, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (activation): Sigmoid()
    (criterion): BCEWithLogitsLoss()
  )
)

In [73]:
model_ddp.module.n_outputs

1

In [79]:
model_ddp.parameters()

<generator object Module.parameters at 0x13e936450>

#### Training the model

In [None]:
next(model.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False)

In [None]:
next(model.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test', 
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

### LSTM with embedding layers

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# Total number of dataframe columns; the model below is given n_inputs-3
# (presumably leaving out the ID, timestamp and label columns - TODO confirm)
n_inputs = len(dmy_norm_df.columns)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 2                                  # Number of LSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded
embed_features.sort()
embedding_dim = 2                             # Number of outputs of the embedding layer

Instantiating the model:

In [None]:
embed_features

In [None]:
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout,
                           embed_features=embed_features, embedding_dim=embedding_dim)
model

In [None]:
model.n_embeddings

#### Training the model

In [None]:
next(model.lstm.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
# model = du.deep_learning.train(model, train_dataloader_df, val_dataloader_df, seq_len_dict=seq_len_dict,
#                                batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
#                                padding_value=padding_value, do_test=False, log_comet_ml=False,
#                                already_embedded=True)

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False,
                               already_embedded=False)

In [None]:
next(model.lstm.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test', 
                                                   already_embedded=False,
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

### LSTM with embedding layers and time interval handling

#### Adding the time difference feature

In [None]:
dmy_df['delta_ts'] = dmy_df.groupby('subject_id').ts.diff()
dmy_df

#### Normalizing the features

In [None]:
dmy_df.describe().transpose()

In [None]:
dmy_df.dtypes

In [None]:
dmy_norm_df = du.data_processing.normalize_data(dmy_df, id_columns=['subject_id', 'ts'],
                                                see_progress=False)
dmy_norm_df

In [None]:
dmy_norm_df.describe().transpose()

#### Imputation

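Replace the missing time difference values (the first sample of each sequence has no previous timestamp to compare against) with zero, which is the mean of the feature after normalization.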

In [None]:
dmy_norm_df = du.data_processing.missing_values_imputation(dmy_norm_df, method='zero')
dmy_norm_df

#### Padding

Pad the data so that all sequences have the same length (so that it can be converted to a PyTorch tensor).

In [None]:
padding_value = 999999

In [None]:
seq_len_dict = du.padding.get_sequence_length_dict(dmy_norm_df, id_column='subject_id', ts_column='ts')
seq_len_dict

In [None]:
data = du.padding.dataframe_to_padded_tensor(dmy_norm_df, seq_len_dict=seq_len_dict,
                                             id_column='subject_id', padding_value=padding_value)
data

#### Dataset object

In [None]:
dataset = du.datasets.Time_Series_Dataset(dmy_norm_df, data)

#### Separating into train and validation sets

Since this notebook is only for experimentation purposes, with a very small dummy dataset, we'll not be using a test set.

Training parameters:

In [None]:
batch_size = 32                                 # Number of patients in a mini batch
n_epochs = 100                                  # Number of epochs
lr = 0.001                                      # Learning rate

Separation in train and validation sets:

In [None]:
# Get the train and validation sets data loaders, which will allow loading batches.
# No test set is requested (test_train_ratio=0), so the third returned value is discarded.
# NOTE(review): the loaders are built with a hard-coded batch_size=4, while the
# `batch_size` variable (32) is what gets passed to du.deep_learning.train later on -
# confirm which value is intended.
train_dataloader, val_dataloader, _ = du.machine_learning.create_train_sets(dataset, test_train_ratio=0, 
                                                                            validation_ratio=0.25,
                                                                            batch_size=4, get_indeces=False)

In [None]:
train_features, train_labels = next(iter(train_dataloader))
train_features

In [None]:
val_features, val_labels = next(iter(val_dataloader))
val_features

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# Total number of dataframe columns; the model below is given n_inputs-3
# (presumably leaving out the ID, timestamp and label columns - TODO confirm)
n_inputs = len(dmy_norm_df.columns)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_layers = 2                                  # Number of LSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded
embed_features.sort()
embedding_dim = 2                             # Number of outputs of the embedding layer

Instantiating the model:

In [None]:
model = Models.VanillaLSTM(n_inputs-3, n_hidden, n_outputs, n_layers, p_dropout,
                           embed_features=embed_features, embedding_dim=embedding_dim)
model

#### Training the model

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False)

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test', 
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

### T-LSTM

Implementation of the [_Patient Subtyping via Time-Aware LSTM Networks_](http://biometrics.cse.msu.edu/Publications/MachineLearning/Baytasetal_PatientSubtypingViaTimeAwareLSTMNetworks.pdf) paper.

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# Total number of dataframe columns; the model below is given n_inputs-4
# (presumably leaving out the ID, timestamp, label and delta_ts columns - TODO confirm)
n_inputs = len(dmy_norm_df.columns)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_rnn_layers = 4                              # Number of TLSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded
embed_features.sort()
embedding_dim = 2                             # Number of outputs of the embedding layer
# delta_ts_col = du.search_explore.find_col_idx(dmy_norm_df, 'delta_ts')   # Number of the delta_ts column
elapsed_time = 'small'                                                   # Indicates if the elapsed time between events is small or long; influences how to discount elapsed time

In [None]:
n_inputs

In [None]:
dmy_norm_df.columns

In [None]:
embed_features

Instantiating the model:

In [None]:
model = Models.TLSTM(n_inputs-4, n_hidden, n_outputs, n_rnn_layers, p_dropout,
                     embed_features=embed_features, embedding_dim=embedding_dim, 
                     elapsed_time=elapsed_time)
model

In [None]:
model.rnn_layers[0].cell.input_size

In [None]:
model.rnn_layers[0].cell.hidden_size

In [None]:
model.rnn_layers[0].cell.weight_ih.shape

In [None]:
model.rnn_layers[0].cell.delta_ts_col

In [None]:
model.rnn_layers[1].cell.delta_ts_col

#### Training the model

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               padding_value=padding_value, do_test=False, log_comet_ml=False,
                               is_custom=True)

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test',
                                                   is_custom=True,
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

#### Hyperparameter optimization

In [None]:
if do_hyperparam_optim:
    val_loss_min, exp_name_min = du.machine_learning.optimize_hyperparameters(Models.TLSTM, df=dmy_norm_df, 
                                                                              config_name='TLSTM_hyperparameter_optimization_config.yaml', 
                                                                              comet_ml_api_key=comet_ml_api_key,
                                                                              comet_ml_project_name=comet_ml_project_name, 
                                                                              comet_ml_workspace=comet_ml_workspace, 
                                                                              n_inputs=n_inputs-4, id_column='subject_id',  
                                                                              label_column='label', inst_column='ts',
                                                                              n_outputs=1, model_type='multivariate_rnn',
                                                                              is_custom=True, models_path='models/', array_param=None,
                                                                              config_path='notebooks/sandbox/', var_seq=True, 
                                                                              clip_value=0.5, padding_value=padding_value, 
                                                                              batch_size=batch_size, n_epochs=n_epochs,
                                                                              lr=lr, test_train_ratio=0, validation_ratio=0.25,
                                                                              comet_ml_save_model=True, embed_features=embed_features)

In [None]:
if do_hyperparam_optim:
    # A bare expression nested inside an `if` block is not auto-displayed by
    # Jupyter (only a cell's last top-level expression is), so print it explicitly
    print(exp_name_min)

### MF1-LSTM

Implementation of the [_Predicting healthcare trajectories from medical records: A deep learning approach_](https://doi.org/10.1016/j.jbi.2017.04.001) paper, time decay version.

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# Total number of dataframe columns; the model below is given n_inputs-4
# (presumably leaving out the ID, timestamp, label and delta_ts columns - TODO confirm)
n_inputs = len(dmy_norm_df.columns)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_rnn_layers = 4                              # Number of MF1-LSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded
embed_features.sort()
embedding_dim = 2                             # Number of outputs of the embedding layer
# delta_ts_col = du.search_explore.find_col_idx(dmy_norm_df, 'delta_ts')   # Number of the delta_ts column
elapsed_time = 'small'                                                   # Indicates if the elapsed time between events is small or long; influences how to discount elapsed time

In [None]:
n_inputs

In [None]:
dmy_norm_df.columns

In [None]:
embed_features

Instantiating the model:

In [None]:
model = Models.MF1LSTM(n_inputs-4, n_hidden, n_outputs, n_rnn_layers, p_dropout,
                       embed_features=embed_features, embedding_dim=embedding_dim, 
                       elapsed_time=elapsed_time)
model

In [None]:
model.rnn_layers[0].cell.input_size

In [None]:
model.rnn_layers[0].cell.hidden_size

In [None]:
model.rnn_layers[0].cell.weight_ih.shape

In [None]:
model.rnn_layers[0].cell.delta_ts_col

In [None]:
model.rnn_layers[1].cell.delta_ts_col

#### Training the model

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               ModelClass=Models.MF1LSTM, padding_value=padding_value, do_test=False, 
                               log_comet_ml=False, is_custom=True)

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test',
                                                   is_custom=True, 
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

#### Hyperparameter optimization

In [None]:
if do_hyperparam_optim:
    val_loss_min, exp_name_min = du.machine_learning.optimize_hyperparameters(Models.MF1LSTM, df=dmy_norm_df, 
                                                                              config_name='TLSTM_hyperparameter_optimization_config.yaml', 
                                                                              comet_ml_api_key=comet_ml_api_key,
                                                                              comet_ml_project_name=comet_ml_project_name, 
                                                                              comet_ml_workspace=comet_ml_workspace, 
                                                                              n_inputs=n_inputs-4, id_column='subject_id',  
                                                                              label_column='label', inst_column='ts',
                                                                              n_outputs=1, model_type='multivariate_rnn',
                                                                              is_custom=True, models_path='models/', array_param=None,
                                                                              config_path='notebooks/sandbox/', var_seq=True, 
                                                                              clip_value=0.5, padding_value=padding_value, 
                                                                              batch_size=batch_size, n_epochs=n_epochs,
                                                                              lr=lr, test_train_ratio=0, validation_ratio=0.25,
                                                                              comet_ml_save_model=True, embed_features=embed_features)

In [None]:
if do_hyperparam_optim:
    # A bare expression nested inside an `if` block is not auto-displayed by
    # Jupyter (only a cell's last top-level expression is), so print it explicitly
    print(exp_name_min)

### MF2-LSTM

Implementation of the [_Predicting healthcare trajectories from medical records: A deep learning approach_](https://doi.org/10.1016/j.jbi.2017.04.001) paper, parametric time version.

#### Creating the model

Model parameters:

In [None]:
n_ids = dmy_norm_df.subject_id.nunique()      # Total number of sequences
# Total number of dataframe columns; the model below is given n_inputs-4
# (presumably leaving out the ID, timestamp, label and delta_ts columns - TODO confirm)
n_inputs = len(dmy_norm_df.columns)
n_hidden = 10                                 # Number of hidden units
n_outputs = 1                                 # Number of outputs
n_rnn_layers = 4                              # Number of MF2-LSTM layers
p_dropout = 0.2                               # Probability of dropout
embed_features = [du.search_explore.find_col_idx(dmy_norm_df, col) for col in ohe_columns] # Indices of the features to be embedded
embed_features.sort()
embedding_dim = 2                             # Number of outputs of the embedding layer
# delta_ts_col = du.search_explore.find_col_idx(dmy_norm_df, 'delta_ts')   # Number of the delta_ts column
elapsed_time = 'small'                                                   # Indicates if the elapsed time between events is small or long; influences how to discount elapsed time

In [None]:
n_inputs

In [None]:
dmy_norm_df.columns

In [None]:
embed_features

Instantiating the model:

In [None]:
model = Models.MF2LSTM(n_inputs-4, n_hidden, n_outputs, n_rnn_layers, p_dropout,
                       embed_features=embed_features, embedding_dim=embedding_dim, 
                       elapsed_time=elapsed_time)
model

In [None]:
model.rnn_layers[0].cell.input_size

In [None]:
model.rnn_layers[0].cell.hidden_size

In [None]:
model.rnn_layers[0].cell.weight_ih.shape

In [None]:
model.rnn_layers[0].cell.delta_ts_col

In [None]:
model.rnn_layers[1].cell.delta_ts_col

#### Training the model

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

In [None]:
model = du.deep_learning.train(model, train_dataloader, val_dataloader, seq_len_dict=seq_len_dict,
                               batch_size=batch_size, n_epochs=n_epochs, lr=lr, models_path='models/',
                               ModelClass=Models.MF2LSTM, padding_value=padding_value, do_test=False,
                               log_comet_ml=False, is_custom=True)

In [None]:
next(model.parameters())

In [None]:
next(model.embed_layers.parameters())

#### Testing the model

In [None]:
output, metrics = du.deep_learning.model_inference(model, dataloader=val_dataloader, 
                                                   metrics=['loss', 'accuracy', 'AUC'],
                                                   seq_len_dict=seq_len_dict, padding_value=padding_value, 
                                                   output_rounded=False, set_name='test',
                                                   is_custom=True,
                                                   cols_to_remove=[du.search_explore.find_col_idx(dmy_norm_df, feature)
                                                                   for feature in ['subject_id', 'ts']])
output

In [None]:
metrics

#### Hyperparameter optimization

In [None]:
if do_hyperparam_optim:
    val_loss_min, exp_name_min = du.machine_learning.optimize_hyperparameters(Models.MF2LSTM, df=dmy_norm_df, 
                                                                              config_name='TLSTM_hyperparameter_optimization_config.yaml', 
                                                                              comet_ml_api_key=comet_ml_api_key,
                                                                              comet_ml_project_name=comet_ml_project_name, 
                                                                              comet_ml_workspace=comet_ml_workspace, 
                                                                              n_inputs=n_inputs-4, id_column='subject_id',  
                                                                              label_column='label', inst_column='ts',
                                                                              n_outputs=1, model_type='multivariate_rnn',
                                                                              is_custom=True, models_path='models/', array_param=None,
                                                                              config_path='notebooks/sandbox/', var_seq=True, 
                                                                              clip_value=0.5, padding_value=padding_value, 
                                                                              batch_size=batch_size, n_epochs=n_epochs,
                                                                              lr=lr, test_train_ratio=0, validation_ratio=0.25,
                                                                              comet_ml_save_model=True, embed_features=embed_features)

In [None]:
if do_hyperparam_optim:
    # A bare expression nested inside an `if` block is not auto-displayed by
    # Jupyter (only a cell's last top-level expression is), so print it explicitly
    print(exp_name_min)

#### Interpreting the model

In [None]:
interpreter = ModelInterpreter(model, dmy_norm_df, model_type='multivariate_rnn',
                               id_column=0, inst_column=1, fast_calc=True, SHAP_bkgnd_samples=10000,
                               random_seed=du.random_seed, padding_value=padding_value, is_custom=True)

In [None]:
all_features = np.concatenate([train_features, val_features])
all_features

In [None]:
all_labels = np.concatenate([train_labels, val_labels])
all_labels

In [None]:
idx = 0

In [None]:
all_features[idx]

In [None]:
_ = interpreter.interpret_model(test_data=all_features, 
                                test_labels=all_labels, instance_importance=True, 
                                feature_importance='shap')

In [None]:
interpreter.feat_scores

In [None]:
interpreter.feat_scores.shape

In [None]:
interpreter.test_data[:, :, 2:].shape

In [None]:
column_names = list(dmy_df.columns)
column_names

In [None]:
features_names = column_names.copy()
features_names.remove('subject_id')
features_names.remove('ts')
features_names.remove('label')
features_names

In [None]:
shap_column_names = [f'{feature}_shap' for feature in features_names]
shap_column_names

In [None]:
interpreter.test_data.numpy().shape

In [None]:
interpreter.test_labels.unsqueeze(2).numpy().shape

In [None]:
interpreter.feat_scores.shape

In [None]:
data_n_shap = np.concatenate([interpreter.test_data.numpy(), interpreter.test_labels.unsqueeze(2).numpy(), interpreter.feat_scores], axis=2)
data_n_shap

In [None]:
data_n_shap.shape

In [None]:
# Collapse every leading dimension into rows, keeping one column per entry of
# the last axis; the width is read from the array instead of being hard-coded,
# so the cell keeps working if the number of features changes
data_n_shap.reshape(-1, data_n_shap.shape[-1])

In [None]:
data_n_shap_columns = ['subject_id', 'ts']+features_names+['label']+shap_column_names
data_n_shap_columns

In [None]:
[feature for feature in data_n_shap_columns if feature.endswith('_shap')]

In [None]:
# Build a 2D table from the concatenated data, labels and SHAP values; the
# column count is taken from the array's last axis rather than a hard-coded
# number, so it follows any change in the number of features
data_n_shap_df = pd.DataFrame(data=data_n_shap.reshape(-1, data_n_shap.shape[-1]),
                              columns=data_n_shap_columns)
data_n_shap_df

In [None]:
data_n_shap_df.to_csv('notebooks/sandbox/dummy_data/data_n_shap_df.csv')

In [None]:
du.visualization.shap_summary_plot(interpreter.feat_scores, features_names, max_display=3,
                                   background_color='#282828',
                                   output_type='plotly',
                                   font_family='Roboto', font_size=14,
                                   font_color='#ADAFAE')

In [None]:
interpreter.feat_scores.sum(axis=2)

In [None]:
interpreter.explainer.expected_value[0]

In [None]:
interpreter.feat_scores.sum(axis=2) + interpreter.explainer.expected_value[0]

In [None]:
idx = 0

In [None]:
interpreter.feat_scores.sum(axis=2)[idx] + interpreter.explainer.expected_value[0]

In [None]:
interpreter.test_data[idx]

In [None]:
model(interpreter.test_data[idx, :, 2:].unsqueeze(0))

In [None]:
interpreter.test_data[idx]

In [None]:
interpreter.explainer.subject_ids

In [None]:
interpreter.feat_names

In [None]:
interpreter.feat_scores.reshape(-1, model.n_inputs+1).shape

In [None]:
val_features[:, :4, 2:].numpy().reshape(-1, model.n_inputs+1).shape

In [None]:
# Summarize the effects of all the features
shap.summary_plot(interpreter.feat_scores.reshape(-1, model.n_inputs+1), 
                  features=interpreter.test_data[:, :4, 2:].numpy().reshape(-1, model.n_inputs+1), 
                  feature_names=interpreter.feat_names, plot_type='bar')

In [None]:
# [TODO] Do the same bar plot as above but in plotly

In [None]:
np.abs(interpreter.feat_scores).reshape(-1, interpreter.feat_scores.shape[-1]).shape

In [None]:
# Mean absolute SHAP value of each feature (last axis), averaged over all the
# leading dimensions of the scores array
mean_abs_shap = (
    np.abs(interpreter.feat_scores)
    .reshape(-1, interpreter.feat_scores.shape[-1])
    .mean(axis=0)
)
mean_abs_shap

In [None]:
sorted_idx = np.argsort(mean_abs_shap)
sorted_idx

In [None]:
interpreter.feat_names

In [None]:
[interpreter.feat_names[idx] for idx in sorted_idx]

In [None]:
mean_abs_shap[sorted_idx]

In [None]:
# Plotly figure specification: horizontal bar chart of the mean absolute SHAP
# value per feature, in the order given by `sorted_idx`
figure = {
    'data': [{
        'type': 'bar',
        'x': mean_abs_shap[sorted_idx],
        'y': [interpreter.feat_names[feat_idx] for feat_idx in sorted_idx],
        'orientation': 'h',
    }],
    'layout': {
        'margin': {'l': 0, 'r': 0, 't': 0, 'b': 0, 'pad': 0},
        'xaxis_title': 'mean(|SHAP value|) (average impact on model output magnitude)',
        'font': {
            'family': 'Roboto',
            'size': 14,
            'color': 'black',
        },
    },
}

In [None]:
go.Figure(figure)

In [None]:
du.visualization.shap_summary_plot(interpreter.feat_scores, interpreter.feat_names)

In [None]:
# # Choosing which example to use
# subject_id = 125
# patient = utils.find_subject_idx(test_features_denorm, subject_id=subject_id)
# patient

In [None]:
# # True sequence length of the current patient's data
# seq_len = seq_len_dict[test_features_denorm[patient, 0, 0].item()]
# # Plot the explanation of the predictions for one patient
# shap.force_plot(interpreter.explainer.expected_value[0], 
#                 interpreter.feat_scores[patient, :seq_len], 
#                 features=test_features_denorm[patient, :seq_len, 2:].numpy(), 
#                 feature_names=ALS_cols)

In [None]:
# # Init the JS visualization code
# shap.initjs()

# # Choosing which timestamp to use
# ts = 9

# # Plot the explanation of one prediction
# shap.force_plot(interpreter.explainer.expected_value[0], 
#                 interpreter.feat_scores[patient][ts], 
#                 features=test_features_denorm[patient, ts, 2:].numpy(), 
#                 feature_names=ALS_cols)

In [None]:
pred = 0
sample = 0

In [None]:
[f'{feature}={val:.2e}' for (feature, val) in zip(interpreter.feat_names, interpreter.test_data[pred, sample, 2:])]

In [None]:
interpreter.explainer.expected_value[0]

In [None]:
interpreter.feat_scores.shape

In [None]:
interpreter.feat_scores[pred, sample].shape

In [None]:
len(interpreter.feat_scores[pred, sample].shape)

In [None]:
interpreter.feat_scores[pred, sample]

In [None]:
model(interpreter.test_data[pred, sample, 2:].unsqueeze(0).unsqueeze(0))

In [None]:
np.sum(interpreter.feat_scores[pred, sample]) + interpreter.explainer.expected_value[0]

In [None]:
interpreter.feat_names

In [None]:
interpreter.test_data[pred, sample, :].numpy()

In [None]:
shap.waterfall_plot(interpreter.explainer.expected_value[0], 
                    interpreter.feat_scores[pred, sample],
                    features=interpreter.test_data[pred, sample, 2:].numpy(), 
                    feature_names=interpreter.feat_names)

In [None]:
shap.waterfall_plot(interpreter.explainer.expected_value[0], 
                    interpreter.feat_scores[pred, sample],
                    features=interpreter.test_data[pred, sample, 2:].numpy(), 
                    feature_names=interpreter.feat_names,
                    max_display=2)

In [None]:
# du.visualization.shap_waterfall_plot(interpreter.explainer.expected_value[0], interpreter.feat_scores[pred, sample],
du.visualization.shap_waterfall_plot(0, interpreter.feat_scores[pred, sample],
                                     interpreter.test_data[pred, sample, 2:], interpreter.feat_names,
                                     max_display=2)

In [None]:
fig = go.Figure()

fig.add_trace(go.Waterfall(
    y = [["initial", "q1", "q2", "q3", "total", "q1", "q2", "q3", "total"]],
    measure = ["absolute", "relative", "relative", "relative", "total", "relative", "relative", "relative", "total"],
    x = [1, 2, 3, -1, None, 1, 2, -4, None],
    base = 1000,
    orientation='h'
))

fig.add_trace(go.Waterfall(
    y = [["2016", "2017", "2017", "2017", "2017", "2018", "2018", "2018", "2018"],
        ["initial", "q1", "q2", "q3", "total", "q1", "q2", "q3", "total"]],
    measure = ["absolute", "relative", "relative", "relative", "total", "relative", "relative", "relative", "total"],
    x = [1.1, 2.2, 3.3, -1.1, None, 1.1, 2.2, -4.4, None],
    base = 1000,
    orientation='h'
))

fig.update_layout(
    waterfallgroupgap = 0.5,
)

fig.show()

In [None]:
fig = go.Figure()

fig.add_trace(go.Waterfall(
    y = ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
#     measure = ["absolute", "relative", "relative", "relative", "total", "relative", "relative", "relative", "total"],
    x = [1, 2, 3, -1, None, 1, 2, -4, None],
    base = 1000,
    orientation='h'
))

fig.show()

In [None]:
interpreter.feat_scores[pred, sample]

In [None]:
interpreter.feat_names

In [None]:
fig = go.Figure()

fig.add_trace(go.Waterfall(
    y = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    x = [1, 2, 1, -2, -1, 3, -4, 1],
    base = 100,
    orientation='h'
))

fig.show()

In [None]:
fig = go.Figure()

fig.add_trace(go.Waterfall(
    y = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    x = [-1, -2, -1, 2, 1, -3, 4, -1],
    base = 100,
    orientation='h'
))

fig.show()

### Deep Care with parametric time

Implementation of the [_Predicting healthcare trajectories from medical records: A deep learning approach_](https://doi.org/10.1016/j.jbi.2017.04.001) paper, full parametric time version.