# Likelihood encoding benchmark

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and set it to mode 2, so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import sys
# Repository root, relative to this notebook's directory.
main_path = '../../'

# Put the project's source and data folders on the module search path,
# in the same order as before.
for project_subdir in ('code/auto_ml', 'code/processing', 'code/models', 'data'):
    sys.path.append(main_path + project_subdir)

In [2]:
import pandas as pd
import numpy as np

# Read the CSV stored under data/mimic/ into a DataFrame.
dataset_csv = main_path + 'data/mimic/final_df.csv'
data = pd.read_csv(dataset_csv)

In [3]:
# Features: every column except the 'DIED' target.
X = data.drop(columns=['DIED'])
# Target: kept as a single-column DataFrame so it can go through the same
# processing() call as the features.
y = data['DIED'].to_frame()

In [4]:
# Normalization schemes swept by the benchmark loops. Only 'standard' is
# compared here ('none' is also passed to processing() for the target column).
normalization = ['standard']
# Encodings swept by the benchmark loops: 'none', 'label', 'one-hot' and
# 'likelihood'.
encoding = ['none', 'label', 'one-hot', 'likelihood']

In [6]:
from processing import processing

def process_and_split(X, y, normalization, encoding, random_state=None):
    """Preprocess features and target, then split them 70/30 into train/test.

    Parameters
    ----------
    X : feature table passed to ``processing`` together with the requested
        normalization and encoding.
    y : target table; always processed with 'none' normalization and
        'label' encoding, then reshaped to a column vector.
    normalization : str
        Normalization name forwarded to ``processing`` for ``X``.
    encoding : str
        Encoding name forwarded to ``processing`` for ``X``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the previous behaviour (a different random split on every call).

    Returns
    -------
    X_train, X_test, y_train, y_test
        Arrays produced by ``train_test_split``; the targets have shape
        (n_samples, 1).
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported train_test_split before it is called.
    from sklearn.model_selection import train_test_split

    # Report the arguments actually received; the previous version printed
    # the loop globals `nrm`/`ecd`, which fails or lies outside those loops.
    print('Processing with', normalization, 'normalization and', encoding, 'encoding...')
    X_preprocessed = processing(X, normalization, encoding).values
    y_preprocessed = processing(y, 'none', 'label').values.reshape(-1, 1)

    print('X shape: ', X_preprocessed.shape)
    print('y shape: ', y_preprocessed.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X_preprocessed,
        y_preprocessed,
        train_size=0.7,
        test_size=0.3,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from itertools import product
import time

# Benchmark a default LogisticRegression on every normalization/encoding pair.
for nrm, ecd in product(normalization, encoding):
    # Each combination gets its own preprocessing pass and 70/30 split.
    X_train, X_test, y_train, y_test = process_and_split(X, y, nrm, ecd)
    # process_and_split returns column-vector targets; flatten them to 1-D
    # before fitting.
    y_train, y_test = np.ravel(y_train), np.ravel(y_test)

    reg = LogisticRegression()

    print('Training model...')

    # Time only the fit itself, not the preprocessing.
    t1 = time.time()
    reg.fit(X_train, y_train)
    t2 = time.time()

    print('Model trained in ', t2-t1, 's.')

    # predict() already returns class labels, so no rounding is needed, and
    # accuracy_score expects its arguments as (y_true, y_pred).
    print('Train accuracy: ', accuracy_score(y_train, reg.predict(X_train)))
    print('Test accuracy: ', accuracy_score(y_test, reg.predict(X_test)))
    print('\n')

Processing with standard normalization and none encoding...
X shape:  (26927, 1062)
y shape:  (26927, 1)
Training model...
Model trained in  1.310999870300293 s.
Train accuracy:  0.9394100169779287
Test accuracy:  0.9249907166728556


Processing with standard normalization and label encoding...
X shape:  (26927, 1068)
y shape:  (26927, 1)
Training model...
Model trained in  1.8622801303863525 s.
Train accuracy:  0.9386672325976231
Test accuracy:  0.9308082683500434


Processing with standard normalization and one-hot encoding...
X shape:  (26927, 1211)
y shape:  (26927, 1)
Training model...
Model trained in  3.0605862140655518 s.
Train accuracy:  0.9380305602716469
Test accuracy:  0.935388043074638


Processing with standard normalization and likelihood encoding...
X shape:  (26927, 1068)
y shape:  (26927, 1)
Training model...
Model trained in  2.0661611557006836 s.
Train accuracy:  0.9391447368421053
Test accuracy:  0.9285802698353757




In [8]:
import matplotlib.pyplot as plt
import run_model_torch
import time
from itertools import product
from sklearn.model_selection import train_test_split

# One figure for the loss curves, one two-panel figure (shared y-axis) for
# the train/test accuracy curves.
fig1, ax1 = plt.subplots(1, 1, figsize=(12, 8))
fig2, ax2 = plt.subplots(1, 2, figsize=(12, 6), sharey=True)

# Each panel paired with its title, in the order training() returns its curves.
panels = [
    (ax1, 'Loss'),
    (ax2[0], 'Training Accuracy'),
    (ax2[1], 'Testing Accuracy'),
]

for nrm, ecd in product(normalization, encoding):
    X_train, X_test, y_train, y_test = process_and_split(X, y, nrm, ecd)

    print('Training model...')

    # Time the whole call to the torch training routine.
    t1 = time.time()
    losses, train_acc, test_acc = run_model_torch.training(X_train, y_train, X_test, y_test)
    t2 = time.time()

    print('Training finished in', t2-t1, 's. \n')

    # The same legend entry identifies this combination on every panel.
    curve_label = nrm + ' ' + ecd
    for (axis, _), series in zip(panels, (losses, train_acc, test_acc)):
        axis.plot(series, 'o-', label=curve_label)

# Finish each panel once all curves have been drawn.
for axis, title in panels:
    axis.legend()
    axis.set_title(title)
plt.show()

Processing with standard normalization and none encoding...
X shape:  (26927, 1062)
y shape:  (26927, 1)
Training model...
Training finished in 13.099853038787842 s. 

Processing with standard normalization and label encoding...
X shape:  (26927, 1068)
y shape:  (26927, 1)
Training model...
Training finished in 11.800348997116089 s. 

Processing with standard normalization and one-hot encoding...
X shape:  (26927, 1211)
y shape:  (26927, 1)
Training model...
Training finished in 12.66047215461731 s. 

Processing with standard normalization and likelihood encoding...
X shape:  (26927, 1068)
y shape:  (26927, 1)
Training model...
Training finished in 10.98168683052063 s. 



<Figure size 1200x800 with 1 Axes>

<Figure size 1200x600 with 2 Axes>