In [1]:
# Strategy and reference from "Interpreting clinical latent representations using
# autoencoders and probabilistic models"
import csv
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.metrics import accuracy_score, precision_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale

In [2]:
# Load the HCV dataset into a dataframe
# (CSV can be downloaded from https://archive.ics.uci.edu/ml/datasets/HCV+data).
data = pd.read_csv('hcvdat0.csv')
# Discard the columns at positions 0, 2 and 3: patient id, age, and sex.
data = data.drop(columns=data.columns[[0, 2, 3]])
data.head()

Unnamed: 0,Category,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0=Blood Donor,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0=Blood Donor,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [3]:
# prepping dataframe for autoencoder, specifically provide meaningful labels for each record
def modifyCategory(inp):
    """Collapse a raw ``Category`` value into a binary label.

    Parameters
    ----------
    inp : str or int
        Raw category text such as ``'0=Blood Donor'``. Values that were
        already converted (``0`` / ``1``) are accepted as well, so applying
        this function a second time to the same column does not fail.

    Returns
    -------
    int
        ``0`` when the text form of ``inp`` starts with ``'0'``,
        ``1`` for every other value.
    """
    # str() + startswith() instead of inp[0]: indexing raises TypeError on the
    # integers this function itself produces, which broke re-running the cell.
    return 0 if str(inp).startswith('0') else 1

# Overwrite the Category column with the binary labels from modifyCategory.
data['Category'] = data['Category'].apply(modifyCategory)
data.head()

Unnamed: 0,Category,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [4]:
# autoencoder preprocessing: separate labels from features, scale, then split
SPLIT_SEED = 42  # fixed so the train/test partition is identical on every run

y = data['Category']
# Drop the label by name (it is the first column) so x and y stay consistent
# even if the column order changes.
x = data.drop(columns='Category')
# NOTE(review): min/max are computed over all rows, i.e. before the split, so
# test rows influence how the training features are scaled.
x = minmax_scale(x, axis=0)
# stratify=y keeps the 0/1 label ratio the same in the train and test partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, train_size=0.75, random_state=SPLIT_SEED, stratify=y)
print(X_train.shape)
print(X_test.shape)

(461, 10)
(154, 10)


In [5]:
# autoencoder logic - reference: https://blog.keras.io/building-autoencoders-in-keras.html
NAN_FILL = 0.001  # value written in place of missing (NaN) feature entries
# Rebind instead of mutating in place; re-running the cell gives the same arrays.
X_train = np.where(np.isnan(X_train), NAN_FILL, X_train)
X_test = np.where(np.isnan(X_test), NAN_FILL, X_test)

encoding_dim = 3  # width of the bottleneck layer
n_features = X_train.shape[1]  # taken from the data rather than hard-coded

# simple one-layer autoencoder: n_features -> encoding_dim -> n_features
inpt = Input(shape=(n_features,))
encoded = Dense(encoding_dim, activation='relu')(inpt)
decoded = Dense(n_features, activation='sigmoid')(encoded)
autoencoder = Model(inpt, decoded)  # input -> reconstruction; this is the model that is trained
encoder = Model(inpt, encoded)      # input -> bottleneck output; reuses the same Dense layer
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
# The network is trained to reproduce its own input; the test split is used
# only as validation data here.
autoencoder.fit(X_train, X_train,
                epochs=50,
                shuffle=True,
                validation_data=(X_test, X_test))
encoded_x_train = encoder.predict(X_train)
encoded_x_test = encoder.predict(X_test)
print(encoded_x_train[0:2])
print(encoded_x_test[0:2])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[[1.5282476 1.6953152 1.6068524]
 [1.7148554 1.5981733 1.6697099]]
[[1.585567  1.535839  1.5781106]
 [1.6165617 1.7599298 1.5529283]]


In [28]:
# interpreting autoencoder representation using GMM
GMM_SEED = 42  # fixes the mixture's random initialisation so the result repeats across runs
gm = GaussianMixture(n_components=2, random_state=GMM_SEED).fit(encoded_x_train)
preds = gm.predict(encoded_x_test)
# Component indices are arbitrary: if more than half of the test rows were
# assigned to component 1, swap 0/1 so the larger cluster is reported as label 0.
if (sum(preds) > len(preds)/2):
    preds = 1 - preds
preds = preds.tolist()
Y_test = list(Y_test)
# Number of test rows whose cluster label equals the true label.
count = sum(1 for pred, truth in zip(preds, Y_test) if pred == truth)
print('Accuracy:', count/len(preds))

Accuracy: 0.948051948051948


In [32]:
# Cross-check the hand-computed accuracy with scikit-learn and report precision
# per label (average=None returns one score per class instead of a single mean).
# accuracy_score / precision_score are imported in the notebook's import cell.
print(accuracy_score(Y_test, preds))
print(precision_score(Y_test, preds, average=None))

0.948051948051948
[0.97058824 0.77777778]
