In [1]:
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [2]:
filepath = "/home/abhijit/Downloads/dataset.csv"
df = pd.read_csv(filepath)

In [3]:
print(df.head())

   Unnamed: 0   hadm_id            starttime  maximum_Glucose  \
0           0  20000094  2150-03-02 16:03:00            383.0   
1           1  20000147  2121-08-30 22:16:00            160.0   
2           2  20000808  2180-01-17 20:58:00            109.0   
3           3  20001305  2178-03-26 12:23:00             96.0   
4           4  20001494  2125-10-26 18:30:00            147.0   

   maximum_Intubated  maximum_Lactate  maximum_Sodium  maximum_Temperature  \
0                NaN              6.9           127.0                 36.8   
1                NaN              1.9           134.0                  NaN   
2                NaN              NaN           138.0                  NaN   
3                NaN              2.0           140.0                  NaN   
4                NaN              NaN           139.0                  NaN   

   minimum_Glucose  minimum_Intubated  ...  diabetes_with_cc  renal_disease  \
0            383.0                NaN  ...                 0 

In [4]:
# Checking which columns have NaNs
print(df.columns[df.isna().any()].tolist())

['maximum_Glucose', 'maximum_Intubated', 'maximum_Lactate', 'maximum_Sodium', 'maximum_Temperature', 'minimum_Glucose', 'minimum_Intubated', 'minimum_Lactate', 'minimum_Sodium', 'minimum_Temperature']


In [5]:
for col in df.columns[df.isna().any()].tolist():
    print(df[col].unique())

[ 383.  160.  109.   96.  147.  175.  190.  135.   nan  130.  115.  177.
  162.  153.  166.  161.  155.   84.  158.  149.  168.  170.  142.  120.
  184.  342.  108.  118.  163.   95.  178.  129.  229.  151.   99.  140.
  189.   97.  224.  148.  202.   89.  134.  186.  165.  119.  141.  215.
   98.  336.  227.  133.  132.  390.   85.  154.  127.  124.  116.  152.
  114.   62.  156.  104.  220.  139.  110.  125.  193.  425.  181.  206.
  122.   87.  216.  123.  228.  105.  176.  183.  486.  100.  164.  185.
  131.   76.  113.   83.  395.  107.  144.   73.  143.  173.  347.  111.
  300.  157.  150.  212.  312.   93.  207.  196.  172.  117.  188.   72.
  238.  159.  167.  199.  174.  145.  187.  182.  121.   37.   70.  233.
  253.  128.  179.  112.  296.  239.  200.  246.  146.   94.  340.  245.
   91.  263.  137.  210.  180.  203.  250.  270.  234.  101.  103.  272.
  230.  232.  211.  126.  136.  204.  169.  209.  208.  251.   69.  192.
  231.  276.  288.  235.  283.   92.  280.  299.  1

In [6]:
values = {"maximum_Glucose": 110, "maximum_Intubated": 'No', "maximum_Lactate": 1, "maximum_Sodium": 140, "maximum_Temperature": 37, "minimum_Glucose": 110, "minimum_Intubated": 'No', "minimum_Lactate": 1, "minimum_Sodium": 140, "minimum_Temperature": 37}
df = df.fillna(value=values)

df = df.drop(['maximum_Intubated', 'minimum_Intubated'], axis=1)
print(df.shape)

(39338, 29)


In [7]:
# Checking which columns have NaNs
print(df.columns[df.isna().any()].tolist())

[]


In [8]:
# To check if there is any feature with only one unique value
for col in df:
    print(len(df[col].unique()))

39338
39338
39244
599
238
73
93
411
204
70
105
32555
2
2
2
2
2
2
2
2
2
2
2
2
2
2
73
1487
2


In [9]:
# Printing feature names
print(df.columns)

Index(['Unnamed: 0', 'hadm_id', 'starttime', 'maximum_Glucose',
       'maximum_Lactate', 'maximum_Sodium', 'maximum_Temperature',
       'minimum_Glucose', 'minimum_Lactate', 'minimum_Sodium',
       'minimum_Temperature', 'subject_id', 'peripheral_vascular_disease',
       'cerebrovascular_disease', 'dementia', 'chronic_pulmonary_disease',
       'rheumatic_disease', 'mild_liver_disease', 'diabetes_without_cc',
       'diabetes_with_cc', 'renal_disease', 'malignant_cancer',
       'severe_liver_disease', 'metastatic_solid_tumor', 'aids', 'gender',
       'age', 'patientweight', 'clabsi'],
      dtype='object')


In [10]:
# Moving subject_id to first column
first_column = df.pop('subject_id')
df.insert(0, 'subject_id', first_column)
print(df.head())

to_be_dropped_later = ['subject_id', 'Unnamed: 0', 'hadm_id', 'starttime']

   subject_id  Unnamed: 0   hadm_id            starttime  maximum_Glucose  \
0    14046553           0  20000094  2150-03-02 16:03:00            383.0   
1    14990224           1  20000147  2121-08-30 22:16:00            160.0   
2    16788749           2  20000808  2180-01-17 20:58:00            109.0   
3    16003661           3  20001305  2178-03-26 12:23:00             96.0   
4    15975141           4  20001494  2125-10-26 18:30:00            147.0   

   maximum_Lactate  maximum_Sodium  maximum_Temperature  minimum_Glucose  \
0              6.9           127.0                 36.8            383.0   
1              1.9           134.0                 37.0            100.0   
2              1.0           138.0                 37.0            109.0   
3              2.0           140.0                 37.0             96.0   
4              1.0           139.0                 37.0            147.0   

   minimum_Lactate  ...  diabetes_with_cc  renal_disease  malignant_cancer  \
0 

In [11]:
# Separate class 0 and class 1
df_c0 = df.loc[df['clabsi'] == 0]
df_c1 = df.loc[df['clabsi'] == 1]

print(df_c0.shape)
print(df_c1.shape)

(38980, 29)
(358, 29)


In [12]:
print(df_c0['clabsi'].unique())
print(df_c1['clabsi'].unique())

[0]
[1]


In [13]:
# Splitting 80% of data
df_train_0 = df_c0[:math.ceil(0.8*len(df_c0))]
df_val_0 = df_c0[math.ceil(0.8*len(df_c0)):]

print(df_train_0.shape)
print(df_val_0.shape)


df_train_1 = df_c1[:math.ceil(0.8*len(df_c1))]
df_val_1 = df_c1[math.ceil(0.8*len(df_c1)):]

print(df_train_1.shape)
print(df_val_1.shape)

(31184, 29)
(7796, 29)
(287, 29)
(71, 29)


In [14]:
# Checking if any subject ids are in both training and validation set
print(df_train_0.tail())
print(df_val_0.head())

       subject_id  Unnamed: 0   hadm_id            starttime  maximum_Glucose  \
31462    16502195       31462  27986053  2172-09-08 23:22:00             83.0   
31463    10996599       31463  27986337  2186-10-20 15:11:00            160.0   
31464    17340686       31464  27986427  2206-04-09 02:00:00            154.0   
31465    19025111       31465  27986791  2127-06-16 20:45:00            110.0   
31466    17431640       31466  27986910  2116-12-30 12:18:00            171.0   

       maximum_Lactate  maximum_Sodium  maximum_Temperature  minimum_Glucose  \
31462              0.9           125.0                 37.0             63.0   
31463              1.0           143.0                 37.0            142.0   
31464              4.7           134.0                 36.7            122.0   
31465              2.0           154.0                 37.0            110.0   
31466              2.2           143.0                 37.0            133.0   

       minimum_Lactate  ...  dia

In [15]:
# Checking if any subject ids are in both training and validation set
print(df_train_1.tail())
print(df_val_1.head())

       subject_id  Unnamed: 0   hadm_id            starttime  maximum_Glucose  \
31409    19509298       31409  27971630  2194-11-29 20:00:00            456.0   
31505    14236258       31505  27996889  2183-04-23 16:31:00            121.0   
31546    14003453       31546  28007037  2142-04-06 21:11:00            389.0   
31723    12212162       31723  28060124  2140-05-09 16:14:00            110.0   
32011    11365767       32011  28133020  2154-07-25 14:28:00            112.0   

       maximum_Lactate  maximum_Sodium  maximum_Temperature  minimum_Glucose  \
31409              1.5           154.0                 37.0            231.0   
31505              1.0           140.0                 37.0            121.0   
31546              4.2           142.0                 39.2            150.0   
31723              1.6           141.0                 37.0            110.0   
32011              1.3           142.0                 39.2            112.0   

       minimum_Lactate  ...  dia

In [16]:
# Merging the two classes in training and validation sets

df_train_rus = pd.concat([df_train_0, df_train_1])
print(df_train_rus.shape)

df_val_rus = pd.concat([df_val_0, df_val_1])
print(df_val_rus.shape)

(31471, 29)
(7867, 29)


In [17]:
# Shuffling the rows

df_train = df_train_rus.sample(frac=1, random_state=0).reset_index(drop=True)
df_val = df_val_rus.sample(frac=1, random_state=0).reset_index(drop=True)

In [18]:
# Deleting unnecessary columns

df_train = df_train.drop(to_be_dropped_later, axis=1)
print(df_train.shape)

df_val = df_val.drop(to_be_dropped_later, axis=1)
print(df_val.shape)

(31471, 25)
(7867, 25)


In [19]:
# Extracting labels

y_train = df_train['clabsi']
df_train = df_train.drop(['clabsi'], axis=1)
print(y_train.shape)
print(df_train.shape)

y_val = df_val['clabsi']
df_val = df_val.drop(['clabsi'], axis=1)
print(y_val.shape)
print(df_val.shape)

(31471,)
(31471, 24)
(7867,)
(7867, 24)


In [20]:
# Define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy=0.5)

# fit and apply the transform
df_train, y_train = undersample.fit_resample(df_train, y_train)

print(df_train.shape)
print(y_train.shape)

(861, 24)
(861,)


In [21]:
print(sum(y_train))

287


In [22]:
# Printing feature names
print(df_train.columns)

for col in df_train:
    print(len(df_train[col].unique()))

Index(['maximum_Glucose', 'maximum_Lactate', 'maximum_Sodium',
       'maximum_Temperature', 'minimum_Glucose', 'minimum_Lactate',
       'minimum_Sodium', 'minimum_Temperature', 'peripheral_vascular_disease',
       'cerebrovascular_disease', 'dementia', 'chronic_pulmonary_disease',
       'rheumatic_disease', 'mild_liver_disease', 'diabetes_without_cc',
       'diabetes_with_cc', 'renal_disease', 'malignant_cancer',
       'severe_liver_disease', 'metastatic_solid_tumor', 'aids', 'gender',
       'age', 'patientweight'],
      dtype='object')
212
85
41
47
165
52
38
52
2
2
2
2
2
2
2
2
2
2
2
2
2
2
73
456


In [23]:
# Normalizing only certain columns

cols_to_norm = ['maximum_Glucose','maximum_Lactate','maximum_Sodium','maximum_Temperature','minimum_Glucose','minimum_Lactate','minimum_Sodium','minimum_Temperature','age','patientweight']

scaler = StandardScaler()

df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
df_val[cols_to_norm] = scaler.transform(df_val[cols_to_norm])

In [24]:
# Converting gender into one-hot encoding

one_hot = 'gender'

enc = preprocessing.LabelEncoder()

df_train[one_hot] = enc.fit_transform(df_train[one_hot])
df_val[one_hot] = enc.transform(df_val[one_hot])

In [25]:
for col in df_train:
    print(type(df_train[col][0]))

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [26]:
#Split dataset (or directly store the split dataset in the right variables)

X_train = df_train
X_test = df_val
y_train = np.array(y_train)
y_test = np.array(y_val)

In [27]:
# Logistic Regression

#Create an instance of the model
clf = LogisticRegression(random_state=0)

#Fit the Model to the Data
clf.fit(X_train, y_train)

# Evaluation metrics

# Predicts labels for test data
y_pred = clf.predict(X_test)

#Predicts probability, useful also in log loss calculations (and ROC AUC).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# ROC AUC
auc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC: %f' % auc)

[[7056  740]
 [  58   13]]
Accuracy: 0.898564
Precision: 0.017264
Recall: 0.183099
F1 score: 0.031553
ROC AUC: 0.600129


In [28]:
from imblearn.over_sampling import RandomOverSampler# Decision Tree

#Create an instance of the model
clf = DecisionTreeClassifier(random_state=0)

#Fit the Model to the Data
clf.fit(X_train, y_train)

# Evaluation metrics

# Predicts labels for test data
y_pred = clf.predict(X_test)

#Predicts probability, useful also in log loss calculations (and ROC AUC).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# ROC AUC
auc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC: %f' % auc)

[[5361 2435]
 [  43   28]]
Accuracy: 0.685013
Precision: 0.011368
Recall: 0.394366
F1 score: 0.022099
ROC AUC: 0.541013


In [29]:
# Random Forest

#Create an instance of the model
clf = RandomForestClassifier(max_depth=2, random_state=0)

#Fit the Model to the Data
clf.fit(X_train, y_train)

# Evaluation metrics

# Predicts labels for test data
y_pred = clf.predict(X_test)

#Predicts probability, useful also in log loss calculations (and ROC AUC).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# ROC AUC
auc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC: %f' % auc)

[[7779   17]
 [  70    1]]
Accuracy: 0.988941
Precision: 0.055556
Recall: 0.014085
F1 score: 0.022472
ROC AUC: 0.611713


In [30]:
# Extra Trees Classifier

#Create an instance of the model
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)

#Fit the Model to the Data
clf.fit(X_train, y_train)

# Evaluation metrics

# Predicts labels for test data
y_pred = clf.predict(X_test)

#Predicts probability, useful also in log loss calculations (and ROC AUC).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# ROC AUC
auc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC: %f' % auc)

[[6844  952]
 [  49   22]]
Accuracy: 0.872760
Precision: 0.022587
Recall: 0.309859
F1 score: 0.042105
ROC AUC: 0.667497


In [31]:
# Gradient Boosting

#Create an instance of the model
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

#Fit the Model to the Data
clf.fit(X_train, y_train)

# Evaluation metrics

# Predicts labels for test data
y_pred = clf.predict(X_test)

#Predicts probability, useful also in log loss calculations (and ROC AUC).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# ROC AUC
auc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC: %f' % auc)

[[6525 1271]
 [  50   21]]
Accuracy: 0.832083
Precision: 0.016254
Recall: 0.295775
F1 score: 0.030814
ROC AUC: 0.597605


In [32]:
# AdaBoost

#Create an instance of the model
clf = AdaBoostClassifier(n_estimators=100, random_state=0)

#Fit the Model to the Data
clf.fit(X_train, y_train)

# Evaluation metrics

# Predicts labels for test data
y_pred = clf.predict(X_test)

#Predicts probability, useful also in log loss calculations (and ROC AUC).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# ROC AUC
auc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC: %f' % auc)

[[6449 1347]
 [  55   16]]
Accuracy: 0.821787
Precision: 0.011739
Recall: 0.225352
F1 score: 0.022315
ROC AUC: 0.591062


In [33]:
# XGBoost

#Create an instance of the model
clf = XGBClassifier(random_state=0, use_label_encoder=False)

#Fit the Model to the Data
clf.fit(X_train, y_train)

# Evaluation metrics

# Predicts labels for test data
y_pred = clf.predict(X_test)

#Predicts probability, useful also in log loss calculations (and ROC AUC).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# ROC AUC
auc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC: %f' % auc)

[[6342 1454]
 [  45   26]]
Accuracy: 0.809457
Precision: 0.017568
Recall: 0.366197
F1 score: 0.033527
ROC AUC: 0.674752
