In [0]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, KFold
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, precision_recall_curve, roc_curve, auc

from sklearn.linear_model import LogisticRegression
%matplotlib inline



In [0]:


# Preprocessing objects shared by the cells below: a mean imputer for
# missing values ('NaN') applied column-wise (axis=0), and a standard scaler.
# NOTE(review): Imputer is the legacy sklearn.preprocessing imputer; newer
# scikit-learn releases provide sklearn.impute.SimpleImputer instead --
# confirm against the pinned scikit-learn version.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
scaler = preprocessing.StandardScaler()

# One list per metric; the cross-validation loop appends one score per fold.
lg_acc, lg_f1, lg_pre, lg_rec, lg_roc_auc, lg_prc_auc = [], [], [], [], [], []





In [0]:
# 4-fold cross-validation of a logistic-regression baseline.
# Each of fold1.csv .. fold4.csv is held out once for validation while the
# remaining three files are concatenated into the training set.

# Start from empty score lists on every run of this cell. Without this reset,
# re-running the cell appends duplicate folds to the lists, and it fails
# outright once a later cell has rebound these names to scalar test metrics.
lg_acc, lg_f1, lg_pre, lg_rec, lg_roc_auc, lg_prc_auc = [], [], [], [], [], []

fold_indexes = range(1, 5)  # [1, 2, 3, 4]

for index in fold_indexes:
    val_index = index  # e.g. if val_index = 1
    train_index = [i for i in fold_indexes if index != i]  # then train_index = [2, 3, 4]

    print('*** Fold', index, '***')
    print('\tVal Index:',val_index)
    print('\tTrain Index:',train_index, '\n')

    fold_name = 'fold' + str(val_index) + '.csv'
    print('\t(Val Data: ', fold_name, ')\n')
    df_val = pd.read_csv(fold_name)

    # Read every training fold and concatenate once, instead of growing the
    # frame with repeated DataFrame.append calls (each one copies the data).
    df_train = pd.concat(
        [pd.read_csv('fold' + str(i) + '.csv') for i in train_index]
    )

    # Features are selected by position: everything from the third column up
    # to (but excluding) the last one. The target is the 'SepsisLabel' column.
    # NOTE(review): presumably the first two columns are identifiers and the
    # last column is the label -- confirm against the CSV schema.
    X_train = df_train.iloc[:, 2:-1]
    y_train = df_train['SepsisLabel']
    X_val = df_val.iloc[:, 2:-1]
    y_val = df_val['SepsisLabel']

    # Fit the imputer and the scaler on the training folds only, then apply
    # the fitted transforms to the validation fold.
    X_train = imp.fit_transform(X_train)
    X_val = imp.transform(X_val)

    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # LOGISTIC REGRESSION
    log_reg = LogisticRegression(random_state=0, n_jobs=-1, solver='lbfgs')
    log_reg.fit(X_train, y_train)
    y_pred_lg = log_reg.predict(X_val)

    print('Prediction Value Counts:')
    print(pd.Series(y_pred_lg).value_counts())

    # Probability of the second class (column 1) drives the ROC / PR curves.
    y_prob_lg = log_reg.predict_proba(X_val)[:, 1]
    lg_fpr, lg_tpr, _ = roc_curve(y_val, y_prob_lg)
    lg_pre_, lg_rec_, __ = precision_recall_curve(y_val, y_prob_lg)

    # NOTE(review): F1 and precision use average='weighted' restricted to the
    # labels that were actually predicted, while recall uses the default
    # binary average. The three figures are therefore not directly comparable;
    # confirm this mix is intended before reporting them side by side.
    lg_acc.append(round(accuracy_score(y_val, y_pred_lg), 4))
    lg_f1.append(round(f1_score(y_val, y_pred_lg, average='weighted', labels=np.unique(y_pred_lg)), 4))
    lg_pre.append(round(precision_score(y_val,  y_pred_lg, average='weighted', labels=np.unique(y_pred_lg)), 4))
    lg_rec.append(round(recall_score(y_val, y_pred_lg), 4))
    lg_roc_auc.append(round(auc(lg_fpr, lg_tpr), 4))
    lg_prc_auc.append(round(auc(lg_rec_, lg_pre_), 4))
    print()

# Save the model.
# NOTE(review): log_reg here is the classifier fitted in the last loop
# iteration (validation fold 4, trained on folds 1-3). The fitted imp and
# scaler objects are not written to the pickle, so they must be available
# separately to preprocess inputs for the saved model.
with open('log_reg.pickle', 'wb') as lr_file:
    pickle.dump(log_reg, lr_file)

*** Fold 1 ***
	Val Index: 1
	Train Index: [2, 3, 4] 

	(Val Data:  fold1.csv )

Prediction Value Counts:
0    309535
1      1247
dtype: int64

*** Fold 2 ***
	Val Index: 2
	Train Index: [1, 3, 4] 

	(Val Data:  fold2.csv )

Prediction Value Counts:
0    309354
1       379
dtype: int64

*** Fold 3 ***
	Val Index: 3
	Train Index: [1, 2, 4] 

	(Val Data:  fold3.csv )

Prediction Value Counts:
0    308840
1      1169
dtype: int64

*** Fold 4 ***
	Val Index: 4
	Train Index: [1, 2, 3] 

	(Val Data:  fold4.csv )

Prediction Value Counts:
0    308969
1      1034
dtype: int64



In [0]:

# Report the mean of each cross-validation metric over the folds.
cv_report = (
    ('LR Accuracy: %.4f', lg_acc),
    ('LR F1 score: %.4f', lg_f1),
    ('LR Precision: %.4f', lg_pre),
    ('LR Recall: %.4f', lg_rec),
    ('LR AUC ROC: %.4f', lg_roc_auc),
    ('LR AUC PRC: %.4f', lg_prc_auc),
)
for template, fold_scores in cv_report:
    print(template % np.mean(fold_scores))



LR Accuracy: 0.9661
LR F1 score: 0.9523
LR Precision: 0.9450
LR Recall: 0.0231
LR AUC ROC: 0.7419
LR AUC PRC: 0.1380


In [0]:

# Hold-out evaluation on fold5.csv.
# imp, scaler and log_reg are used as left by the cross-validation cell, i.e.
# in the state fitted during its last loop iteration.
df_test = pd.read_csv('fold5.csv')

# Same positional feature slice as in cross-validation (third column up to,
# but excluding, the last), passed through the fitted imputer then the scaler.
X_test = scaler.transform(imp.transform(df_test.iloc[:, 2:-1]))

# The label is kept as a one-column DataFrame named 'SepsisLabel'.
y_test = pd.DataFrame(df_test['SepsisLabel'], index=None, columns=['SepsisLabel'])

y_pred_test_lg = log_reg.predict(X_test)
y_prob_test_lg = log_reg.predict_proba(X_test)[:, 1]

lg_fpr, lg_tpr, _ = roc_curve(y_test, y_prob_test_lg)
lg_pre_, lg_rec_, __ = precision_recall_curve(y_test, y_prob_test_lg)

# NOTE(review): the names below held the per-fold score lists during
# cross-validation; from here on they are rebound to scalar test metrics.
# F1 and precision are weighted over the predicted labels only, while recall
# uses the default binary average -- same mix as in the cross-validation cell.
predicted_labels = np.unique(y_pred_test_lg)
lg_acc = round(accuracy_score(y_test, y_pred_test_lg), 4)
lg_f1 = round(
    f1_score(y_test, y_pred_test_lg, average='weighted', labels=predicted_labels), 4
)
lg_pre = round(
    precision_score(y_test, y_pred_test_lg, average='weighted', labels=predicted_labels), 4
)
lg_rec = round(recall_score(y_test, y_pred_test_lg), 4)
lg_roc_auc = round(auc(lg_fpr, lg_tpr), 4)
lg_prc_auc = round(auc(lg_rec_, lg_pre_), 4)

results = {
    'accuracy_score': lg_acc,
    'f1_score': lg_f1,
    'precision_score': lg_pre,
    'recall_score': lg_rec,
    'AUC roc': lg_roc_auc,
    'AUC prc': lg_prc_auc,
}
results

{'accuracy_score': 0.9656,
 'f1_score': 0.9533,
 'precision_score': 0.9456,
 'recall_score': 0.0345,
 'AUC roc': 0.7067,
 'AUC prc': 0.1065}