In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
train_df = pd.read_csv("/home/student/filtered_train_df_0705.csv")

In [3]:
train_df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,ID,Label,max_ICULOS,time_bm
0,,,,,,,,,,,...,1,1.0,0.0,-0.02,1,0,15354,0,46,-45
1,85.0,97.0,,88.0,58.0,,23.0,,,,...,1,1.0,0.0,-0.02,2,0,15354,0,46,-44
2,80.0,98.0,,89.0,54.0,,19.0,,,,...,1,1.0,0.0,-0.02,3,0,15354,0,46,-43
3,75.0,97.0,,84.0,51.0,,22.0,,,,...,1,1.0,0.0,-0.02,4,0,15354,0,46,-42
4,77.0,95.0,36.5,88.0,56.0,,27.0,,,,...,1,1.0,0.0,-0.02,5,0,15354,0,46,-41


In [7]:
frequency_used_attributes = ['BaseExcess',  'FiO2', 'pH', 'PaCO2', 'Glucose','Lactate', 'PTT']
values_used_attributes = [ 'Hct',  'Glucose','Potassium']
constant_attributes = ['ID','max_ICULOS','Gender']
other_attributes = ['time_bm','HR','MAP','O2Sat', 'Resp','SBP','ICULOS']
label_attributes= ['Label','SepsisLabel']

In [5]:
def add_rolling_window(df, attr, window_size):
    df = df.sort_values(by=['ID','ICULOS'], ascending =[True,True])
    rolling = df[['ID']+attr].groupby('ID').rolling(window=window_size, closed='both').count()
    rolling= rolling.rename(columns={at: f'{window_size}w_sum_{at}' for at in attr})
    rolling=rolling[list(rolling.columns)[1:]].reset_index().set_index('level_1')
    combined = df.join(rolling,how='left', rsuffix= 'r')
    return combined, rolling

In [15]:
window_size=5
train_df_with_roll, train_roll = add_rolling_window(train_df,frequency_used_attributes,window_size)



In [11]:
train_roll

Unnamed: 0_level_0,ID,5w_sum_BaseExcess,5w_sum_FiO2,5w_sum_pH,5w_sum_PaCO2,5w_sum_Glucose,5w_sum_Lactate,5w_sum_PTT
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
75951,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75952,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75953,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75954,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75955,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
154924,19999,1.0,3.0,1.0,0.0,0.0,0.0,0.0
154925,19999,1.0,2.0,1.0,0.0,0.0,0.0,0.0
154926,19999,0.0,1.0,0.0,0.0,0.0,0.0,0.0
154927,19999,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
train_df_with_roll

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,max_ICULOS,time_bm,IDr,5w_sum_BaseExcess,5w_sum_FiO2,5w_sum_pH,5w_sum_PaCO2,5w_sum_Glucose,5w_sum_Lactate,5w_sum_PTT
75951,,,,,,,,,,,...,23,-22,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75952,61.0,99.0,36.44,124.0,65.0,43.0,17.5,,,,...,23,-21,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75953,64.0,98.0,,125.0,64.0,41.0,27.0,,,,...,23,-20,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75954,56.0,100.0,,123.0,65.0,41.0,9.0,,,,...,23,-19,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75955,66.0,99.0,,120.0,67.0,43.0,23.0,,,,...,23,-18,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154924,76.0,,,85.0,53.0,,17.0,,,,...,54,-4,19999,1.0,3.0,1.0,0.0,0.0,0.0,0.0
154925,81.0,99.0,,99.0,51.0,,17.0,,,,...,54,-3,19999,1.0,2.0,1.0,0.0,0.0,0.0,0.0
154926,85.0,100.0,,103.0,48.0,,26.0,,,,...,54,-2,19999,0.0,1.0,0.0,0.0,0.0,0.0,0.0
154927,86.0,93.0,,87.0,44.0,,22.0,,,,...,54,-1,19999,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [17]:
frequency_used_attributes_fixed = [f'{window_size}w_sum_{x}' for x in frequency_used_attributes]

In [19]:
train_df_with_roll = train_df_with_roll[constant_attributes + other_attributes + \
                                        values_used_attributes + frequency_used_attributes_fixed + \
                                        label_attributes]
train_df_with_roll = train_df_with_roll[train_df_with_roll['time_bm']>=-10]
train_df_with_roll

Unnamed: 0,ID,max_ICULOS,Gender,time_bm,HR,MAP,O2Sat,Resp,SBP,ICULOS,...,Potassium,5w_sum_BaseExcess,5w_sum_FiO2,5w_sum_pH,5w_sum_PaCO2,5w_sum_Glucose,5w_sum_Lactate,5w_sum_PTT,Label,SepsisLabel
75963,0,23,0,-10,62.0,72.0,95.0,12.0,139.0,13,...,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
75964,0,23,0,-9,63.0,75.0,95.0,11.0,143.0,14,...,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
75965,0,23,0,-8,63.0,81.0,97.0,11.0,154.0,15,...,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
75966,0,23,0,-7,58.0,58.0,94.0,10.0,114.0,16,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
75967,0,23,0,-6,54.0,50.5,97.0,12.0,,17,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154924,19999,54,0,-4,76.0,53.0,,17.0,85.0,50,...,,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0,0
154925,19999,54,0,-3,81.0,51.0,99.0,17.0,99.0,51,...,,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0,0
154926,19999,54,0,-2,85.0,48.0,100.0,26.0,103.0,52,...,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
154927,19999,54,0,-1,86.0,44.0,93.0,22.0,87.0,53,...,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0


In [None]:
frequency_used_attributes = ['BaseExcess',  'FiO2', 'pH', 'PaCO2', 'Glucose','Lactate', 'PTT']
values_used_attributes = [ 'Hct',  'Glucose','Potassium']
constant_attributes = ['ID','max_ICULOS','Gender']
other_attributes = ['time_bm','HR','MAP','O2Sat', 'Resp','SBP','ICULOS']
label_attributes

In [22]:
grouped_single = train_df_with_roll.groupby('ID').agg({'max_ICULOS': 'mean',\
                                                       'Gender': 'mean',\
                                                       'HR': ['mean', 'median', 'max', 'min']})
grouped_single

Unnamed: 0_level_0,max_ICULOS,Gender,HR,HR,HR,HR
Unnamed: 0_level_1,mean,mean,mean,median,max,min
ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,23,0,58.636364,58.0,63.0,54.0
2,38,1,80.000000,81.0,89.0,69.0
3,48,1,70.090909,71.0,86.0,58.0
4,24,1,100.272727,98.0,107.0,94.0
5,25,1,62.900000,63.5,69.0,56.0
...,...,...,...,...,...,...
19995,29,1,110.400000,109.0,117.0,104.0
19996,43,1,64.545455,63.0,78.0,60.0
19997,15,0,72.818182,73.0,80.0,65.0
19998,42,1,82.833333,84.0,94.0,71.0


## Over-sampling using SMOTE

With our training data created, We’ll up-sample the minor class (Label=1) using the SMOTE algorithm (Synthetic Minority Oversampling Technique)

In [None]:
X = data_final.loc[:, data_final.columns != 'Label']
y = data_final.loc[:, data_final.columns == 'Label']

from imblearn.over_sampling import SMOTE ##TODO: Insert to ENV.yml

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X, os_data_y = os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y = pd.DataFrame(data=os_data_y,columns=['y'])

# Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of Label=0 in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of Label=1",len(os_data_y[os_data_y['y']==1]))
print("Proportion of Label=0 data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of Label=1 data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

## Recursive Feature Elimination

Recursive Feature Elimination (RFE) is based on the idea to repeatedly construct a model and choose either the best or worst performing feature, setting the feature aside and then repeating the process with the rest of the features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively considering smaller and smaller sets of features.

In [None]:
data_final_vars = data_final.columns.values.tolist()
y=['Label']
X=[i for i in data_final_vars if i not in y]

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
rfe = RFE(logreg, 20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

The RFE has helped us select the following features:

*TODO: COMPLETE *******

## Implementing the model

In [3]:
import statsmodels.api as sm

logit_model = sm.Logit(y,X)
result = logit_model.fit()
print(result.summary2())

NameError: name 'y' is not defined

#### ADJUSTMENTS????

### Logistic Regression Model Fitting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

### Predicting the test set results and calculating the accuracy


In [None]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

### Confusion Matrix

In [4]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

NameError: name 'y_test' is not defined

### Compute precision, recall, F-measure and support

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()