In [2]:
print("OmriShemTov Hatich Shel Yeled")

OmriShemTov Hatich Shel Yeled


In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE ##TODO: Insert to ENV.yml
from imblearn.under_sampling import RandomUnderSampler
import statsmodels.api as sm
import random
seed=42
random.seed(seed)

In [4]:
def lr_preprocess_sepsis(df, window_size=5, time_bm=-10):

    # columns for using
    frequency_used_attributes = ['BaseExcess',  'FiO2', 'pH', 'PaCO2', 'Glucose','Lactate', 'PTT']
    values_used_attributes = [ 'Hct', 'Glucose','Potassium']
    constant_attributes = ['ID','max_ICULOS', 'Gender']
    other_attributes = ['time_bm','HR','MAP','O2Sat', 'Resp','SBP', 'ICULOS']
    units_attributes = ['Unit1', 'Unit2']
    label_attributes= ['Label','SepsisLabel']

    # create frequency columns for some lab variables
    def add_rolling_window(df, attr, window_size):
        df = df.sort_values(by=['ID','ICULOS'], ascending =[True,True])
        rolling = df[['ID']+attr].groupby('ID').rolling(window=window_size, closed='both').count()
        rolling= rolling.rename(columns={at: f'{window_size}w_sum_{at}' for at in attr})
        rolling=rolling[list(rolling.columns)[1:]].reset_index().set_index('level_1')
        combined = df.join(rolling,how='left', rsuffix= 'r')
        return combined, rolling
    df_with_roll, df_roll = add_rolling_window(df,frequency_used_attributes,window_size)
    frequency_used_attributes_fixed = [f'{window_size}w_sum_{x}' for x in frequency_used_attributes]
    df_with_roll = df_with_roll[constant_attributes + other_attributes + \
                                        values_used_attributes + frequency_used_attributes_fixed + \
                                        units_attributes + label_attributes]

    # crop 10 (time_bm) last ICULOS hours for each patient
    df_with_roll = df_with_roll[df_with_roll['time_bm']>=time_bm]

    # handle Units123
    df_with_roll['Unit3'] = ( (1*(df_with_roll['Unit1']+df_with_roll['Unit2'])<1) |
                          (df_with_roll['Unit1'].isna() & df_with_roll['Unit2'].isna()) )*1
    df_with_roll['Unit1'][df_with_roll['Unit1'].isna()] = 0
    df_with_roll['Unit2'][df_with_roll['Unit2'].isna()] = 0
    df_with_roll[['Unit1','Unit2','Unit3']]

    # aggregations
    data_final = df_with_roll.groupby(['ID', 'Label','max_ICULOS','Gender']).agg({
                                                        'Unit1': 'max',\
                                                        'Unit2': 'max',\
                                                        'Unit3': 'max',\
                                                        'HR': ['median', 'max'],\
                                                        'MAP': ['median', 'min'],\
                                                        'O2Sat': ['mean'],\
                                                        'Resp': ['median', 'max'],\
                                                        'SBP': ['median', 'min'],\
                                                        'Hct': ['median', 'min'],\
                                                        'Potassium': 'mean',\
                                                        'Glucose': 'mean',\
                                                        f'{window_size}w_sum_BaseExcess': 'mean',\
                                                        f'{window_size}w_sum_FiO2': 'mean',\
                                                        f'{window_size}w_sum_pH': 'mean',\
                                                        f'{window_size}w_sum_PaCO2': 'mean',\
                                                        f'{window_size}w_sum_Glucose': 'mean',\
                                                        f'{window_size}w_sum_Lactate': 'mean',\
                                                        f'{window_size}w_sum_PTT': 'mean'}).reset_index()
    data_final.columns = ['__'.join(col).strip() for col in data_final.columns.values]
    data_final.rename(columns={"ID__": "ID", "Label__": "Label", "max_ICULOS__":"max_ICULOS", "Gender__":"Gender"}, inplace=True)

    return data_final


In [5]:
def imputation_with_KNNimputer(train_df, n=3):
    data_knn_imputed = train_df.copy(deep=True)    # Copy the data
    knn_imp = KNNImputer(n_neighbors=n) # Init the transformer
    knn_imp.fit(data_knn_imputed)
    return knn_imp

In [6]:
def os_with_smote(train_df, p=0.5):
    X = train_df.loc[:, train_df.columns != 'Label']
    y = train_df.loc[:, train_df.columns == 'Label']
    os = SMOTE(sampling_strategy=p, random_state=0)
    columns = X.columns
    os_data_X, os_data_y = os.fit_resample(X, y)
    os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
    os_data_y = pd.DataFrame(data=os_data_y,columns=['Label'])
    return os_data_X, os_data_y

In [7]:
train_df = pd.read_csv("/home/student/filtered_train_df_0705.csv")
val_df = pd.read_csv("/home/student/filtered_val_df_0705.csv")
test_df =  pd.read_csv("/home/student/filtered_test_df_0705.csv")

In [8]:
train_df = lr_preprocess_sepsis(train_df)
# Check the numbers of our data
print("length of train data is",len(train_df))
print("Number of Label=0 in train data",len(train_df[train_df['Label']==0]))
print("Number of Label=1 in train data",len(train_df[train_df['Label']==1]))
print("Proportion of Label=0 data in train data is ",len(train_df[train_df['Label']==0])/len(train_df))
print("Proportion of Label=1 data in train data is ",len(train_df[train_df['Label']==1])/len(train_df))



length of train data is 16000
Number of Label=0 in train data 14857
Number of Label=1 in train data 1143
Proportion of Label=0 data in train data is  0.9285625
Proportion of Label=1 data in train data is  0.0714375


In [9]:
# imputing training data
# TODO: Save Imputer
knn_imp = imputation_with_KNNimputer(train_df, 3)
train_df.loc[:, :] = knn_imp.transform(train_df)

In [10]:
# over-sampling training data
X, y = os_with_smote(train_df, p=0.25)

# Check the numbers of our data
print("length of oversampled data is ",len(X))
print("Number of Label=0 in oversampled data",len(y[y['Label']==0]))
print("Number of Label=1",len(y[y['Label']==1]))
print("Proportion of Label=0 data in oversampled data is ",len(y[y['Label']==0])/len(X))
print("Proportion of Label=1 data in oversampled data is ",len(y[y['Label']==1])/len(X))

length of oversampled data is  18571
Number of Label=0 in oversampled data 14857
Number of Label=1 3714
Proportion of Label=0 data in oversampled data is  0.8000107694792957
Proportion of Label=1 data in oversampled data is  0.19998923052070433


In [11]:
#Downsample majority class
under = RandomUnderSampler(sampling_strategy=0.5)
X,y = under.fit_resample(X, y)

# Check the numbers of our data
print("length of final training data is ",len(X))
print("Number of Label=0 in final training data",len(y[y['Label']==0]))
print("Number of Label=1",len(y[y['Label']==1]))
print("Proportion of Label=0 data in final data training is ",len(y[y['Label']==0])/len(X))
print("Proportion of Label=1 data in final training data is ",len(y[y['Label']==1])/len(X))

length of final training data is  11142
Number of Label=0 in final training data 7428
Number of Label=1 3714
Proportion of Label=0 data in final data training is  0.6666666666666666
Proportion of Label=1 data in final training data is  0.3333333333333333


In [12]:
val_df = lr_preprocess_sepsis(val_df)
val_df.loc[:, :] = knn_imp.transform(val_df)

test_df = lr_preprocess_sepsis(test_df)
test_df.loc[:, :] = knn_imp.transform(test_df)



In [13]:
X_val = val_df.loc[:, val_df.columns != 'Label']
y_val = val_df.loc[:, val_df.columns == 'Label']

X_test = test_df.loc[:, test_df.columns != 'Label']
y_test = test_df.loc[:, test_df.columns == 'Label']

In [14]:
logit_model = sm.Logit(y,X)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.553699
         Iterations 6
                              Results: Logit
Model:                  Logit               Pseudo R-squared:    0.130     
Dependent Variable:     Label               AIC:                 12390.6238
Date:                   2022-05-09 10:14    BIC:                 12580.9042
No. Observations:       11142               Log-Likelihood:      -6169.3   
Df Model:               25                  LL-Null:             -7092.0   
Df Residuals:           11116               LLR p-value:         0.0000    
Converged:              1.0000              Scale:               1.0000    
No. Iterations:         6.0000                                             
---------------------------------------------------------------------------
                         Coef.   Std.Err.    z     P>|z|   [0.025   0.975] 
---------------------------------------------------------------------------
ID               

In [15]:
logreg = LogisticRegression()
logreg.fit(X, y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [16]:
y_pred = logreg.predict(X_val)

In [17]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_val, y_val)))
print(f"exact: {(val_df['Label']==y_pred).mean()}")

Accuracy of logistic regression classifier on test set: 0.91
exact: 0.91425


In [18]:
(val_df['Label']==y_pred).mean()

0.91425

In [19]:
from sklearn.metrics import f1_score

print('Val F1: ', f1_score(y_val, y_pred, average='macro'))

# print(f1_score(y_val, y_pred, average='micro'))
# print(f1_score(y_val, y_pred, average='weighted'))
# print(f1_score(y_val, y_pred, average=None))

Val F1:  0.5901726263392947


In [20]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()
xgbc.fit(X, y)
y_train_pred = xgbc.predict(X)
print(f'Train F1: {f1_score(y,y_train_pred)}')
y_val_pred = xgbc.predict(X_val)
print(f'Val F1: {f1_score(y_val,y_val_pred)}')
y_test_pred = xgbc.predict(X_test)
print(f'Val F1: {f1_score(y_test,y_test_pred)}')

Train F1: 0.9951298701298701
Val F1: 0.7441860465116278
Val F1: 0.690576652601969


In [21]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X,y)
y_train_pred = clf.predict(X)
print(f'Train F1: {f1_score(y, y_train_pred)}')
y_val_pred = clf.predict(X_val)
print(f'Val F1: {f1_score(y_val, y_val_pred)}')
y_test_pred = clf.predict(X_test)
print(f'Val F1: {f1_score(y_test, y_test_pred)}')

  clf.fit(X,y)


Train F1: 1.0
Val F1: 0.7269230769230769
Val F1: 0.6990014265335237


In [22]:
# clf.summary()