In [30]:
#data processing
import pandas as pd
import numpy as np

#data visualizations
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

#Machine learning library
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from ModelModule import DSWorkshopModel  # separate python file which hold data and run models

In [31]:
# Column-name groups used by the preprocessing helpers in this notebook.

# Columns treated as numeric: fill_missing_num_values_with_mean() iterates this
# list and fills missing entries with the column mean.
# NOTE(review): the *_diagnosis entries look like coded identifiers rather than
# measurements — confirm that mean imputation is intended for them.
numerical_features = ["age", "bmi","height", "weight", 
                      "pre_icu_los_days", "gcs_eyes_apache","apache_2_diagnosis",
                      "gcs_motor_apache", "gcs_verbal_apache", "heart_rate_apache",
                     "map_apache", "resprate_apache", "temp_apache", "d1_diasbp_max",
                      "d1_diasbp_min","d1_diasbp_noninvasive_max", "d1_diasbp_noninvasive_min",
                      "d1_heartrate_max", "d1_heartrate_min", "d1_mbp_max", "d1_mbp_min", 
                      "d1_mbp_noninvasive_max", "d1_mbp_noninvasive_min", "d1_resprate_max", "d1_resprate_min",
                      "d1_spo2_max", "d1_spo2_min", "d1_sysbp_max", "d1_sysbp_min",
                      "d1_sysbp_noninvasive_max", "d1_sysbp_noninvasive_min", "d1_temp_max", "d1_temp_min",
                      "h1_diasbp_max", "h1_diasbp_min", "h1_diasbp_noninvasive_max", "h1_diasbp_noninvasive_min",
                      "h1_heartrate_max", "h1_heartrate_min", "h1_mbp_max", "h1_mbp_min",
                      "h1_mbp_noninvasive_max", "h1_mbp_noninvasive_min", "h1_resprate_max", "h1_resprate_min",
                      "h1_spo2_max", "h1_spo2_min", "h1_sysbp_max", "h1_sysbp_min",
                      "h1_sysbp_noninvasive_max", "h1_sysbp_noninvasive_min", "d1_glucose_max", "d1_glucose_min",
                      "d1_potassium_max", "d1_potassium_min", "apache_4a_hospital_death_prob", "apache_4a_icu_death_prob",
                      "apache_3j_diagnosis"]

# Columns that getBasicDataset() one-hot encodes via pd.get_dummies.
categorial_features = ["hospital_id", "ethnicity", "gender", "icu_admit_source", "apache_3j_bodysystem", "apache_2_bodysystem", "icu_stay_type", "icu_type"]

# Flag columns handled by fill_missing_values_binary(): missing entries become 0
# and, where a column had any missing entry, a "missing <name>" indicator is added.
binary_features = ["arf_apache", "gcs_unable_apache", "intubated_apache", 
                   "ventilated_apache", "elective_surgery", "apache_post_operative",
                   "aids", "cirrhosis", "diabetes_mellitus", "hepatic_failure", "immunosuppression",
                   "leukemia", "lymphoma", "solid_tumor_with_metastasis"]

In [32]:
def getBasicDataset():
    """Load the raw CSV, drop three columns by position, and one-hot encode.

    Reads ``../data/dataset.csv``, removes the columns at positions 0, 1 and
    83 of the file as read, then expands every column named in
    ``categorial_features`` into indicator columns with ``pd.get_dummies``.

    Returns:
        pandas.DataFrame: the trimmed, one-hot-encoded frame.
    """
    raw = pd.read_csv('../data/dataset.csv')
    # Positional selection: these indices refer to the CSV's column order,
    # so they must be revisited if the file layout changes.
    unwanted = raw.columns[[0, 1, 83]]
    trimmed = raw.drop(columns=unwanted)
    return pd.get_dummies(trimmed, columns=categorial_features)

In [33]:
# Filling missing NUMERICAL values.
def fill_missing_num_values_with_mean(complete_data, features=None):
    """Replace missing values in numeric columns with that column's mean.

    The frame is modified in place and also returned, so both
    ``df = f(df)`` and a bare ``f(df)`` call work.

    Args:
        complete_data: pandas.DataFrame to impute.
        features: optional iterable of column names to impute. Defaults to
            the notebook-level ``numerical_features`` list. Names that are
            not columns of ``complete_data`` are skipped.

    Returns:
        pandas.DataFrame: the same object that was passed in.
    """
    if features is None:
        features = numerical_features
    for feature in features:
        if feature not in complete_data.columns:
            continue
        mean_value = complete_data[feature].mean()
        # Assign the filled column back instead of calling
        # `complete_data[feature].fillna(..., inplace=True)`: that form mutates
        # an intermediate Series, which pandas warns about and which does not
        # update the frame once Copy-on-Write is active.
        complete_data[feature] = complete_data[feature].fillna(mean_value)
    return complete_data


In [34]:
def fill_missing_values_binary(df, features=None):
    """Fill missing binary flags with 0 and record where values were missing.

    For every listed column that contains at least one missing value, a new
    integer column named ``"missing <column>"`` is appended (1 where the
    original value was missing, 0 otherwise). Missing entries in the listed
    columns are then replaced by 0. The frame is modified in place and
    also returned.

    Args:
        df: pandas.DataFrame to process.
        features: optional iterable of column names to treat as binary.
            Defaults to the notebook-level ``binary_features`` list. Names
            that are not columns of ``df`` are skipped, matching how the
            numeric filler treats absent columns.

    Returns:
        pandas.DataFrame: the same object that was passed in.
    """
    if features is None:
        features = binary_features
    # Skip absent columns instead of raising KeyError on `df[features]`.
    present = [f for f in features if f in df.columns]
    for f in present:
        is_missing = df[f].isnull()
        if is_missing.any():
            # The indicator must be captured before the fill below erases it.
            df["missing " + f] = is_missing.astype(int)
        # change missing values to 0 in the original feature
        df[f] = df[f].fillna(0)
    return df

# Basic Check

In [35]:
# Load the encoded dataset, then impute numeric columns and binary flags in turn.
df1 = (
    getBasicDataset()
    .pipe(fill_missing_num_values_with_mean)
    .pipe(fill_missing_values_binary)
)

In [36]:
# Wrap the prepared frame in the project's helper class imported from ModelModule.
# NOTE(review): number_of_trees presumably sets the ensemble size — confirm in ModelModule.
model = DSWorkshopModel(df1, number_of_trees=100)
# Prints the data shape and a preview of the frame (see the cell output).
model.print_details()

Data Shape: (91713, 278)
Data preview:


Unnamed: 0,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,weight,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,...,missing intubated_apache,missing ventilated_apache,missing aids,missing cirrhosis,missing diabetes_mellitus,missing hepatic_failure,missing immunosuppression,missing leukemia,missing lymphoma,missing solid_tumor_with_metastasis
0,68.0,22.73,0,180.3,92,0.541667,73.9,113.0,502.01,0,...,0,0,0,0,0,0,0,0,0,0
1,77.0,27.42,0,160.0,90,0.927778,70.2,108.0,203.01,0,...,0,0,0,0,0,0,0,0,0,0
2,25.0,31.95,0,172.7,93,0.000694,95.3,122.0,703.03,0,...,0,0,0,0,0,0,0,0,0,0
3,81.0,22.64,1,165.1,92,0.000694,61.7,203.0,1206.03,1,...,0,0,0,0,0,0,0,0,0,0
4,19.0,29.185818,0,188.0,91,0.073611,84.02834,119.0,601.01,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Preparing the data
# Features are every column except the target; the target is `hospital_death`.
train_data = df1.drop(columns='hospital_death')
true_values = df1['hospital_death']
# Stratify on the target so the split follows its class labels, and pin the
# seed so the same rows land in train/test on every run (without it the
# metrics reported below change each time the notebook is executed).
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    true_values,
    test_size=0.2,
    stratify=true_values,
    shuffle=True,
    random_state=42,
)

In [38]:
# Hand the train/test partitions computed above to the model wrapper.
model.set_split(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [39]:
# Train the wrapper's model(s); the implementation lives in ModelModule.
model.train()

In [40]:
# test() returns two values; the second is printed as the per-method metrics table.
# NOTE(review): the first value presumably holds each model's predictions — confirm in ModelModule.
models_predictions, pred_results = model.test()
print(pred_results)

          Method  Precision Score  Recall Score  F1 Score  \
0   RandomForest         0.778226      0.243841  0.371332   
1     ExtraTrees         0.799486      0.196462  0.315416   
2  XGBClassifier         0.684343      0.342388  0.456421   

   Unbalanced Accuracy Score  Balanced Accuracy Score  \
0                   0.928747                 0.618639   
1                   0.926402                 0.595904   
2                   0.929619                 0.663736   

   Positive Accuracy Score  Negative Accuracy Score Time Needed for Training  
0                  0.24384                  0.99344                   13.84s  
1                  0.19646                  0.99535                   14.86s  
2                  0.34239                  0.98508                   28.68s  


## Temporary Model for Testing

In [50]:
# Same preprocessing steps as the basic check, applied to a freshly loaded frame.
temp_df = getBasicDataset()
temp_df = fill_missing_num_values_with_mean(temp_df)
temp_df = fill_missing_values_binary(temp_df)

temp_model = DSWorkshopModel(temp_df, number_of_trees=200)
temp_model.print_details()

temp_train_data = temp_df.drop(columns='hospital_death')
temp_true_values = temp_df['hospital_death']
# Stratify on this cell's own target. The previous `stratify=true_values`
# reached into a variable from an earlier cell, which fails on a fresh kernel
# if that cell has not run and silently mis-stratifies if the frames differ.
# The seed is pinned so the split is the same on every run.
# Note: x_train/x_test/y_train/y_test are deliberately reused names; this
# overwrites any earlier split held in those variables.
x_train, x_test, y_train, y_test = train_test_split(
    temp_train_data,
    temp_true_values,
    test_size=0.2,
    stratify=temp_true_values,
    shuffle=True,
    random_state=42,
)

Data Shape: (91713, 278)
Data preview:


In [51]:
# Hand the most recently computed train/test partitions to the temporary model wrapper.
temp_model.set_split(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [52]:
# Train the temporary wrapper's model(s); the implementation lives in ModelModule.
temp_model.train()

In [53]:
# test() returns two values; the second is printed as the per-method metrics table.
# NOTE(review): the first value presumably holds each model's predictions — confirm in ModelModule.
temp_models_predictions, temp_pred_results = temp_model.test()
print(temp_pred_results)

          Method  Precision Score  Recall Score  F1 Score  \
0   RandomForest         0.738964      0.243209  0.365970   
1     ExtraTrees         0.760705      0.190777  0.305051   
2  XGBClassifier         0.648235      0.348073  0.452939   

   Unbalanced Accuracy Score  Balanced Accuracy Score  \
0                   0.927275                 0.617547   
1                   0.924985                 0.592554   
2                   0.927438                 0.665117   

   Positive Accuracy Score  Negative Accuracy Score Time Needed for Training  
0                  0.24321                  0.99189                   29.12s  
1                  0.19078                  0.99433                   32.77s  
2                  0.34807                  0.98216                    67.3s  
