# **Data science workshop project**
**Team:** Elad, Tsach, Oz, Eva

**Problem:** The predictors of in-hospital mortality for admitted patients remain poorly characterized.

**Goal of the project**: We aimed to develop and validate a prediction model for all-cause in-hospital mortality among admitted patients.


In [1]:
#data processing
import pandas as pd
import numpy as np

#data visualizations
import seaborn as sns
import matplotlib.pyplot as plt


%matplotlib inline
#import plotly.expess as px -not working in Eva after installing with: pip install plotly

#Machine learning library
import sklearn

In [2]:
complete_data = pd.read_csv('data/dataset.csv') #load the dataset

In [3]:
complete_data.head() #beginning of the dataset

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,Unnamed: 83,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,,0


In [4]:
complete_data.shape   #dataset sizes: 85 columns and 91713 rows/patients in the original dataset

(91713, 85)

In [5]:
#column / features and data types
complete_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91713 entries, 0 to 91712
Data columns (total 85 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   encounter_id                   91713 non-null  int64  
 1   patient_id                     91713 non-null  int64  
 2   hospital_id                    91713 non-null  int64  
 3   age                            87485 non-null  float64
 4   bmi                            88284 non-null  float64
 5   elective_surgery               91713 non-null  int64  
 6   ethnicity                      90318 non-null  object 
 7   gender                         91688 non-null  object 
 8   height                         90379 non-null  float64
 9   icu_admit_source               91601 non-null  object 
 10  icu_id                         91713 non-null  int64  
 11  icu_stay_type                  91713 non-null  object 
 12  icu_type                       91713 non-null 

In [6]:
class_prediction = "hospital_death" #label


In [7]:
complete_data.drop(complete_data.columns[[0,1,83]], axis=1, inplace=True) #delete feature 83

In [8]:
numerical_features = ["age", "bmi","height", "weight", 
                      "pre_icu_los_days", "gcs_eyes_apache","apache_2_diagnosis",
                      "gcs_motor_apache", "gcs_verbal_apache", "heart_rate_apache",
                     "map_apache", "resprate_apache", "temp_apache", "d1_diasbp_max",
                      "d1_diasbp_min","d1_diasbp_noninvasive_max", "d1_diasbp_noninvasive_min",
                      "d1_heartrate_max", "d1_heartrate_min", "d1_mbp_max", "d1_mbp_min", 
                      "d1_mbp_noninvasive_max", "d1_mbp_noninvasive_min", "d1_resprate_max", "d1_resprate_min",
                      "d1_spo2_max", "d1_spo2_min", "d1_sysbp_max", "d1_sysbp_min",
                      "d1_sysbp_noninvasive_max", "d1_sysbp_noninvasive_min", "d1_temp_max", "d1_temp_min",
                      "h1_diasbp_max", "h1_diasbp_min", "h1_diasbp_noninvasive_max", "h1_diasbp_noninvasive_min",
                      "h1_heartrate_max", "h1_heartrate_min", "h1_mbp_max", "h1_mbp_min",
                      "h1_mbp_noninvasive_max", "h1_mbp_noninvasive_min", "h1_resprate_max", "h1_resprate_min",
                      "h1_spo2_max", "h1_spo2_min", "h1_sysbp_max", "h1_sysbp_min",
                      "h1_sysbp_noninvasive_max", "h1_sysbp_noninvasive_min", "d1_glucose_max", "d1_glucose_min",
                      "d1_potassium_max", "d1_potassium_min", "apache_4a_hospital_death_prob", "apache_4a_icu_death_prob",
                      "apache_3j_diagnosis"]

categorial_features = ["hospital_id", "ethnicity", "gender", "icu_admit_source", "apache_3j_bodysystem", "apache_2_bodysystem", "icu_stay_type", "icu_type"]

binary_features = ["arf_apache", "gcs_unable_apache", "intubated_apache", 
                   "ventilated_apache", "elective_surgery", "apache_post_operative",
                   "aids", "cirrhosis", "diabetes_mellitus", "hepatic_failure", "immunosuppression",
                   "leukemia", "lymphoma", "solid_tumor_with_metastasis"]

In [9]:
complete_data = pd.get_dummies(complete_data,columns=categorial_features)
complete_data = pd.get_dummies(complete_data,columns=binary_features)
#reminder: categorial_features = ["hospital_id", "ethnicity", "gender", "icu_admit_source", "apache_3j_bodysystem", "apache_2_bodysystem"]
complete_data.info(verbose=True) #at the end of the list: new columns for categorical features


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91713 entries, 0 to 91712
Data columns (total 280 columns):
 #    Column                                      Dtype  
---   ------                                      -----  
 0    age                                         float64
 1    bmi                                         float64
 2    height                                      float64
 3    icu_id                                      int64  
 4    pre_icu_los_days                            float64
 5    weight                                      float64
 6    apache_2_diagnosis                          float64
 7    apache_3j_diagnosis                         float64
 8    gcs_eyes_apache                             float64
 9    gcs_motor_apache                            float64
 10   gcs_verbal_apache                           float64
 11   heart_rate_apache                           float64
 12   map_apache                                  float64
 13   resprate_apach

In [10]:
# missing_data = complete_data[complete_data.isnull().any(axis=1)]

# Filling missing NUMERICAL values.
for feature in numerical_features:
  mean_value = complete_data[feature].mean()
  complete_data[feature].fillna(value=mean_value, inplace=True)


features_with_missing_values = complete_data.columns[complete_data.isnull().any()]
for f in features_with_missing_values:
    name = "missing " + f
    if f in numerical_features:
        complete_data[name] = (complete_data[f].isnull()).astype(int)




In [11]:
pd.set_option('display.max_rows', 100)

complete_data.isnull().sum(axis=0).sort_values(ascending=False)

age                                0
hospital_id_176                    0
hospital_id_183                    0
hospital_id_182                    0
hospital_id_181                    0
                                  ..
hospital_id_51                     0
hospital_id_53                     0
hospital_id_54                     0
hospital_id_55                     0
solid_tumor_with_metastasis_1.0    0
Length: 280, dtype: int64

In [12]:
true_values = complete_data.hospital_death
train_data = complete_data.drop('hospital_death', axis=1)


# Run Model

In [13]:
from Model.ModelModule import DSWorkshopModel 

model = DSWorkshopModel(complete_data)
model.print_details()

Data Shape: (91713, 280)
Data preview:


Unnamed: 0,age,bmi,height,icu_id,pre_icu_los_days,weight,apache_2_diagnosis,apache_3j_diagnosis,gcs_eyes_apache,gcs_motor_apache,...,hepatic_failure_0.0,hepatic_failure_1.0,immunosuppression_0.0,immunosuppression_1.0,leukemia_0.0,leukemia_1.0,lymphoma_0.0,lymphoma_1.0,solid_tumor_with_metastasis_0.0,solid_tumor_with_metastasis_1.0
0,68.0,22.73,180.3,92,0.541667,73.9,113.0,502.01,3.0,6.0,...,1,0,1,0,1,0,1,0,1,0
1,77.0,27.42,160.0,90,0.927778,70.2,108.0,203.01,1.0,3.0,...,1,0,1,0,1,0,1,0,1,0
2,25.0,31.95,172.7,93,0.000694,95.3,122.0,703.03,3.0,6.0,...,1,0,1,0,1,0,1,0,1,0
3,81.0,22.64,165.1,92,0.000694,61.7,203.0,1206.03,4.0,6.0,...,1,0,1,0,1,0,1,0,1,0
4,19.0,29.185818,188.0,91,0.073611,84.02834,119.0,601.01,3.465049,5.471195,...,1,0,1,0,1,0,1,0,1,0


In [14]:
# Preparing the data
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer




x_train, x_test, y_train, y_test = train_test_split(train_data, true_values, test_size=0.2, stratify=true_values, shuffle=True)


# imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# imp = imp.fit(x_train)
# x_train = imp.transform(x_train)

In [15]:
model.set_split(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [16]:
model.train()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
models_predictions, pred_results = model.test()

In [None]:
print(pred_results)

             Method  precision_score  recall_score  f1_score  accuracy_score  \
0      RandomForest         0.753906      0.243841  0.368496        0.927874   
1        ExtraTrees         0.790123      0.202148  0.321932        0.926511   
2  GradientBoosting         0.717523      0.300063  0.423163        0.929401   
3           Votingr         0.745327      0.201516  0.317255        0.925149   

    time   
0  42.23s  
1   44.9s  
2  94.47s  
3  39.86s  
