# Load data

## Run the next two cells if you are using Google Colab with the data stored on Google Drive.
## If you are not using Google Colab, point the notebook at the data's location on your local drive instead.

In [None]:
# Mount Google Drive so files stored there are reachable from this runtime.
# The google.colab package is only importable inside Colab; anywhere else the
# mount is skipped instead of stopping the notebook with an import error.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available - skipping the Google Drive mount')
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Navigate data to its location
cd /content/drive/MyDrive/Colab Notebooks/WiDS_2021

/content/drive/MyDrive/Colab Notebooks/WiDS_2021


In [None]:
# Load libraries
import numpy as np
import pandas as pd

In [None]:
# Read the training set from the current working directory into a DataFrame
training_csv = 'TrainingWiDS2021.csv'
training_data = pd.read_csv(training_csv)

In [None]:
# Peek at the first record, then report the (rows, columns) dimensions
first_record = training_data.iloc[:1]
print(first_record)
print('training data size is:', training_data.shape)

   Unnamed: 0  encounter_id  ...  solid_tumor_with_metastasis  diabetes_mellitus
0           1        214826  ...                            0                  1

[1 rows x 181 columns]
training data size is: (130157, 181)


In [None]:
# Separate the dependent variable (target) from the independent variables
target_column = 'diabetes_mellitus'
# encounter_id and hospital_id should not be included as predictors
excluded_columns = ['encounter_id', 'hospital_id']

y = training_data[target_column]
training_data = training_data.drop(columns=[target_column, *excluded_columns])

# Report the dimensions of the target and of the remaining predictor frame
for part in (y, training_data):
    print(part.shape)

(130157,)
(130157, 178)


In [None]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV, then
# check which feature variables contain NA values.
# The column is dropped by name rather than by position so that re-running
# this cell cannot strip a real feature (a positional slice would remove
# `age` on the second run, `bmi` on the third, and so on).
training_data = training_data.drop(columns=['Unnamed: 0'], errors='ignore')
training_data.isnull().any()

age                             True
bmi                             True
elective_surgery               False
ethnicity                       True
gender                          True
                               ...  
hepatic_failure                False
immunosuppression              False
leukemia                       False
lymphoma                       False
solid_tumor_with_metastasis    False
Length: 177, dtype: bool

In [None]:
# Show the dtype pandas assigned to each column
column_dtypes = training_data.dtypes
print(column_dtypes)

age                            float64
bmi                            float64
elective_surgery                 int64
ethnicity                       object
gender                          object
                                ...   
hepatic_failure                  int64
immunosuppression                int64
leukemia                         int64
lymphoma                         int64
solid_tumor_with_metastasis      int64
Length: 177, dtype: object


In [None]:
# List the distinct dtypes that occur among the columns of training_data
dtype_values = training_data.dtypes.to_numpy()
np.unique(dtype_values)

array([dtype('int64'), dtype('float64'), dtype('O')], dtype=object)

In [None]:
# Check whether the dependent variable (the target y) has any NA values
y.isna().any()

False

In [None]:
# Assuming we want to keep all variables as predictors, add a companion
# indicator column for every non-string column that contains NA values:
#   <column>_0_1 == 1.0  ->  the value is present
#   <column>_0_1 == 0.0  ->  the value is NA
# String (object dtype) columns are skipped for now.
# NOTE: the source columns themselves are not modified here, so their NA
# values still have to be filled in a later step.

def add_observed_indicators(frame, suffix='_0_1'):
    """Return ``frame`` plus one 0/1 indicator column per numeric column with NAs.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    suffix : str, default '_0_1'
        Text appended to a source column name to build the indicator name.

    Returns
    -------
    pandas.DataFrame
        The columns of ``frame`` followed by the indicator columns (float64,
        1.0 where the source value is present and 0.0 where it is NA).

    Notes
    -----
    The indicator depends only on whether a value is NA, so a genuine 0 in the
    source column is reported as present. The lookup is label based, so the
    result is correct for any row index. Existing indicator columns with the
    same names are replaced, which keeps the cell safe to re-run.
    """
    source_columns = [
        name for name in frame.columns
        if frame[name].dtype != np.dtype('O') and frame[name].isna().any()
    ]
    indicators = (
        frame[source_columns]
        .notna()
        .astype('float64')
        .add_suffix(suffix)
    )
    # Drop indicators left over from a previous run before appending new ones.
    base = frame.drop(columns=indicators.columns, errors='ignore')
    # A single concat avoids inserting the new columns one at a time.
    return pd.concat([base, indicators], axis=1)


training_data = add_observed_indicators(training_data)


In [None]:
# TODO: clean the categorical (object dtype, i.e. dtype('O')) columns, e.g. ethnicity

In [None]:
# Preview the first five rows of the updated dataset
training_data.head(n=5)


Unnamed: 0.1,Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,...,h1_calcium_max_0_1,h1_calcium_min_0_1,h1_creatinine_max_0_1,h1_creatinine_min_0_1,h1_glucose_max_0_1,h1_glucose_min_0_1,h1_hco3_max_0_1,h1_hco3_min_0_1,h1_hemaglobin_max_0_1,h1_hemaglobin_min_0_1,h1_hematocrit_max_0_1,h1_hematocrit_min_0_1,h1_inr_max_0_1,h1_inr_min_0_1,h1_lactate_max_0_1,h1_lactate_min_0_1,h1_platelets_max_0_1,h1_platelets_min_0_1,h1_potassium_max_0_1,h1_potassium_min_0_1,h1_sodium_max_0_1,h1_sodium_min_0_1,h1_wbc_max_0_1,h1_wbc_min_0_1,d1_arterial_pco2_max_0_1,d1_arterial_pco2_min_0_1,d1_arterial_ph_max_0_1,d1_arterial_ph_min_0_1,d1_arterial_po2_max_0_1,d1_arterial_po2_min_0_1,d1_pao2fio2ratio_max_0_1,d1_pao2fio2ratio_min_0_1,h1_arterial_pco2_max_0_1,h1_arterial_pco2_min_0_1,h1_arterial_ph_max_0_1,h1_arterial_ph_min_0_1,h1_arterial_po2_max_0_1,h1_arterial_po2_min_0_1,h1_pao2fio2ratio_max_0_1,h1_pao2fio2ratio_min_0_1
0,1,68.0,22.732803,0,Caucasian,M,180.3,Floor,Floor,92,admit,CTICU,0.541667,0,73.9,2.3,113.0,502.01,0,0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0,40.0,,,,,36.0,134.0,39.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,77.0,27.421875,0,Caucasian,F,160.0,Floor,Floor,90,admit,Med-Surg ICU,0.927778,0,70.2,,108.0,203.01,0,0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,3,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,93,admit,Med-Surg ICU,0.000694,0,95.3,,122.0,703.03,0,0,,,,,3.0,6.0,0.0,5.0,,102.0,,0,68.0,,,,,37.0,,36.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,92,admit,CTICU,0.000694,0,61.7,,203.0,1206.03,1,0,,,,0.6,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1,60.0,30.0,30.0,142.0,7.39,4.0,,34.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,5,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,91,admit,Med-Surg ICU,0.073611,0,,,119.0,601.01,0,0,,,,,,,,,,60.0,,0,103.0,,,,,16.0,,36.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
