In [1]:
### Amit Balhara, Jade Zhang
### Data competition
### Extracted from Two notebooks - Traning_DT and Training_lgbm
### Visualization & other work are in separate files

In [105]:
# import packages
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


In [47]:
# import the dataset

# training data
data = pd.read_csv('training_v2.csv')
# test data (for submission)
submission = pd.read_csv('unlabeled.csv')

In [90]:
# define the feature columns of the training set: every column except the
# id columns, the target (hospital_death) and readmission_status
non_feature_columns = ('encounter_id', 'patient_id', 'hospital_death', 'readmission_status')
columns = [name for name in data.columns if name not in non_feature_columns]

In [49]:
columns

['hospital_id',
 'age',
 'bmi',
 'elective_surgery',
 'ethnicity',
 'gender',
 'height',
 'hospital_admit_source',
 'icu_admit_source',
 'icu_id',
 'icu_stay_type',
 'icu_type',
 'pre_icu_los_days',
 'weight',
 'albumin_apache',
 'apache_2_diagnosis',
 'apache_3j_diagnosis',
 'apache_post_operative',
 'arf_apache',
 'bilirubin_apache',
 'bun_apache',
 'creatinine_apache',
 'fio2_apache',
 'gcs_eyes_apache',
 'gcs_motor_apache',
 'gcs_unable_apache',
 'gcs_verbal_apache',
 'glucose_apache',
 'heart_rate_apache',
 'hematocrit_apache',
 'intubated_apache',
 'map_apache',
 'paco2_apache',
 'paco2_for_ph_apache',
 'pao2_apache',
 'ph_apache',
 'resprate_apache',
 'sodium_apache',
 'temp_apache',
 'urineoutput_apache',
 'ventilated_apache',
 'wbc_apache',
 'd1_diasbp_invasive_max',
 'd1_diasbp_invasive_min',
 'd1_diasbp_max',
 'd1_diasbp_min',
 'd1_diasbp_noninvasive_max',
 'd1_diasbp_noninvasive_min',
 'd1_heartrate_max',
 'd1_heartrate_min',
 'd1_mbp_invasive_max',
 'd1_mbp_invasive_min',


In [50]:
# collect, in column order, the feature columns whose dtype is 'object';
# these are treated as the categorical features
categorical_features = [name for name in columns if data[name].dtypes == 'object']
categorical_features

['ethnicity',
 'gender',
 'hospital_admit_source',
 'icu_admit_source',
 'icu_stay_type',
 'icu_type',
 'apache_3j_bodysystem',
 'apache_2_bodysystem']

In [51]:
# join the categorical data in the two sets as a dataframe
categorical = pd.concat([data[categorical_features],submission[categorical_features]])

In [52]:
categorical.head()

Unnamed: 0,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem
0,Caucasian,M,Floor,Floor,admit,CTICU,Sepsis,Cardiovascular
1,Caucasian,F,Floor,Floor,admit,Med-Surg ICU,Respiratory,Respiratory
2,Caucasian,F,Emergency Department,Accident & Emergency,admit,Med-Surg ICU,Metabolic,Metabolic
3,Caucasian,F,Operating Room,Operating Room / Recovery,admit,CTICU,Cardiovascular,Cardiovascular
4,Caucasian,M,,Accident & Emergency,admit,Med-Surg ICU,Trauma,Trauma


In [53]:
###### encode the categorical variables as numbers while leaving the null values untouched

In [54]:
#********************************
#********** this function is used to store the fit result of label encoders for the categorical columns
#********************************

def labelencoders(data, variables):
    """Fit one LabelEncoder per column, using only the non-missing values.

    Parameters
    ----------
    data : object indexable by column name (e.g. a DataFrame)
        Source of the values each encoder is fitted on.
    variables : iterable of column names
        Columns to build an encoder for.

    Returns
    -------
    list
        The fitted encoders, in the same order as ``variables``.
    """
    # fit() returns the encoder itself, so the comprehension collects the
    # fitted encoders directly; missing values are dropped before fitting
    return [
        LabelEncoder().fit(list(data[column].dropna()))
        for column in variables
    ]

In [55]:
#********************************
#********** this function is used to transform the labels to normalized encoding (aka numbers)
#********************************
def encoding(data, l, variables):
    """Encode the non-null values of each listed column, in place.

    Parameters
    ----------
    data : DataFrame
        Modified in place; nothing is returned.
    l : sequence of fitted encoders
        ``l[i]`` is applied (via ``transform``) to ``variables[i]``.
    variables : iterable of column names
        Columns to encode.
    """
    for position, column in enumerate(variables):
        # only rows holding a value are transformed; missing entries stay missing
        present = data[column].notnull()
        data.loc[present, column] = l[position].transform(data.loc[present, column])

In [59]:
# get the label encoders for all the categorical columns
labelencoder = labelencoders(categorical, categorical_features)
labelencoder

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [60]:
# label encoding two datasets
encoding(data,labelencoder,categorical_features)
encoding(submission,labelencoder,categorical_features)

In [61]:
data[categorical_features].head()

Unnamed: 0,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem
0,2,1,4.0,1,0,2,9,0
1,2,0,4.0,1,0,5,8,6
2,2,0,3.0,0,0,5,5,3
3,2,0,8.0,2,0,2,0,0
4,2,1,,0,0,5,10,7


In [62]:
submission[categorical_features].head()

Unnamed: 0,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem
0,2,1,3,0,0,6,7,4
1,2,0,8,2,0,5,7,4
2,0,1,8,2,0,0,0,0
3,2,1,8,2,0,5,8,6
4,2,1,3,0,0,6,7,4


In [65]:
submission[categorical_features].dtypes

ethnicity                object
gender                   object
hospital_admit_source    object
icu_admit_source         object
icu_stay_type             int32
icu_type                  int32
apache_3j_bodysystem     object
apache_2_bodysystem      object
dtype: object

In [66]:
# convert the data type of categorical features to float
# (after encoding() the columns hold a mix of object and int32 dtypes, and
# missing entries are still NaN; float gives every categorical column a single
# numeric dtype that can still represent NaN)

# for each dataframe
for df in [data, submission]:
    # for each column
    for x in categorical_features:
        # replace the column with its float version, one column at a time
        df[x] = df[x].astype(float)

In [70]:
# define the dependent variable
y = data['hospital_death']

In [71]:
# define the training parameters
param = {'task': 'train',
         'boosting': 'gbdt',
         'objective':'binary',
         'metric': 'auc',
         'num_leaves': 64, #10-80
         'min_data_in_leaf': 64,  #40-180
         'learning_rate': 0.01,
         'max_depth': 10,          #10-50
         'feature_fraction': 0.1,
         'bagging_freq': 1,
         'bagging_fraction': 0.75,
         'use_missing': True,
         'nthread': 8
        }

In [107]:
# k-folds validation
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
# out of fold
oof = np.zeros(len(data))
# roc_auc_score
scores = []
# predictions
predictions = np.zeros(len(submission))

In [85]:
for fold_, (train_idx, test_idx) in enumerate(folds.split(data, y.values)):
    print(fold_, train_idx, test_idx)

0 [    0     1     2 ... 91707 91709 91710] [    5     7    12 ... 91708 91711 91712]
1 [    0     1     2 ... 91710 91711 91712] [    4     8    11 ... 91687 91689 91696]
2 [    1     2     3 ... 91710 91711 91712] [    0    13    14 ... 91702 91705 91706]
3 [    0     1     2 ... 91710 91711 91712] [    3    18    21 ... 91700 91704 91709]
4 [    0     3     4 ... 91709 91711 91712] [    1     2     6 ... 91701 91707 91710]


In [98]:
# positions of the categorical features within the `columns` list
categorical_index = list(map(columns.index, categorical_features))
categorical_index

[4, 5, 7, 8, 10, 11, 180, 181]

In [108]:
# Settings passed to lgb.train; they are the same for every fold, so they are
# defined once instead of being re-assigned inside the loop.
num_boost_round = 7000
early_stopping_rounds = 100
verbose_eval = 200

# Reset the accumulators so that re-running this cell starts from scratch
# instead of appending to `scores` and adding another set of fold averages
# to `predictions`.
oof = np.zeros(len(data))                  # out-of-fold prediction for every training row
scores = []                                # validation AUC of each fold
predictions = np.zeros(len(submission))    # fold-averaged prediction for every submission row

for fold_, (train_idx, test_idx) in enumerate(folds.split(data, y.values)):
    print("fold: ", fold_)

    # The split yields row positions, so rows are always selected with .iloc
    # (label-based .loc only matches when the index is the default 0..n-1 range).
    X_train, y_train = data.iloc[train_idx][columns], y.iloc[train_idx]
    X_valid, y_valid = data.iloc[test_idx][columns], y.iloc[test_idx]

    # define training data and validation data
    train_data = lgb.Dataset(X_train, label = y_train)
    test_data = lgb.Dataset(X_valid, label = y_valid, reference = train_data)

    # train the model
    # NOTE(review): the early_stopping_rounds / verbose_eval keywords are the API
    # of the LightGBM release that produced the recorded output; newer releases
    # expect callbacks (lgb.early_stopping, lgb.log_evaluation) instead — confirm
    # the installed version before upgrading.
    clf = lgb.train(param, train_data, num_boost_round = num_boost_round, valid_sets = test_data, early_stopping_rounds = early_stopping_rounds, verbose_eval = verbose_eval, categorical_feature = categorical_index)

    # out-of-fold prediction for the rows held out in this fold
    oof[test_idx] = clf.predict(X_valid, num_iteration = clf.best_iteration)

    # fold AUC, computed from the predictions just stored rather than by
    # running the model over the validation rows a second time
    score = roc_auc_score(y_valid, oof[test_idx])
    scores.append(score)

    # each fold contributes 1/n_splits of the final submission prediction
    predictions += clf.predict(submission[columns], num_iteration = clf.best_iteration) / folds.n_splits
    
    

fold:  0


New categorical_feature is [4, 5, 7, 8, 10, 11, 180, 181]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.895731
[400]	valid_0's auc: 0.901359
[600]	valid_0's auc: 0.904841
[800]	valid_0's auc: 0.906707
[1000]	valid_0's auc: 0.90803
[1200]	valid_0's auc: 0.908562
[1400]	valid_0's auc: 0.909095
Early stopping, best iteration is:
[1421]	valid_0's auc: 0.909193
0.5163657221181815
mean: 0.9091929540749942
std: 0.0
fold:  1


New categorical_feature is [4, 5, 7, 8, 10, 11, 180, 181]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.896058
[400]	valid_0's auc: 0.901782
[600]	valid_0's auc: 0.904919
[800]	valid_0's auc: 0.906786
[1000]	valid_0's auc: 0.907812
[1200]	valid_0's auc: 0.908671
[1400]	valid_0's auc: 0.90918
[1600]	valid_0's auc: 0.9095
[1800]	valid_0's auc: 0.909623
Early stopping, best iteration is:
[1882]	valid_0's auc: 0.909746
0.5655072963188845
mean: 0.9094693092026408
std: 0.0002763551276465659
fold:  2


New categorical_feature is [4, 5, 7, 8, 10, 11, 180, 181]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.891266
[400]	valid_0's auc: 0.896977
[600]	valid_0's auc: 0.900571
[800]	valid_0's auc: 0.902299
[1000]	valid_0's auc: 0.903311
[1200]	valid_0's auc: 0.904015
[1400]	valid_0's auc: 0.904769
[1600]	valid_0's auc: 0.905026
[1800]	valid_0's auc: 0.90529
[2000]	valid_0's auc: 0.905401
Early stopping, best iteration is:
[2053]	valid_0's auc: 0.90549
0.6468769806017742
mean: 0.9081428648965666
std: 0.0018893977255422917
fold:  3


New categorical_feature is [4, 5, 7, 8, 10, 11, 180, 181]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.894618
[400]	valid_0's auc: 0.900089
[600]	valid_0's auc: 0.903682
[800]	valid_0's auc: 0.905422
[1000]	valid_0's auc: 0.906338
[1200]	valid_0's auc: 0.90693
[1400]	valid_0's auc: 0.907346
[1600]	valid_0's auc: 0.907528
[1800]	valid_0's auc: 0.90777
[2000]	valid_0's auc: 0.907761
Early stopping, best iteration is:
[1913]	valid_0's auc: 0.907832
0.7610884661015207
mean: 0.9080650608409067
std: 0.0016418063945528654
fold:  4


New categorical_feature is [4, 5, 7, 8, 10, 11, 180, 181]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.896202
[400]	valid_0's auc: 0.902035
[600]	valid_0's auc: 0.905479
[800]	valid_0's auc: 0.907127
[1000]	valid_0's auc: 0.908254
[1200]	valid_0's auc: 0.90899
[1400]	valid_0's auc: 0.909405
[1600]	valid_0's auc: 0.909668
[1800]	valid_0's auc: 0.909956
[2000]	valid_0's auc: 0.910132
[2200]	valid_0's auc: 0.910411
[2400]	valid_0's auc: 0.91045
Early stopping, best iteration is:
[2388]	valid_0's auc: 0.910484
0.908459657000575
mean: 0.9085488596299779
std: 0.001758598209649655


In [109]:
# AUC over the pooled out-of-fold predictions of all folds
AUC = roc_auc_score(y, oof)
print(AUC)

# mean and standard deviation of the per-fold AUCs
fold_scores = np.array(scores)
print("mean: " + str(fold_scores.mean()))
print("std: " + str(fold_scores.std()))

0.908459657000575
mean: 0.9085488596299779
std: 0.001758598209649655


In [110]:
predictions

array([0.0112759 , 0.01993335, 0.01637289, ..., 0.07142298, 0.00766487,
       0.13260973])

In [None]:
# Build the submission file: one row per row of the unlabeled set, pairing its
# encounter_id with the fold-averaged prediction for hospital_death.
# `submission` is the frame read from 'unlabeled.csv' and `predictions` holds
# one value per row of it, so the two line up row by row.
# NOTE(review): assumes 'unlabeled.csv' carries an 'encounter_id' column like
# the training file does — confirm.
df_sub = pd.DataFrame({'encounter_id': submission['encounter_id']})
df_sub['hospital_death'] = predictions

df_sub.to_csv("sub1.csv",index=False)