In [1]:
import numpy as np
from pandas import pandas as pd
import time
import copy
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer, classification_report, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [2]:
dt = pd.read_csv("../dataset_with_labels.csv", engine='python').drop('Unnamed: 0',axis=1).reset_index(drop=True)
data = copy.copy(dt)
data.head()

Unnamed: 0,subject_id,icustay_id,creatinine,age,arterial_pressure_systolic,arterial_pressure_systolic_delay,arterial_pressure_diastolic,arterial_pressure_diastolic_delay,heart_rate,heart_rate_delay,...,bilirubin,bilirubin_delay,c_reactive_protein,c_reactive_protein_delay,ethnicity,diagnosis,gender,creatinine_yesterday,creatinine_before_yesterday,label
0,77815,239231,0.6,58.363217,155.0,1800.0,75.0,1800.0,58.0,1800.0,...,,,,,WHITE,INTRACRANIAL HEMORRHAGE,M,,0.7,2.0
1,31558,249349,0.6,52.688716,165.0,1440.0,75.0,1440.0,102.0,1440.0,...,,,,,WHITE,S/P BOATING ACCIDENT,M,0.6,0.6,2.0
2,60897,282836,2.0,67.864024,84.0,4200.0,50.0,4200.0,104.0,4260.0,...,,,,,WHITE,SEPSIS,M,1.9,,1.0
3,85141,251157,0.7,72.399244,120.0,60.0,74.0,60.0,103.0,60.0,...,,,,,WHITE,LOWER GASTROINTESTINAL BLEED,M,,,2.0
4,27172,248284,1.4,53.707219,144.0,900.0,75.0,900.0,67.0,900.0,...,,,,,WHITE,ALCHOLIC CIRRHOSIS\EGD ** REMOTE EAST STONEMAN...,M,,1.3,2.0


In [3]:
data.shape

(36251, 64)

In [4]:
data = dt
# Remove outliers
feat_names = ['creatinine', 
#               'creatinine_yesterday', 
#               'creatinine_before_yesterday', 
              'diagnosis',
              'arterial_pressure_systolic',  
              'age', 
              'gender', 
              'arterial_pressure_diastolic',
              'heart_rate',  
              'weight_daily', 
              'temperature',  
              'ph_blood', 
              'ethnicity']
              

# ['creatinine_yesterday', 'creatinine_before_yesterday',
#               'urea', 'potassium','gender', 'ethnicity', 'diagnosis', 'gender', 'ethnicity', '',
#               'platelet_count', 'ph_blood',
#               'creatinine', 'age', 'bilirubin']
# feat_names = feat_names + ['arterial_pressure_systolic_delay', 'arterial_pressure_diastolic_delay',
#                            'heart_rate_delay', 'weight_daily_delay', 'temperature_delay']
feat_lab = feat_names + ['label']
print('Initial data length: ', len(data))
data = data[feat_lab]

# Remove outliers
data = data[data['creatinine'] < 20]
# data = data[data['creatinine_yesterday'] < 20]
# data = data[data['creatinine_before_yesterday'] < 20]
# data = data[data['potassium'] > 1.2]
# data = data[data['ph_blood'] > 6]
data = data[data['age'] < 110]
# data = data[data['bilirubin'] < 20]
data['gender'] = data['gender'].map({'F': 1, 'M': 0})

# Remove Nan
for name in feat_names:
    try:
        data = data[np.isfinite(data[name])]
    except:
        print(name)
        data = data[data[name].isnull() == False]
print('\nTotal entries: ', len(data))
print(data.isnull().sum())

Initial data length:  36251
diagnosis
ethnicity

Total entries:  19991
creatinine                     0
diagnosis                      0
arterial_pressure_systolic     0
age                            0
gender                         0
arterial_pressure_diastolic    0
heart_rate                     0
weight_daily                   0
temperature                    0
ph_blood                       0
ethnicity                      0
label                          0
dtype: int64


In [5]:
data['ethnicity'].unique()



data['ethnicity'] = data['ethnicity'].map({'BLACK/AFRICAN AMERICAN': 'BLACK', 
                                           'WHITE': 'WHITE', 
                                           'UNKNOWN/NOT SPECIFIED': 'NaN',
                                           'HISPANIC/LATINO - DOMINICAN': 'OTHER', 
                                           'UNABLE TO OBTAIN': 'NaN',
                                           'PATIENT DECLINED TO ANSWER': 'NaN', 
                                           'ASIAN - CHINESE': 'ASIAN',
                                           'AMERICAN INDIAN/ALASKA NATIVE': 'OTHER', 
                                           'MULTI RACE ETHNICITY': 'OTHER',
                                           'WHITE - OTHER EUROPEAN': 'WHITE', 
                                           'OTHER': 'OTHER', 
                                           'PORTUGUESE': 'WHITE',
                                           'HISPANIC OR LATINO': 'OTHER', 
                                           'ASIAN': 'ASIAN', 
                                           'HISPANIC/LATINO - PUERTO RICAN': 'OTHER',
                                           'MIDDLE EASTERN': 'OTHER', 
                                           'ASIAN - KOREAN': 'ASIAN', 
                                           'BLACK/HAITIAN': 'BLACK',
                                           'ASIAN - OTHER': 'ASIAN', 
                                           'HISPANIC/LATINO - CUBAN': 'OTHER', 
                                           'ASIAN - FILIPINO': 'ASIAN',
                                           'BLACK/CAPE VERDEAN': 'BLACK', 
                                           'WHITE - BRAZILIAN': 'WHITE', 
                                           'ASIAN - ASIAN INDIAN': 'ASIAN',
                                           'WHITE - EASTERN EUROPEAN': 'WHITE', 
                                           'HISPANIC/LATINO - GUATEMALAN': 'OTHER',
                                           'ASIAN - VIETNAMESE': 'ASIAN', 
                                           'HISPANIC/LATINO - MEXICAN': 'OTHER',
                                           'WHITE - RUSSIAN': 'WHITE', 
                                           'BLACK/AFRICAN': 'BLACK', 
                                           'ASIAN - CAMBODIAN': 'ASIAN',
                                           'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE': 'OTHER'})
data['ethnicity'].unique()
data.tail()

Unnamed: 0,creatinine,diagnosis,arterial_pressure_systolic,age,gender,arterial_pressure_diastolic,heart_rate,weight_daily,temperature,ph_blood,ethnicity,label
35827,0.8,CORONARY ARTERY DISEASE,114.0,67.107265,0.0,63.0,119.0,128.2,98.3,7.41,WHITE,0.0
35828,0.5,CERVICAL MYELOPATHY,108.0,56.269295,1.0,59.0,54.0,51.0,98.6,7.47,WHITE,2.0
35829,3.6,COLON POLYP,113.0,86.46992,1.0,56.0,96.0,102.5,99.2,7.36,WHITE,2.0
35831,1.2,FEVER,114.0,28.854701,1.0,63.0,117.0,56.7,98.0,7.5,WHITE,0.0
35832,0.7,ABNORMAL ENDOSCOPY\EUS,114.0,77.37782,1.0,54.0,70.0,90.0,96.7,7.54,WHITE,2.0


In [6]:
len(data['diagnosis'].unique())
diag_to_stay = ['PNEUMONIA', 'CONGESTIVE HEART FAILURE', 'SUBARACHNOID HEMORRHAGE',
               'INTRACRANIAL HEMORRHAGE', 'ALTERED MENTAL STATUS', 'CORONARY ARTERY DISEASE',
               'ABDOMINAL PAIN', 'CHEST PAIN', 'HYPOTENSION', 'ACUTE RENAL FAILURE',
               'RESPIRATORY FAILURE', 'GASTROINTESTINAL BLEED', 'PANCREATITIS', 'SEPSIS']
'YS' in data['diagnosis'].iloc[1]
for i in range(len(data['diagnosis'])):
    value = data['diagnosis'].iat[i]
    if 'FEVER' in value:
        data['diagnosis'].iat[i] = 'SEPSIS'
    elif 'DYSPNEA' in value or 'SHORTNESS OF BREATH' in value:
        data['diagnosis'].iat[i] = 'RESPIRATORY FAILURE'
    elif value not in diag_to_stay:
        data['diagnosis'].iat[i] = 'OTHER'
data['diagnosis'].unique()

array(['OTHER', 'SEPSIS', 'INTRACRANIAL HEMORRHAGE',
       'SUBARACHNOID HEMORRHAGE', 'PANCREATITIS', 'PNEUMONIA',
       'HYPOTENSION', 'ABDOMINAL PAIN', 'CHEST PAIN',
       'RESPIRATORY FAILURE', 'CORONARY ARTERY DISEASE',
       'ALTERED MENTAL STATUS', 'CONGESTIVE HEART FAILURE',
       'GASTROINTESTINAL BLEED', 'ACUTE RENAL FAILURE'], dtype=object)

In [7]:
print(feat_names)
data = pd.get_dummies(data)
data.head()

['creatinine', 'diagnosis', 'arterial_pressure_systolic', 'age', 'gender', 'arterial_pressure_diastolic', 'heart_rate', 'weight_daily', 'temperature', 'ph_blood', 'ethnicity']


Unnamed: 0,creatinine,arterial_pressure_systolic,age,gender,arterial_pressure_diastolic,heart_rate,weight_daily,temperature,ph_blood,label,...,diagnosis_PANCREATITIS,diagnosis_PNEUMONIA,diagnosis_RESPIRATORY FAILURE,diagnosis_SEPSIS,diagnosis_SUBARACHNOID HEMORRHAGE,ethnicity_ASIAN,ethnicity_BLACK,ethnicity_NaN,ethnicity_OTHER,ethnicity_WHITE
1,0.6,165.0,52.688716,0.0,75.0,102.0,110.0,98.599998,7.28,2.0,...,0,0,0,0,0,0,0,0,0,1
2,2.0,84.0,67.864024,0.0,50.0,104.0,115.8,100.3,7.46,1.0,...,0,0,0,1,0,0,0,0,0,1
4,1.4,144.0,53.707219,0.0,75.0,67.0,54.6,97.5,7.36,2.0,...,0,0,0,0,0,0,0,0,0,1
5,0.6,116.0,55.968185,1.0,62.0,123.0,106.0,99.4,7.37,2.0,...,0,0,0,0,0,0,0,0,0,1
6,0.8,95.0,72.82619,1.0,41.0,101.0,84.199997,99.099998,7.41,2.0,...,0,0,0,1,0,0,0,0,0,1


## Model selection part

In [8]:
y = data['label']
feat_names = list(data)
feat_names.remove('label')
X = data[feat_names]
X.head()

Unnamed: 0,creatinine,arterial_pressure_systolic,age,gender,arterial_pressure_diastolic,heart_rate,weight_daily,temperature,ph_blood,diagnosis_ABDOMINAL PAIN,...,diagnosis_PANCREATITIS,diagnosis_PNEUMONIA,diagnosis_RESPIRATORY FAILURE,diagnosis_SEPSIS,diagnosis_SUBARACHNOID HEMORRHAGE,ethnicity_ASIAN,ethnicity_BLACK,ethnicity_NaN,ethnicity_OTHER,ethnicity_WHITE
1,0.6,165.0,52.688716,0.0,75.0,102.0,110.0,98.599998,7.28,0,...,0,0,0,0,0,0,0,0,0,1
2,2.0,84.0,67.864024,0.0,50.0,104.0,115.8,100.3,7.46,0,...,0,0,0,1,0,0,0,0,0,1
4,1.4,144.0,53.707219,0.0,75.0,67.0,54.6,97.5,7.36,0,...,0,0,0,0,0,0,0,0,0,1
5,0.6,116.0,55.968185,1.0,62.0,123.0,106.0,99.4,7.37,0,...,0,0,0,0,0,0,0,0,0,1
6,0.8,95.0,72.82619,1.0,41.0,101.0,84.199997,99.099998,7.41,0,...,0,0,0,1,0,0,0,0,0,1


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Split to three test sets
test_size = len(X_test) // 2
X_validation = X_test[0:test_size]
y_validation = y_test[0:test_size]
print(X_test)

X_test = X_test[test_size:-1]
y_test = y_test[test_size:-1]

       creatinine  arterial_pressure_systolic        age  gender  \
15557         0.2                       104.0  86.825648     1.0   
11568         0.4                       119.0  48.126454     1.0   
3326          0.9                       122.0  80.264874     1.0   
16266         5.7                       109.0  79.427704     0.0   
22511         0.8                        91.0  48.662637     0.0   
35632         0.6                       152.0  55.641212     0.0   
7844          0.5                       127.0  80.395380     1.0   
18284         0.8                       136.0  63.715976     1.0   
23495         3.0                       135.0  50.875892     0.0   
29185         2.0                        93.0  67.326460     0.0   
23358         0.6                       130.0  75.138445     1.0   
10248         1.3                       127.0  63.737116     1.0   
17947         1.0                       118.0  55.486068     0.0   
14539         0.2                       157.0  6

In [10]:
print(len(list(X)))
print(y.value_counts()/len(y))

29
2.0    0.622630
1.0    0.206143
0.0    0.171227
Name: label, dtype: float64


### Train a XGBoost model with hyperparameters optim

In [11]:
import warnings
warnings.filterwarnings("ignore")

def hyp_tuning(X_train, X_validation, y_train):
    # Initialize the classifier

    clf = XGBClassifier(
        #n_thread=4,
        eval_metric='mlogloss',
        num_class= 3,
        objective= 'multi:softmax'
    )
    
    # Create cross-validation sets from the training data
    cv_sets = StratifiedShuffleSplit(n_splits=3, 
                                     test_size=0.1, 
                                     train_size=None, 
                                     random_state=42)

    # Create the parameters list you wish to tune
    parameters = {
        #'num_boost_round': [100, 250],
        'learning_rate': [0.5],        
        'max_depth': [6],#, 9, 12],
        #'subsample': [0.9, 1.0],
        #'colsample_bytree': [0.9, 1.0],
    }
    
    # Make an fbeta_score scoring object
    scorer = make_scorer(fbeta_score, beta = 0.5, average = 'weighted')

    # Perform grid search on the classifier using 'scorer' as the scoring method
    grid_obj = GridSearchCV(clf, 
                            param_grid = parameters, 
                            scoring = scorer, 
                            cv = cv_sets, 
                            n_jobs = 1) #-1)

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit = grid_obj.fit(X_train, y_train)

    # Get the estimator
    best_clf = grid_fit.best_estimator_

    # Make predictions using the unoptimized and model
    best_predictions = best_clf.predict(X_validation)
    return best_clf, best_predictions

# Report the before-and-afterscores
start = time.time()

best_clf, best_predictions = hyp_tuning(X_train, X_validation, y_train)

end = time.time()

In [12]:
print("\nTime elapsed: ", (end - start)/60, " min")
print("\nBest classifier: ", best_clf)
conf = confusion_matrix(y_validation, best_predictions)
print("Confusion matrix")
pprint(conf)

print("\nClassification report")
print(classification_report(y_validation, best_predictions))


Time elapsed:  0.3548282027244568  min

Best classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='mlogloss', gamma=0,
       learning_rate=0.5, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, num_class=3, objective='multi:softprob',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
Confusion matrix
array([[  57,   83,  176],
       [  49,  146,  210],
       [  38,  101, 1139]])

Classification report
             precision    recall  f1-score   support

        0.0       0.40      0.18      0.25       316
        1.0       0.44      0.36      0.40       405
        2.0       0.75      0.89      0.81      1278

avg / total       0.63      0.67      0.64      1999



### Final test set

In [13]:
best_predictions_test = best_clf.predict(X_test)
conf = confusion_matrix(y_test, best_predictions_test)

print("Confusion matrix")
pprint(conf)

print("\nClassification report")
print(classification_report(y_test, best_predictions_test))

Confusion matrix
array([[  75,   84,  213],
       [  59,  144,  185],
       [  40,  105, 1094]])

Classification report
             precision    recall  f1-score   support

        0.0       0.43      0.20      0.27       372
        1.0       0.43      0.37      0.40       388
        2.0       0.73      0.88      0.80      1239

avg / total       0.62      0.66      0.63      1999

