In [128]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, BayesianRidge, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the pandas and
# scikit-learn calls in later cells are hidden as well; consider narrowing it
# to the specific categories that are known to be noise.
warnings.simplefilter(action='ignore')

In [85]:
# Directory that receives the per-seed train/test CSV files written by the
# split loop at the end of this notebook.
cdiff_dir = "/data/volume02/CDAI/"

In [70]:
# Load the prepared cohort table; index_col=0 turns the first CSV column into
# the row index instead of a data column.
final_data = pd.read_csv("./final_data.csv", index_col=0)

In [71]:
# Preview the first rows to sanity-check the load.
final_data.head()

Unnamed: 0,icustay_id,age,gender,comobidities,ckd,cih,arf,diabetes,cardiop_disease,liver_disease,...,spo2,tempc,sysbp,diasbp,meanbp,vent,urineoutput,oasis,gcs,mort_hosp
0,200035,30.6612,1,1,0,0,0,1,0,0,...,92.0,36.277778,126.0,74.0,83.0,0,2950.0,24,15.0,1
1,200153,51.5414,0,0,0,0,0,0,0,0,...,100.0,36.888889,87.0,46.0,56.0,0,1820.0,21,15.0,0
2,200206,58.7653,1,0,0,0,1,0,0,0,...,100.0,36.222221,86.0,36.0,52.666698,0,,29,15.0,1
3,200550,79.5328,1,0,0,0,0,0,0,0,...,97.0,36.277778,130.0,75.0,90.0,0,2075.0,23,15.0,0
4,200608,71.1797,0,1,1,0,1,1,0,0,...,97.0,38.555556,138.0,55.0,76.0,0,2585.0,28,15.0,0


In [72]:
# List every available column before choosing the modelling subset.
final_data.columns

Index(['icustay_id', 'age', 'gender', 'comobidities', 'ckd', 'cih', 'arf',
       'diabetes', 'cardiop_disease', 'liver_disease', 'ibd', 'malignancy',
       'aniongap', 'albumin', 'bands', 'bicarbonate', 'bilirubin',
       'creatinine', 'chloride', 'glucose_lab', 'hematocrit', 'hemoglobin',
       'lactate', 'platelet', 'potassium', 'ptt', 'inr', 'pt', 'sodium', 'bun',
       'wbc', 'calcium_1st', 'freecalcium_1st', 'heartrate', 'resprate',
       'glucose', 'spo2', 'tempc', 'sysbp', 'diasbp', 'meanbp', 'vent',
       'urineoutput', 'oasis', 'gcs', 'mort_hosp'],
      dtype='object')

In [74]:
# Work on a copy so final_data stays as loaded.
data = final_data.copy()

In [75]:
# Columns carried into imputation and modelling, in the order they appear in
# final_data. Compared with final_data this leaves out 'bands', 'glucose',
# 'vent', 'urineoutput', 'oasis' and 'gcs'. 'mort_hosp' (last) is the column
# later used as the target y.
variables = [
    'icustay_id',
    'age',
    'gender',
    'comobidities',
    'ckd', 'cih', 'arf', 'diabetes',
    'cardiop_disease', 'liver_disease', 'ibd', 'malignancy',
    'aniongap', 'albumin', 'bicarbonate', 'bilirubin',
    'creatinine', 'chloride', 'glucose_lab',
    'hematocrit', 'hemoglobin',
    'lactate', 'platelet', 'potassium',
    'ptt', 'inr', 'pt',
    'sodium', 'bun', 'wbc',
    'calcium_1st', 'freecalcium_1st',
    'heartrate', 'resprate', 'spo2', 'tempc',
    'sysbp', 'diasbp', 'meanbp',
    'mort_hosp',
]
# Keep only the chosen columns (rebinds `data`; safe to re-run because the
# selected columns are still present afterwards).
data = data[variables]

# Imputation

In [76]:
# Regressor used by the iterative imputer to model each column from the others.
# NOTE(review): newer scikit-learn releases rename BayesianRidge's `n_iter`
# to `max_iter`; confirm against the installed version before upgrading.
brm = BayesianRidge(n_iter=1000)

# sample_posterior=True draws every imputed value from the regressor's
# predictive distribution, which makes the imputer stochastic. Fixing
# random_state makes a re-run reproduce the same imputed table.
imputer = IterativeImputer(estimator=brm, sample_posterior=True, max_iter=25,
                           random_state=0)

In [77]:
# Impute on a copy so `data` keeps its original missing values.
# No separate imputer.fit() here: the fit_transform call in the next cell fits
# the imputer itself, so fitting first would only repeat the expensive
# iterative procedure and throw the result away.
# NOTE(review): fulldata still contains 'icustay_id' and 'mort_hosp', so the
# stay identifier and the outcome both act as predictors during imputation,
# and the imputer sees all rows before any train/test split. Confirm this is
# intended.
fulldata = data.copy()

In [78]:
# Fit the imputer on fulldata and replace fulldata with the imputed values.
# The result is re-wrapped in a DataFrame (column names and index restored
# from `data`) in the next cell.
fulldata = imputer.fit_transform(fulldata)

In [79]:
# Re-wrap the imputed values in a DataFrame that carries the same column
# labels and row index as `data`.
imputed_df = pd.DataFrame(fulldata, columns=data.columns, index=data.index)

In [80]:
# Preview the imputed table (every column is float after imputation).
imputed_df.head()

Unnamed: 0,icustay_id,age,gender,comobidities,ckd,cih,arf,diabetes,cardiop_disease,liver_disease,...,calcium_1st,freecalcium_1st,heartrate,resprate,spo2,tempc,sysbp,diasbp,meanbp,mort_hosp
0,200035.0,30.6612,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,7.9,1.110914,118.0,33.0,92.0,36.277778,126.0,74.0,83.0,1.0
1,200153.0,51.5414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.2,1.065907,97.0,19.0,100.0,36.888889,87.0,46.0,56.0,0.0
2,200206.0,58.7653,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,8.0,0.933629,117.0,28.0,100.0,36.222221,86.0,36.0,52.666698,1.0
3,200550.0,79.5328,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.3,0.906488,96.0,23.0,97.0,36.277778,130.0,75.0,90.0,0.0
4,200608.0,71.1797,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,8.3,0.865719,97.0,22.0,97.0,38.555556,138.0,55.0,76.0,0.0


In [81]:
# Row/column count check: should match `data` (40 selected columns).
imputed_df.shape

(1315, 40)

In [153]:
# Predictor columns: everything in the imputed table except the outcome
# 'mort_hosp'. 'icustay_id' is kept because the antibiotic-use lookup further
# down reads it from each row.
features = ['icustay_id',
            'age', 'gender',
            'comobidities',
            'ckd', 'cih', 'arf', 'diabetes',
            'cardiop_disease', 'liver_disease', 'ibd', 'malignancy',
            'aniongap', 'albumin', 'bicarbonate', 'bilirubin',
            'creatinine', 'chloride', 'glucose_lab', 'hematocrit', 'hemoglobin',
            'lactate', 'platelet', 'potassium', 'ptt', 'inr', 'pt',
            'sodium', 'bun', 'wbc', 'calcium_1st', 'freecalcium_1st', 'heartrate',
            'resprate', 'spo2', 'tempc', 'sysbp', 'diasbp', 'meanbp']

# Take an explicit copy: later cells add the 'CARDS' and 'Used_abx' columns to
# X, and assigning into a bare slice of imputed_df is the chained-assignment
# pattern pandas warns about (hidden here by the global warning filter).
X = imputed_df[features].copy()

# Outcome column used as the target for feature selection and the splits.
y = imputed_df.mort_hosp

In [154]:
# Expect one column fewer than imputed_df (the outcome was left out).
X.shape

(1315, 39)

In [155]:
# Preview the predictor matrix.
X.head()

Unnamed: 0,icustay_id,age,gender,comobidities,ckd,cih,arf,diabetes,cardiop_disease,liver_disease,...,wbc,calcium_1st,freecalcium_1st,heartrate,resprate,spo2,tempc,sysbp,diasbp,meanbp
0,200035.0,30.6612,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,30.7,7.9,1.110914,118.0,33.0,92.0,36.277778,126.0,74.0,83.0
1,200153.0,51.5414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.6,8.2,1.065907,97.0,19.0,100.0,36.888889,87.0,46.0,56.0
2,200206.0,58.7653,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,13.2,8.0,0.933629,117.0,28.0,100.0,36.222221,86.0,36.0,52.666698
3,200550.0,79.5328,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.2,8.3,0.906488,96.0,23.0,97.0,36.277778,130.0,75.0,90.0
4,200608.0,71.1797,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,17.7,8.3,0.865719,97.0,22.0,97.0,38.555556,138.0,55.0,76.0


# CARDS

In [156]:
def cal_cards(row):
    """Compute the CARDS score for one patient row.

    The score starts from a baseline of 5.0, adds an age-band contribution and
    then adds or subtracts fixed points for each comorbidity flag equal to 1.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or a dict)
        Must provide the keys 'age', 'arf', 'diabetes', 'cardiop_disease',
        'liver_disease', 'ibd' and 'malignancy'.

    Returns
    -------
    float
        Baseline plus age points plus comorbidity points.

    Raises
    ------
    ValueError
        If 'age' is NaN, since no age band can be chosen.

    Notes
    -----
    Age bands: below 41 -> 0, 41 up to 61 -> 2, 61 up to 81 -> 3, and 81 or
    above -> 4. The lowest and highest bands are open-ended, so ages below 18
    or at/above 101 get the nearest band instead of failing with an unbound
    `age_score`, which is what happened with closed bands.
    NOTE(review): confirm that clamping out-of-range ages is the desired
    clinical behaviour for this cohort.
    """
    age = row['age']

    # NaN compares False against every threshold and would silently fall into
    # the top band below, so reject it explicitly.
    if age != age:
        raise ValueError("cal_cards: 'age' is missing (NaN)")

    if age < 41:
        age_score = 0
    elif age < 61:
        age_score = 2
    elif age < 81:
        age_score = 3
    else:
        age_score = 4

    # (column, points awarded when the flag equals 1); diabetes lowers the score.
    flag_points = (
        ('arf', 3),
        ('diabetes', -1),
        ('cardiop_disease', 1),
        ('liver_disease', 2),
        ('ibd', 2),
        ('malignancy', 2),
    )
    comorbidity_score = sum(points for column, points in flag_points
                            if row[column] == 1)

    baseline = 5.0
    return baseline + age_score + comorbidity_score

In [157]:
# Score every patient row and store the result as a new 'CARDS' column.
X["CARDS"] = X.apply(cal_cards, axis="columns")

# ATLAS

In [158]:
# Per-stay table (presumably antibiotic use) whose 'icustay_id' and 'Used_abx'
# columns are read by the lookup function below.
# NOTE(review): absolute path into one user's home directory; the notebook
# will not run elsewhere without editing it.
abx_use = pd.read_csv("/home/duhao/datathon2019/abx_use.csv", index_col=0)

In [159]:
def assign_abx(row, abx_use):
    """Look up the 'Used_abx' value for the ICU stay in `row`.

    Parameters
    ----------
    row : mapping with an 'icustay_id' entry (e.g. one row of X).
    abx_use : DataFrame with 'icustay_id' and 'Used_abx' columns.

    Returns
    -------
    The 'Used_abx' value of the first abx_use record whose 'icustay_id' equals
    the row's, or 0 when no record matches.
    """
    stay_id = row['icustay_id']
    # One equality mask answers both "is this stay present?" and "which value?".
    matching_flags = abx_use[abx_use['icustay_id'] == stay_id]['Used_abx'].values
    if len(matching_flags) == 0:
        return 0
    return matching_flags[0]

In [160]:
# Attach the per-stay 'Used_abx' flag (0 when the stay is absent from abx_use).
X['Used_abx'] = X.apply(assign_abx, axis=1, args=(abx_use,))

In [162]:
# Preview X with the two derived columns ('CARDS', 'Used_abx') appended.
X.head()

Unnamed: 0,icustay_id,age,gender,comobidities,ckd,cih,arf,diabetes,cardiop_disease,liver_disease,...,freecalcium_1st,heartrate,resprate,spo2,tempc,sysbp,diasbp,meanbp,CARDS,Used_abx
0,200035.0,30.6612,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.110914,118.0,33.0,92.0,36.277778,126.0,74.0,83.0,4.0,1
1,200153.0,51.5414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.065907,97.0,19.0,100.0,36.888889,87.0,46.0,56.0,7.0,1
2,200206.0,58.7653,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.933629,117.0,28.0,100.0,36.222221,86.0,36.0,52.666698,10.0,1
3,200550.0,79.5328,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.906488,96.0,23.0,97.0,36.277778,130.0,75.0,90.0,8.0,1
4,200608.0,71.1797,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.865719,97.0,22.0,97.0,38.555556,138.0,55.0,76.0,10.0,1


1055

# Feature selection

In [137]:
# Candidate columns offered to the feature selector. The individual
# comorbidity flags ('ckd', 'cih', 'arf', 'diabetes', 'cardiop_disease',
# 'liver_disease', 'ibd', 'malignancy') are deliberately left out here; only
# the aggregate 'comobidities' column is offered. 'icustay_id', 'CARDS' and
# 'Used_abx' are not candidates either.
feats = [
    'age',
    'gender',
    'comobidities',
    'aniongap', 'albumin', 'bicarbonate', 'bilirubin',
    'creatinine', 'chloride', 'glucose_lab',
    'hematocrit', 'hemoglobin',
    'lactate', 'platelet', 'potassium',
    'ptt', 'inr', 'pt',
    'sodium', 'bun', 'wbc',
    'calcium_1st', 'freecalcium_1st',
    'heartrate', 'resprate', 'spo2', 'tempc',
    'sysbp', 'diasbp', 'meanbp',
]

In [140]:
# Fit the importance model that SelectFromModel consumes in the next cells:
# 250 extremely-randomised trees with a fixed seed.
# NOTE(review): fitted on all rows of X, i.e. before the train/test split.
select_algo = ExtraTreesClassifier(
    n_estimators=250,
    random_state=0,
).fit(X[feats], y)
# Alternative selectors that were tried (LinearSVC is not imported in this
# notebook and would need `from sklearn.svm import LinearSVC`):
# select_algo = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X[feats], y)
# select_algo = RandomForestClassifier(n_estimators=250, random_state=0).fit(X[feats], y)
# select_algo = LassoCV().fit(X[feats], y)

In [141]:
# Number of candidate features before selection.
X[feats].shape

(1315, 30)

In [142]:
# Wrap the already-fitted estimator (prefit=True, so nothing is refitted).
# The threshold string '0.5*mean' keeps features whose importance is at least
# half of the mean importance.
model = SelectFromModel(select_algo, prefit=True, threshold='0.5*mean')

In [143]:
# Reduce the candidate matrix to the selected columns.
# NOTE(review): fullX_new is not referenced again in the visible cells.
fullX_new = model.transform(X[feats])

In [144]:
# Number of features that survived the importance threshold.
fullX_new.shape

(1315, 29)

In [145]:
# Boolean mask aligned with `feats`: True for every feature that was kept.
feature_idx = model.get_support()

In [146]:
# Translate the mask back into column names.
selected_features = X[feats].columns[feature_idx]

In [147]:
# Show the retained feature names.
selected_features

Index(['age', 'gender', 'aniongap', 'albumin', 'bicarbonate', 'bilirubin',
       'creatinine', 'chloride', 'glucose_lab', 'hematocrit', 'hemoglobin',
       'lactate', 'platelet', 'potassium', 'ptt', 'inr', 'pt', 'sodium', 'bun',
       'wbc', 'calcium_1st', 'freecalcium_1st', 'heartrate', 'resprate',
       'spo2', 'tempc', 'sysbp', 'diasbp', 'meanbp'],
      dtype='object')

In [148]:
# Write 100 independent 80/20 train/test splits to cdiff_dir, one per seed
# (seed r_seed -> fullX_train_<r_seed>.csv, fully_train_<r_seed>.csv,
# fullX_test_<r_seed>.csv, fully_test_<r_seed>.csv). Row indices are not
# written (index=False).
# NOTE(review): the splits are taken from the full X (including 'icustay_id',
# 'CARDS' and 'Used_abx'), not from `selected_features`. Confirm that the
# feature selection above is meant to be applied downstream.

# Make sure the output directory exists, so the loop cannot fail on a missing
# directory.
os.makedirs(cdiff_dir, exist_ok=True)

for r_seed in range(100):
    fullX_train, fullX_test, fully_train, fully_test = train_test_split(
        X, y, test_size=0.2, random_state=r_seed)

    # One uniform naming scheme for all four files; the stray "./" prefixes
    # that some of the file names carried are dropped (same target files).
    split_parts = (
        ("fullX_train", fullX_train),
        ("fully_train", fully_train),
        ("fullX_test", fullX_test),
        ("fully_test", fully_test),
    )
    for part_name, part in split_parts:
        out_path = os.path.join(cdiff_dir, "{}_{}.csv".format(part_name, r_seed))
        part.to_csv(out_path, index=False)