In [None]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
from IPython.display import display, HTML

## Data Preprocessing for Modeling

In [None]:
# Read the cleaned cohort extract into the working frame and preview it.
cohort_csv = "./data/vent_trach_cohort_clean25May2020.csv"
df = pd.read_csv(cohort_csv)

df.head()

In [None]:
# Drop identifier/descriptor columns that are not model inputs.
# ICUSTAY_ID is deliberately kept here: it is written out with the split CSVs
# and only removed from the feature matrices afterwards.
df = df.drop(columns=['SUBJECT_ID', 'HADM_ID', 'ETHNICITY', 'COHORT'])

# Encode both outcome columns as integer class labels.
le = LabelEncoder()
df['MORTALITY_3MO_DISCH'] = le.fit_transform(df['MORTALITY_3MO_DISCH'].tolist())
df['MORTALITY_1YR_DISCH'] = le.fit_transform(df['MORTALITY_1YR_DISCH'].tolist())

# Snapshot for the ProVent logistic regression, taken BEFORE one-hot encoding
# and scaling. It must be an independent copy (a plain `tmp = df` is only an
# alias, so later in-place scaling of df would change it too), and it must
# still hold the raw FIRST_CAREUNIT column that the ProVent cells read.
tmp = df.copy()

# One-hot encode the categorical columns; each original column is replaced by
# indicator columns prefixed with its own name.
one_hot_columns = ['ADMISSION_TYPE', 'ADMISSION_LOCATION', 'FIRST_CAREUNIT', 'GENDER']
df = pd.get_dummies(df, prefix=one_hot_columns, columns=one_hot_columns)

In [None]:
# Feature scaling.
#
# Columns are handled in groups; each group is standardised with
# preprocessing.scale or rescaled with MinMaxScaler, one column at a time.
# NOTE(review): all statistics here are computed on the full data set, i.e.
# before the train/test split further down -- confirm this is intended.


def _columns(prefix, measures, stats):
    """Return every ``<prefix>_<measure>_<stat>`` column name, measure-major."""
    return ["_".join((prefix, measure, stat)) for measure in measures for stat in stats]


MIN_MAX = ("MIN", "MAX")
MIN_MAX_MEAN = ("MIN", "MAX", "MEAN")

# Measurements that get standard scaling.
standard_labs = (
    "SODIUM", "PLATELET", "POTASSIUM", "CHLORIDE", "GLUCOSE",
    "HEMATOCRIT", "HEMOGLOBIN", "BICARBONATE", "ANIONGAP", "ALBUMIN",
)
standard_vitals = (
    "GLUCOSE", "RESPRATE", "TEMPC", "HEARTRATE", "SYSBP", "DIASBP", "MEANBP",
)
# Lab measurements that get min-max scaling.
min_max_labs = (
    "BANDS", "BILIRUBIN", "CREATININE", "LACTATE", "PTT", "INR", "PT", "BUN", "WBC",
)

# --- Standard scaling -------------------------------------------------------
standard_groups = [
    # General: scores
    ["ICU_D1_APSIII", "ICU_D1_OASIS", "ICU_D1_SOFA", "ICU_D1_LODS", "ICU_D1_SAPSII"],
    # General: misc
    ["HEIGHT_AVG", "ADMISSION_AGE"],
    # D1: lab values
    _columns("LMVD1", standard_labs, MIN_MAX),
    _columns("LD1", standard_labs, MIN_MAX),
    # D1: vital signs
    _columns("VMVD1", standard_vitals, MIN_MAX_MEAN),
    _columns("VD1", standard_vitals, MIN_MAX_MEAN),
    # D1: misc
    ["WEIGHTD1_WEIGHT_AVG", "WEIGHTMVD1_WEIGHT_AVG", "BMID1", "BMIMVD1"],
    # D7: lab values
    _columns("LMVD7", standard_labs, MIN_MAX),
    # D7: vital signs
    _columns("VMVD7", standard_vitals, MIN_MAX_MEAN),
    # D7: misc
    ["WEIGHTMVD7_WEIGHT_AVG", "BMIMVD7"],
]
for group_columns in standard_groups:
    df[group_columns] = preprocessing.scale(df[group_columns])

# --- Min-max scaling --------------------------------------------------------
# One scaler object is re-fit for every group, so after this cell it holds the
# fit of the last group only.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_groups = [
    # Vital signs, day 1
    _columns("VD1", ("SPO2",), MIN_MAX_MEAN) + _columns("VMVD1", ("SPO2",), MIN_MAX_MEAN),
    # Vital signs, day 7
    _columns("VMVD7", ("SPO2",), MIN_MAX_MEAN),
    # Lab values, day 1
    _columns("LD1", min_max_labs, MIN_MAX),
    _columns("LMVD1", min_max_labs, MIN_MAX),
    # Lab values, day 7
    _columns("LMVD7", min_max_labs, MIN_MAX),
]
for group_columns in min_max_groups:
    df[group_columns] = min_max_scaler.fit_transform(df[group_columns])

In [None]:
# Preview the first rows of the working frame at this point in the notebook.
df.head()

In [None]:
# Outcome vectors: 3-month and 1-year mortality after discharge.
y_3mo = df['MORTALITY_3MO_DISCH']
y_1yr = df['MORTALITY_1YR_DISCH']

# Feature matrix: every remaining column except the two outcomes.
X = df.drop(columns=['MORTALITY_3MO_DISCH', 'MORTALITY_1YR_DISCH'])

# A bare expression is only rendered when it is the last line of a cell, so
# the preview has to be displayed explicitly here.
display(X.head())

# Class balance of each outcome.
print(df['MORTALITY_3MO_DISCH'].value_counts())
print(df['MORTALITY_1YR_DISCH'].value_counts())

## Training/Testing Split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 shuffle split, stratified on the 1-year outcome.
# The splitter generates four splits, but only the LAST one is kept: the
# train/test variables below are taken from the final (train, test) pair.
# NOTE(review): the first three splits are computed and discarded -- confirm
# whether n_splits=4 is intentional before changing it, since a different
# value would select a different split.
sss = StratifiedShuffleSplit(n_splits=4, test_size=0.2, random_state=1)
train_index, test_index = list(sss.split(X, y_1yr))[-1]

# Model features.
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

# Outcomes, same rows.
y_3mo_train = y_3mo.iloc[train_index]
y_3mo_test = y_3mo.iloc[test_index]
y_1yr_train = y_1yr.iloc[train_index]
y_1yr_test = y_1yr.iloc[test_index]

# ProVent frame, same rows, so both models share one split.
x_pv_train = tmp.iloc[train_index]
x_pv_test = tmp.iloc[test_index]

### Save Resulting Splits

In [None]:
# Guard against re-running this cell: once ICUSTAY_ID has been dropped below,
# a second run would overwrite the CSVs with id-less frames before failing.
# Fail up front instead, before anything is written.
frames_without_id = [
    frame_name
    for frame_name, frame in (('X_train', X_train), ('X_test', X_test))
    if 'ICUSTAY_ID' not in frame.columns
]
if frames_without_id:
    raise KeyError(
        "ICUSTAY_ID is missing from " + ", ".join(frames_without_id)
        + "; re-run the train/test split cell before saving."
    )

# Save the feature splits as CSV while they still carry ICUSTAY_ID.
X_train.to_csv('./data/X_train.csv')
X_test.to_csv('./data/X_test.csv')

# Now that the ids are saved, drop ICUSTAY_ID from the feature matrices.
X_train = X_train.drop(columns=['ICUSTAY_ID'])
X_test = X_test.drop(columns=['ICUSTAY_ID'])

# Save features and both outcome vectors as .npy arrays.
np.save("./data/X_train_s.npy", X_train)
np.save("./data/X_test_s.npy", X_test)

np.save("./data/y_3mo_train_s.npy", y_3mo_train)
np.save("./data/y_3mo_test_s.npy", y_3mo_test)

np.save("./data/y_1yr_train_s.npy", y_1yr_train)
np.save("./data/y_1yr_test_s.npy", y_1yr_test)

## Data Preprocessing for ProVent Logistic Regression

In [None]:
## Remember to use the same split as the neural network!
## x_pv_train, x_pv_test
#
# Day 7 variables.
# The ProVent frame is built from x_pv_train in the next cell rather than by
# re-reading "./data/vent_trach_cohort_clean25May2020.csv", so that both
# models use the same rows. (An unfinished `df = ` statement that made this
# cell a SyntaxError has been removed.)

In [None]:
## ProVent Variables
# Predictors plus both outcome columns, taken from the training part of the
# shared split. .copy() makes `df` an independent frame, so the column
# assignments that follow modify `df` itself instead of a slice of
# x_pv_train (avoids pandas' SettingWithCopyWarning).
df = x_pv_train[['VPMVD7_VP', 'LMVD7_PLATELET_MIN', 'RRTMVD7_RRT', 'ADMISSION_AGE', 'FIRST_CAREUNIT', 'MORTALITY_3MO_DISCH', 'MORTALITY_1YR_DISCH']].copy()

In [None]:
# LMVD7_PLATELET_MIN and ADMISSION_AGE are not rescaled in this cell (the
# scaling step is disabled).

# Turn FIRST_CAREUNIT into a 0/1 flag: 1 for every unit other than 'TSICU'.
df['FIRST_CAREUNIT'] = df['FIRST_CAREUNIT'].ne('TSICU').astype(int)

# Make sure both outcome columns are plain integers.
for outcome in ('MORTALITY_3MO_DISCH', 'MORTALITY_1YR_DISCH'):
    df[outcome] = df[outcome].astype(int)

df.head()

In [None]:
# Target for the ProVent model: 1-year mortality.
# (Use df['MORTALITY_3MO_DISCH'] instead for the 3-month outcome.)
y = df['MORTALITY_1YR_DISCH']

# Predictors: everything except the two outcome columns.
X = df.drop(columns=['MORTALITY_3MO_DISCH', 'MORTALITY_1YR_DISCH'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_predict

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display

# Logistic regression with default settings, evaluated with 10-fold
# cross-validation on the ProVent predictors.
logreg = LogisticRegression()

# Cross-validated predictions for every row: first the predicted class
# labels, then the per-class probabilities (one column per class).
predicted = cross_val_predict(logreg, X, y, cv=10)
predicted_proba = cross_val_predict(logreg, X, y, method='predict_proba', cv=10)

# If only the second class's probability is needed, take
# predicted_proba[:, 1]; it can be wrapped in a DataFrame (e.g. a 'pred_Y'
# column) and written to CSV alongside y for later comparison.