# Project 7 - Length of Stay Model

**Author: Linh Nguyen/Sam Holt**<br>

**StudentID: 23161711/23087175**<br>

**Date: Oct 2022**

## 1. Load data and import packages

In [4]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder; later cells read their CSV inputs with relative paths.
%cd /content/drive/MyDrive/Capstone/

Mounted at /content/drive
/content/drive/MyDrive/Capstone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Capstone


In [None]:
! pip install -U tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow
  Downloading tensorflow-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.0 MB)
[K     |████████▋                       | 155.2 MB 88.0 MB/s eta 0:00:05

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
import seaborn as sns
from matplotlib import pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,ConfusionMatrixDisplay,classification_report, roc_auc_score
# tf.compat.v1.disable_v2_behavior()


In [None]:
keras.__version__

In [None]:
! ls

In [None]:
import os
os.listdir()

# 2. Data cleansing


In [None]:
# Load the ED stays and collapse `disposition` into a coarse outcome label.
edstays = pd.read_csv('edstays_updated.csv')

# TRANSFER counts as an admission; every other non-HOME/ADMITTED disposition is
# tagged MISSING so the filter below removes it.
disposition_recode = {
    'TRANSFER': 'ADMITTED',
    'ELOPED': 'MISSING',
    'LEFT WITHOUT BEING SEEN': 'MISSING',
    'OTHER': 'MISSING',
    'LEFT AGAINST MEDICAL ADVICE': 'MISSING',
    'EXPIRED': 'MISSING',
}
edstays['y_var_adm_text'] = edstays['disposition'].replace(disposition_recode)
edstays['intime_h'] = pd.to_datetime(edstays['intime']).dt.hour

usable_rows = edstays['y_var_adm_text'].ne('MISSING')
edstays_admit = edstays.loc[usable_rows].reset_index(drop=True)

# Composite "<subject_id>_<stay_id>" key.
subject_txt = edstays_admit['subject_id'].astype(str)
stay_txt = edstays_admit['stay_id'].astype(str)
edstays_admit['key'] = subject_txt + '_' + stay_txt
edstays_admit.columns

In [None]:
# edstays_admit.loc[edstays_admit['race'].str.contains('WHITE'), 'race_color'] = 'WHITE'
# edstays_admit.loc[edstays_admit['race'].str.contains('BLACK'), 'race_color'] = 'BLACK'
# edstays_admit.loc[edstays_admit['race'].str.contains('ASIAN'), 'race_color'] = 'ASIAN'
# edstays_admit.loc[edstays_admit['race'].str.contains('LATINO'), 'race_color'] = 'HISPANIC/LATINO'
# edstays_admit.loc[edstays_admit['race'].str.contains('HISPANIC'), 'race_color'] = 'HISPANIC/LATINO'
# edstays_admit.loc[edstays_admit['race_color'].isna(), 'race_color'] = 'OTHER'

In [None]:
# Parse arrival/departure timestamps and derive the ED stay duration (a timedelta).
timestamp_format = '%Y-%m-%d %H:%M:%S'
for raw_col, parsed_col in (('intime', 'DateTime_in'), ('outtime', 'DateTime_out')):
    edstays_admit[parsed_col] = pd.to_datetime(edstays_admit[raw_col], format=timestamp_format)
edstays_admit['stay'] = edstays_admit['DateTime_out'].sub(edstays_admit['DateTime_in'])

In [None]:
# Visits in chronological order per patient: cumcount() is then the number of
# earlier ED visits by the same subject.
edstays_admit_1 = edstays_admit.sort_values(by=['subject_id', 'intime'], ascending=True).reset_index(drop=True)
edstays_admit_1['prior_visits'] = edstays_admit_1.groupby(['subject_id']).cumcount()
edstays_admit_1['in_date'] = pd.to_datetime(edstays_admit_1['in_date'])

# Rank arrivals within each calendar day. Sorting on `in_date` alone left rows that
# share a date in an arbitrary order (the default sort is not stable), so the running
# count did not follow arrival time. `intime` is an ISO-style timestamp string
# ('%Y-%m-%d %H:%M:%S'), so sorting it as text is chronological.
edstays_admit_1 = edstays_admit_1.sort_values(by=['in_date', 'intime'], ascending=True, kind='stable')
edstays_admit_1['daily_patients'] = edstays_admit_1.groupby(['in_date']).cumcount() + 1
edstays_admit_1[['subject_id','stay_id','in_date','prior_visits', 'daily_patients']].head(100)

In [None]:
# Outcome of each patient's previous ED visit ('NO HISTORY' for a first visit).
# Rows must be in (subject_id, intime) order with a fresh 0..n-1 index so that
# "previous row" means "previous visit of the same patient".
edstays_admit_1 = edstays_admit_1.sort_values(by = ['subject_id','intime'], ascending=True).reset_index(drop=True)

# Vectorised replacement for the former row-by-row loop (two label lookups per row):
# compare each subject_id with the one on the row above and carry that row's outcome.
subject_ids = edstays_admit_1['subject_id']
is_repeat_visit = subject_ids.eq(subject_ids.shift())
previous_outcome = edstays_admit_1['y_var_adm_text'].shift()

# Kept as a {row position: status} dict because the merge cell below builds a
# DataFrame from it with orient='index'.
edstay_hist = dict(enumerate(previous_outcome.where(is_repeat_visit, 'NO HISTORY')))

In [None]:
# Cumulative length of all earlier ED stays of the same patient, keyed by row position.
# Relies on edstays_admit_1 being sorted by (subject_id, intime) with a 0..n-1 index.
subject_col = edstays_admit_1['subject_id']
stay_col = edstays_admit_1['stay']
no_prior_stay = np.timedelta64(0, 'D')

edstay_laststay = {}
for row in range(len(edstays_admit_1)):
    same_patient = row > 0 and subject_col[row] == subject_col[row - 1]
    if same_patient:
        # previous visit's duration + everything accumulated before it
        edstay_laststay[row] = stay_col[row - 1] + edstay_laststay[row - 1]
    else:
        edstay_laststay[row] = no_prior_stay

In [None]:
# Attach the previous-visit outcome as a column, aligned on row position.
df1 = pd.DataFrame.from_dict(edstay_hist, orient='index', columns=['historical_stay_status'])
edstays_admit_1 = edstays_admit_1.merge(df1, how='left', left_index=True, right_index=True)
edstays_admit_1.columns

In [None]:
# Attach the cumulative prior stay length as a column, aligned on row position.
df1 = pd.DataFrame.from_dict(edstay_laststay, orient='index', columns=['historical_stay_length'])
edstays_admit_1b = edstays_admit_1.merge(df1, how='left', left_index=True, right_index=True)
edstays_admit_1b.columns

In [None]:
# Express the cumulative prior stay length as a (fractional) number of days.
one_day = np.timedelta64(1, 'D')
edstays_admit_1b['historical_stay_length_in_day'] = edstays_admit_1b['historical_stay_length'].div(one_day)
edstays_admit_1b.columns

In [None]:
# edstays_admit_2 = pd.merge(edstays_admit_1,edstays_admit_1b[['historical_stay_length_in_day','subject_id','stay_id']],how = 'left',on=['subject_id', 'stay_id'])
edstays_admit_2 = edstays_admit_1b.copy()

In [None]:
edstays_admit_2.stay

In [None]:
# Triage table; the rest of this cell rescales its temperature, heartrate and pain columns.
triage = pd.read_csv('triage.csv')

def rescale_temp(x):
    """Return `x / 10` when `x` is above 200, otherwise `x` unchanged.

    NaN fails the `> 200` comparison and is therefore returned as-is.
    """
    return x / 10 if x > 200 else x

triage['temperature'] = triage['temperature'].map(rescale_temp)

def rescale_heart(x):
    """Return `x / 100` when `x` is above 300, otherwise `x` unchanged.

    NOTE(review): the divisor is 100 (not 10 as for temperature) - confirm that is intended.
    """
    return x / 100 if x > 300 else x
    
triage['heartrate'] = triage['heartrate'].map(rescale_heart)

def rescale_pain(x):
    """Coerce a raw pain entry onto a 0-10 scale.

    Parameters
    ----------
    x : any value accepted by `int()`; anything else is treated as missing.

    Returns
    -------
    int, float or None
        * 10 when the integer value is above 100
        * value / 10 when it is above 10 (up to 100)
        * 0 when it is negative
        * the integer itself when it is already within 0-10
        * None when `x` cannot be converted to an integer (free text, None, NaN, inf)
    """
    try:
        score = int(x)
    # Only conversion failures mean "no usable score". The former bare `except:`
    # also swallowed KeyboardInterrupt/SystemExit, so the cell could not be interrupted.
    except (TypeError, ValueError, OverflowError):
        return None

    if score > 100:
        return 10
    if score > 10:
        return score / 10
    if score < 0:
        return 0
    return score
    
# Rescale every entry, then force a numeric column (None becomes NaN).
triage['pain'] = pd.to_numeric(triage['pain'].map(rescale_pain))

In [None]:
edstay_ad = pd.merge(edstays_admit_2,triage,how = 'left',on=['subject_id', 'stay_id'])

In [None]:
# Keep only stays with a known ADMITTED/HOME outcome. `.copy()` makes the result an
# independent frame, so the column assignments in later cells do not write into a
# slice of the merged table (SettingWithCopyWarning).
edstay_ad = edstay_ad[edstay_ad['y_var_adm_text'].isin(['ADMITTED','HOME'])].copy()

In [None]:
# Binary admission target: 1 = ADMITTED, 0 = everything else.
edstay_ad['y_var'] = edstay_ad['y_var_adm_text'].eq('ADMITTED').astype('int64')

In [None]:
edstay_ad.columns

In [None]:
edstay_ad

In [None]:
# Tabular feature frame for the admission model. subject_id/stay_id are kept so the
# rows can later be re-ordered against triage_3 (which carries the labels); the target
# itself (y_var), raw timestamps, free text and bookkeeping columns are removed.
admission_drop_cols = [
    'Unnamed: 0', 'hadm_id', 'intime', 'intime_h', 'outtime', 'race', 'disposition',
    'y_var_adm_text', 'key', 'DateTime_in', 'DateTime_out', 'stay',
    'historical_stay_length', 'chiefcomplaint', 'y_var',
    'subjects_entering', 'subjects_leaving', 'in_date', 'out_date',
]
edstay_ad1 = edstay_ad.drop(columns=admission_drop_cols)

In [None]:
edstay_ad1.info()

In [None]:
# NOTE: this is an alias, not a copy - the flag columns also appear on edstay_ad1.
updated_edstay_ad = edstay_ad1

# One 0/1 indicator per triage measurement: 1 where the value is missing.
for vital in ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'acuity']:
    updated_edstay_ad[vital + 'missing'] = updated_edstay_ad[vital].isnull().astype(int)

In [None]:
# Column names by dtype: float64/int64 columns are treated as numeric features,
# object columns as categorical ones.
num_attr = edstay_ad1.select_dtypes(include=['float64', 'int64']).columns
cat_attr = edstay_ad1.select_dtypes(include=['O']).columns
#bool_attr = edstay_ad.select_dtypes(include=['bool']).columns

In [None]:
# Replace missing numeric values with the column median.
# NOTE(review): the imputer is fitted on every row, before any train/test split.
my_imputer = SimpleImputer(strategy = 'median')
updated_edstay_ad[num_attr] = my_imputer.fit_transform(updated_edstay_ad[num_attr])
updated_edstay_ad.info()

In [None]:
# Standardise numeric columns, one-hot encode categorical ones, pass the rest through.
# `trans_pip` is redefined twice further down; the last definition is the one that is fitted.
trans_pip = ColumnTransformer([
    ("num",StandardScaler(), num_attr),
    ("cat",OneHotEncoder(),cat_attr)
],remainder='passthrough')

In [None]:
# Keys, admission label and free-text complaint. `.copy()` because the following
# cells assign into this frame; without it those writes target a slice of edstay_ad
# and pandas raises SettingWithCopyWarning.
triage_2 = edstay_ad[['subject_id','stay_id', 'y_var','chiefcomplaint']].copy()

In [None]:
triage_2[triage_2.y_var.isna()]

In [None]:
triage_2[triage_2.chiefcomplaint.isna()]

In [None]:
# Missing complaints become empty strings so the text pipeline only sees str values.
triage_2['chiefcomplaint'] = triage_2['chiefcomplaint'].fillna('')

In [None]:
triage_2[triage_2['chiefcomplaint'].isna()]

In [None]:
# Normalise the free-text chief complaint before vectorisation.
def preprocess_text(text):
    """Lower-case `text`, pad . , ! ? with spaces and squash every other
    non-letter run into a single space.

    Values whose type is exactly `float` (e.g. NaN) are returned untouched.
    """
    if type(text) is float:
        return text
    padded = re.sub(r"([.,!?])", r" \1 ", text.lower())
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)
    
triage_2['chiefcomplaint'] = triage_2['chiefcomplaint'].map(preprocess_text)
triage_2['chiefcomplaint']

In [None]:
# Add each stay's previous ICD codes; stays without a match keep NaN (left join).
df_pre_icd_full = pd.read_csv('previous_icd_code.csv')
triage_3 = triage_2.merge(df_pre_icd_full, how="left", on=['subject_id', 'stay_id'])

In [None]:
triage_3.head(10)

# 2. Text data overview

In [None]:
# 2. Split the dataset
# Re-order the tabular features so row k matches row k of triage_3 (text + labels),
# then rebuild the column transformer for the re-ordered frame.
id_cols = ['subject_id', 'stay_id']
spine = triage_3[id_cols]
ordered_edstays = (
    spine
    .merge(updated_edstay_ad, on=id_cols, how='left')
    .drop(columns=id_cols)
)

num_attr = ordered_edstays.select_dtypes(include=['float64', 'int64']).columns
cat_attr = ordered_edstays.select_dtypes(include=['O']).columns
trans_pip = ColumnTransformer(
    [
        ("num", StandardScaler(), num_attr),
        ("cat", OneHotEncoder(), cat_attr),
    ],
    remainder='passthrough',
)

ordered_edstays.info()

In [None]:
# Seeded shuffle of the row positions, cut into test (first 20%), validation
# (next 10%) and train (remaining 70%).
np.random.seed(123)
n_rows = len(triage_3)
ind = np.arange(n_rows)
np.random.shuffle(ind)

test_stop = int(n_rows * 0.20)
val_stop = int(n_rows * 0.30)
test_index = ind[:test_stop]
val_index = ind[test_stop:val_stop]
train_index = ind[val_stop:]

In [None]:
# Admission-model split: slice text, tabular features and labels with the shuffled indices.
chief_text = triage_3['chiefcomplaint']
icd_history = triage_3['pre_icd_code']
admit_label = triage_3['y_var']

# free text: current chief complaint and previous ICD codes
train_samples_chief = chief_text.loc[train_index]
val_samples_chief = chief_text.loc[val_index]
test_samples_chief = chief_text.loc[test_index]
train_samples_his = icd_history.loc[train_index]
val_samples_his = icd_history.loc[val_index]
test_samples_his = icd_history.loc[test_index]

# tabular data - admission rates
train_samples_o, val_samples_o, test_samples_o = (
    ordered_edstays.iloc[rows] for rows in (train_index, val_index, test_index)
)

# truth variables
train_labels = admit_label.loc[train_index]
val_labels = admit_label.loc[val_index]
test_labels = admit_label.loc[test_index]

In [None]:
# Wide frame for the length-of-stay task: drop bookkeeping columns but keep
# disposition / in_date / out_date, which the target construction below needs.
los_drop_cols = ['Unnamed: 0', 'hadm_id', 'intime', 'intime_h', 'outtime', 'race',
                 'y_var_adm_text', 'key', 'DateTime_in', 'DateTime_out',
                 'historical_stay_length', 'chiefcomplaint',
                 'subjects_entering', 'subjects_leaving']
edstay_ad2 = edstay_ad.drop(columns=los_drop_cols)
from datetime import datetime

# whole days between in_date and out_date
edstay_ad2['in_date'] = pd.to_datetime(edstay_ad2['in_date'])
edstay_ad2['out_date'] = pd.to_datetime(edstay_ad2['out_date'])
diff = edstay_ad2['out_date'] - edstay_ad2['in_date']
edstay_ad2['nights'] = diff.dt.days

# Map a row's `nights` value onto the three length-of-stay classes.
def reclass_nights(x):
  """Return the length-of-stay class for a row-like `x` with a 'nights' entry.

  More than 7 nights -> 'Beyond a Week'; exactly 0 -> 'Within Today';
  every other value -> 'Within the Week'.
  """
  nights = x['nights']
  if nights > 7:
    return 'Beyond a Week'
  return 'Within Today' if nights == 0 else 'Within the Week'

# Length-of-stay class per stay, stored as a categorical with a fixed category list.
los_categories = ['Beyond a Week', 'Within the Week', 'Within Today']
edstay_ad2['Y'] = [reclass_nights({'nights': n}) for n in edstay_ad2['nights']]
edstay_ad2['Y'] = pd.Categorical(edstay_ad2['Y'], categories=los_categories)

# only admitted / transferred stays take part in the length-of-stay task
admitted_rows = edstay_ad2['disposition'].isin(['ADMITTED', 'TRANSFER'])
edstay_ad2_filt = edstay_ad2[admitted_rows]

# length of stay (LOS): text columns from triage_3 joined to the tabular columns
text_cols = triage_3[['subject_id', 'stay_id', 'chiefcomplaint', 'pre_icd_code']]
all_data_los = text_cols.merge(edstay_ad2_filt, on=['subject_id', 'stay_id'], how='inner')

# tabular cols to keep
keep_cols_los = ['prior_visits', 'daily_patients',
                 'historical_stay_status', 'historical_stay_length_in_day',
                 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain',
                 'acuity', 'gender', 'race_class', 'hour', 'work_hours', 'arrival_transport']

print(all_data_los.columns) # print out columns in all data


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three length-of-stay classes (LabelEncoder orders them alphabetically).
y = all_data_los.Y
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
n_classes = len(le.classes_)

# 70% train, then the held-out 30% is halved into test and validation (15% each),
# stratified on the class so all three sets share the class mix.
X_train, X_test, y_train, y_test = train_test_split(all_data_los, y, test_size=0.3, random_state=42, stratify=y)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

# Text data containing both current chief complaints and previous icd codes
train_samples_chief =  X_train.chiefcomplaint
train_samples_his =  X_train.pre_icd_code
val_samples_chief =  X_val.chiefcomplaint
val_samples_his =  X_val.pre_icd_code
test_samples_chief =  X_test.chiefcomplaint
test_samples_his =  X_test.pre_icd_code

all_data_los_sub = all_data_los[keep_cols_los]

# tabular data - length of stay
train_samples_o =  X_train[keep_cols_los]
val_samples_o =  X_val[keep_cols_los]
test_samples_o =  X_test[keep_cols_los]

# Class weights for model.fit(class_weight=...). The previous version used each
# class's *frequency*, which makes the loss favour the majority class even more.
# Use inverse-frequency ("balanced") weights instead: n_samples / (n_classes * count).
class_counts = np.bincount(y_train, minlength=n_classes)
weights = {
    cls: float(len(y_train) / (n_classes * count))
    for cls, count in enumerate(class_counts)
    if count > 0
}

# One-hot truth variables. `keras.utils.to_categorical` is the public API;
# `keras.utils.np_utils` is a private module that newer Keras releases removed.
# `num_classes` pins the width so all three splits have the same number of columns.
y_train = keras.utils.to_categorical(y_train, num_classes=n_classes)
y_val = keras.utils.to_categorical(y_val, num_classes=n_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=n_classes)


In [None]:
# Column-transform pipeline for the length-of-stay tabular features.
from sklearn.pipeline import make_pipeline

num_attr = all_data_los_sub.select_dtypes(include=['float64', 'int64']).columns
cat_attr = all_data_los_sub.select_dtypes(include=['O']).columns

# The length-of-stay frame is built from `edstay_ad`, which never went through the
# median imputation applied to `updated_edstay_ad`, so the triage vitals still
# contain NaN. StandardScaler passes NaN through and a single NaN input turns the
# network loss into NaN, so impute inside the pipeline. The imputer is fitted on
# the training rows only, when `fit_transform` is called.
trans_pip = ColumnTransformer([
    ("num", make_pipeline(SimpleImputer(strategy='median'), StandardScaler()), num_attr),
    ("cat", OneHotEncoder(), cat_attr)
],remainder='passthrough')
trans_pip

# 3. Embedding for chief complaint and historical icd code

### 3.1 Embedding for chief complaint

In [None]:
# Chief-complaint vectoriser: every text becomes exactly 5 token ids
# (output_sequence_length=5). The vocabulary is learned from the training texts only.
vectorizer = TextVectorization(output_sequence_length=5)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples_chief).batch(128)
vectorizer.adapt(text_ds)

In [None]:
# show the first 20 vocabulary entries
vectorizer.get_vocabulary()[:20]

In [None]:
#example of vectorizing a sentences:
output = vectorizer([["Abd pain, Wound eval"]])
output.numpy()[0, :6]

In [None]:
# token -> vocabulary position lookup for the chief-complaint vectoriser
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# Load the pretrained word vectors used for the chief-complaint embedding.
# The `locals()` check skips the load when `model_w2v` already exists in the
# notebook namespace, i.e. when this cell is re-run in the same session.
if 'model_w2v' not in locals():
  print('Loading model_w2v...')
  model_w2v = KeyedVectors.load_word2vec_format('BioWordVec_PubMed_MIMICIII_d200.vec.bin', binary=True)
  print('Done!')
else:
  print("model_w2v (word2vec PubMed domain) already loaded, ain't doin' that again!")

In [None]:
num_tokens = len(voc) + 2
embedding_dim = 200
hits = 0
misses = 0

# Out-of-vocabulary tokens get a small random vector. The previous code built a new
# GlorotNormal(seed=123) for every miss; identically seeded initialisers return the
# same values, so all unknown words shared one embedding. A single seeded generator
# keeps the run reproducible while giving each miss its own vector. The scale
# matches Glorot-normal for shape (1, embedding_dim): sqrt(2 / (fan_in + fan_out)).
oov_rng = np.random.default_rng(123)
oov_stddev = np.sqrt(2.0 / (1 + embedding_dim))

# Prepare embedding matrix: row i holds the vector of vocabulary entry i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    # `word in model_w2v` works on gensim 3.x and 4.x; the `.vocab` attribute
    # used before was removed in gensim 4.
    if word in model_w2v:
        embedding_matrix[i] = model_w2v.get_vector(word)
        hits += 1
    else:
        embedding_matrix[i] = oov_rng.normal(0.0, oov_stddev, size=embedding_dim)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


In [None]:
# Frozen embedding layer initialised from the matrix built above
# (trainable=False, so the vectors are not updated during training).
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

### 3.2 Embedding for historical icd code group

In [None]:
his_icd_top198 = pd.read_csv('top198historical_icd.csv')

In [None]:
# Vectoriser for the historical ICD groups: 5 token ids per record, vocabulary
# learned from the `icd_group` column of the top-198 table.
vectorizer_his = TextVectorization(output_sequence_length=5)
text_ds_his = tf.data.Dataset.from_tensor_slices(his_icd_top198['icd_group']).batch(128)
vectorizer_his.adapt(text_ds_his)

In [None]:
#top 5 icd:

vectorizer_his.get_vocabulary()[:5]

In [None]:
voc_his = vectorizer_his.get_vocabulary()
word_index_his = {token: position for position, token in enumerate(voc_his)}
num_tokens_his = len(voc_his) + 2
embedding_dim_his = 200


# One-hot "embedding" for the historical ICD groups: the row of each vocabulary
# entry is all zeros except a single 1 at the id the vectoriser assigns to it.
embedding_matrix_his = np.zeros((num_tokens_his, embedding_dim_his))
for word, i in word_index_his.items():
    hot_position = vectorizer_his([word]).numpy()[0][0]
    one_hot_row = np.zeros(embedding_dim_his)
    one_hot_row[hot_position] = 1
    embedding_matrix_his[i] = one_hot_row

In [None]:
# Frozen embedding layer holding the one-hot ICD-group matrix (trainable=False).
embedding_layer_his = Embedding(
    num_tokens_his,
    embedding_dim_his,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix_his),
    trainable=False,
)

### 3.3 Concatenate the two embeddings

In [None]:
# tf.concat([embedding_layer(x_train[0]),[embedding_layer_his(x_train[0])]],0)

# 4. BiGRU model using historical and current data (historical ICD codes & chief complaint)

In [None]:
# Model inputs for the length-of-stay network: token ids for both text fields and
# the transformed tabular matrix (transformer fitted on the training rows only).
x_train = vectorizer(np.array([[s] for s in train_samples_chief])).numpy()
x_train_his = vectorizer_his(np.array([[s] for s in train_samples_his])).numpy()
x_train_o = trans_pip.fit_transform(train_samples_o)

x_val = vectorizer(np.array([[s] for s in val_samples_chief])).numpy()
x_val_his = vectorizer_his(np.array([[s] for s in val_samples_his])).numpy()
x_val_o = trans_pip.transform(val_samples_o)

# y_train / y_val are deliberately NOT reassigned here. They are the one-hot
# length-of-stay targets produced by the train/val/test split cell and line up
# row-for-row with the samples above. This cell used to overwrite them with
# train_labels / val_labels - the *binary admission* labels of the earlier
# index-based split, which cover different rows and a different task.
assert len(y_train) == len(x_train) and len(y_val) == len(x_val), \
    "targets and inputs are out of sync - re-run the length-of-stay split cell"

In [None]:
x_train_o.shape

In [None]:
# !pip install numpy==1.19.5
# import numpy


In [None]:

#Optimized parameters for LSTM: {'units': 114, 'activation': 'tanh', 'optimizer': 'adam', 'rate': 0}
#Optimized parameters for MLP: {'units': 100/50/1, 'glorot_uniform', 0, 'relu'}

# NOTE(review): the inputs and embedded sequences created below are not referenced
# again in this notebook - create_GRU() and create_mlp() build their own inputs,
# and `inp3` uses width 24 while the MLP is created with a different width.
# They look like leftovers; confirm before removing.
col1 = keras.Input(shape = (5,))
embedded_sequences1 = embedding_layer_his(col1)
col2 = keras.Input(shape = (5,))
embedded_sequences2 = embedding_layer(col2)
embedded_sequences = keras.layers.Concatenate(axis=1)([embedded_sequences1, embedded_sequences2])
inp3 = keras.Input(shape = (24,))

def create_mlp(dim):
  """Build the tabular branch: Input(dim) -> Dense(100, relu) -> Dense(64, relu).

  Both Dense layers use the 'he_normal' initialiser. Returns a Keras Model whose
  output is the 64-unit hidden representation (no prediction layer).
  """
  tabular_in = keras.Input(shape = (dim,))
  hidden = keras.layers.Dense(100, kernel_initializer = 'he_normal', input_dim=dim, activation="relu")(tabular_in)
  hidden = keras.layers.Dense(64, kernel_initializer = 'he_normal', activation="relu")(hidden)
  return keras.models.Model(inputs = tabular_in, outputs = hidden)

def create_GRU():
  """Build the text branch over the two 5-token inputs.

  Inputs, in order: historical ICD-group ids, chief-complaint ids. Each goes
  through its frozen embedding layer, the two sequences are concatenated along
  the time axis (axis=1), then two stacked bidirectional GRUs (114 units each)
  and a Dense(64, relu) follow. Returns a Keras Model without a prediction layer.
  """
  icd_ids = keras.Input(shape = (5,))
  complaint_ids = keras.Input(shape = (5,))
  icd_embedded = embedding_layer_his(icd_ids)
  complaint_embedded = embedding_layer(complaint_ids)
  sequence = keras.layers.Concatenate(axis=1)([icd_embedded, complaint_embedded])
  encoded = keras.layers.Bidirectional(keras.layers.GRU(114, return_sequences=True))(sequence)
  encoded = keras.layers.Bidirectional(keras.layers.GRU(114))(encoded)
  encoded = keras.layers.Dense(64,activation = 'relu')(encoded)
  return keras.models.Model(inputs = [icd_ids, complaint_ids], outputs = encoded)
  
tf.random.set_seed(42)

# Size the tabular branch from the transformed training matrix instead of a
# hard-coded 29, so a change in the feature list or in the number of one-hot
# categories cannot silently break the input shape.
mlp = create_mlp(x_train_o.shape[1])
gru = create_GRU()

In [None]:
# Fuse the tabular and text branches and predict the length-of-stay class.
combinedInput = keras.layers.concatenate([mlp.output, gru.output])
# NOTE(review): these two Dense layers have no activation (linear) - confirm that is intended.
x = keras.layers.Dense(64)(combinedInput)
x = keras.layers.Dense(32)(x)
# The three classes are mutually exclusive and the model is trained with
# categorical cross-entropy on one-hot targets, so the output must be a softmax
# distribution. Independent sigmoids do not sum to 1 and do not match that loss.
x = keras.layers.Dense(3, activation="softmax")(x)
model = keras.models.Model(inputs=[mlp.input, gru.input], outputs=x)

NameError: ignored

In [None]:
# Compile for 3-class classification; stop once val_loss has not improved for 10 epochs.
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = "accuracy",
    run_eagerly=True,
)
early = EarlyStopping(monitor='val_loss', patience=10)

# TO DO:
# - Length of stay, 3 classes (leave today, within the week, beyond the week): DONE
# - align updated_edstay data: DONE
# - try XGBoost with last() vital signs
# - then an LSTM on the vital-sign time series

In [None]:
y_train

In [None]:
# Input nesting mirrors the model definition: [tabular, [historical ICD ids, chief-complaint ids]].
train_inputs = [x_train_o, [x_train_his, x_train]]
val_inputs = [x_val_o, [x_val_his, x_val]]
model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_val),
    batch_size=256,
    epochs=100,
    callbacks=[early],
    class_weight=weights,
)

In [None]:
model.save('/content/drive/MyDrive/Capstone_project/model_length_of_stay_01.h5')

In [None]:
# Draw the network graph. `keras.utils.plot_model` is the public entry point; the
# private `keras.utils.vis_utils` module imported here before is not available in
# newer Keras releases.
#keras.utils.plot_model(model, to_file='model_plot_1.png', show_shapes=True, show_layer_names=True)
keras.utils.plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Test-set inputs, transformed with the vectorisers / column transformer fitted on training data.
x_test = vectorizer(np.array([[s] for s in test_samples_chief])).numpy()
x_test_his = vectorizer_his(np.array([[s] for s in test_samples_his])).numpy()
x_test_o = trans_pip.transform(test_samples_o)

# y_test is deliberately NOT reassigned: it already holds the one-hot
# length-of-stay targets from the split cell. Overwriting it with `test_labels`
# (the binary admission labels of the earlier index-based split) would pair these
# inputs with labels from different rows and a different task.
assert len(y_test) == len(x_test), \
    "targets and inputs are out of sync - re-run the length-of-stay split cell"

In [None]:
y_test

In [None]:
# Per-class scores for the test set (one column per output unit of the model).
y_test_pred = model.predict(x=[x_test_o,[x_test_his, x_test]])

# NOTE(review): the commented-out metrics below threshold at 0.5 and use binary
# f1/accuracy; they appear to come from the binary admission model and do not
# apply to a 3-column prediction as-is.
# print("roc auc for validation set:", roc_auc_score(y_test, y_test_pred))
# y_pred_test_class = np.where(y_test_pred> 0.5, 1, 0)
# print("accuracy score for test set:", accuracy_score(y_test, y_pred_test_class))
# print("f1 score for test set:", f1_score(y_test, y_pred_test_class))
# confusion_matrix(y_test, y_pred_test_class)

y_test_pred

In [None]:
# Confusion matrix for the 3-class length-of-stay model.
# Previously this cell used `y_pred_test_class` before any cell defined it
# (NameError on a fresh run) and labelled the axes with the binary Home/Admitted
# classes. Derive both label vectors here instead:
#   truth     -> the encoded 'Y' column of the held-out rows behind x_test
#   predicted -> the arg-max over the three output scores
y_test_true_class = le.transform(X_test['Y'])
y_pred_test_class = np.argmax(y_test_pred, axis=1)

class_ids = list(range(len(le.classes_)))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test_true_class, y_pred_test_class, labels=class_ids),
    display_labels=le.classes_,
)
disp.plot()
plt.show()

In [None]:
loaded_1 = keras.models.load_model("/content/drive/MyDrive/Capstone_project/model_2b.h5")


In [None]:
# Share of each class in y_train: weights[1] is the mean of y_train, weights[0] the
# rest. A later cell uses weights[1] as a decision threshold.
# NOTE(review): this rebinds `weights`, which was also passed as `class_weight` to
# model.fit above, and it only makes sense for a 1-D 0/1 `y_train` - confirm which
# y_train is meant here.
weights = {0:1-(sum(y_train)/len(y_train)),1:sum(y_train)/len(y_train)}

In [None]:
# Evaluate the reloaded model on the test set with a 0.5 decision threshold.
# NOTE(review): f1_score / roc_auc_score are used in their binary form here, so
# y_test must be a 1-D 0/1 vector when this cell runs - confirm.
y_test_pred = loaded_1.predict(x=[x_test_o,x_test_his, x_test])

# The message used to say "validation set" although the score is computed on test data.
print("roc auc for test set:", roc_auc_score(y_test, y_test_pred))
y_pred_test_class = np.where(y_test_pred> 0.5, 1, 0)
print("accuracy score for test set:", accuracy_score(y_test, y_pred_test_class))
print("f1 score for test set:", f1_score(y_test, y_pred_test_class))
confusion_matrix(y_test, y_pred_test_class)


In [None]:
# Same evaluation, but with the positive-class share (weights[1]) as decision threshold.
# NOTE(review): f1_score / roc_auc_score are used in their binary form here, so
# y_test must be a 1-D 0/1 vector when this cell runs - confirm.
y_test_pred = loaded_1.predict(x=[x_test_o,x_test_his, x_test])

# The message used to say "validation set" although the score is computed on test data.
print("roc auc for test set:", roc_auc_score(y_test, y_test_pred))
y_pred_test_class = np.where(y_test_pred> weights[1], 1, 0)
print("accuracy score for test set:", accuracy_score(y_test, y_pred_test_class))
print("f1 score for test set:", f1_score(y_test, y_pred_test_class))
confusion_matrix(y_test, y_pred_test_class)

In [None]:
# Plot the confusion matrix of the thresholded predictions from the previous cell,
# labelled with the two admission classes.
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred_test_class), display_labels = ["Home","Admitted"])
disp.plot()
plt.show()

In [None]:
# Export intermediate artefacts to CSV, one file per object, named after the variable.
pd.DataFrame(x_test_o).to_csv("x_test_o.csv",index=False)
pd.DataFrame(x_test).to_csv("x_test.csv",index=False)
pd.DataFrame(x_test_his).to_csv("x_test_his.csv",index=False)
pd.DataFrame(train_samples_o).to_csv("train_samples_o.csv",index=False)
pd.DataFrame(test_samples_his).to_csv("test_samples_his.csv",index=False)
pd.DataFrame(test_samples_chief).to_csv("test_samples_chief.csv",index=False)
pd.DataFrame(edstay_ad1).to_csv("edstay_ad1.csv",index=False)
pd.DataFrame(updated_edstay_ad).to_csv("updated_edstay_ad.csv",index=False)
# triage_3 used to be written to "updated_edstay_ad.csv" as well, which silently
# overwrote the file produced on the line above.
pd.DataFrame(triage_3).to_csv("triage_3.csv",index=False)

In [None]:
!pip install shap

In [None]:
import shap
import tensorflow.compat.v1.keras.backend as K
import tensorflow as tf
# Switches TensorFlow out of eager mode from this point on.
# NOTE(review): the model above was compiled with run_eagerly=True and this call
# comes after TensorFlow has already been used in the session - confirm the
# following cells still work on a fresh Restart & Run All.
tf.compat.v1.disable_eager_execution()

In [None]:
explainer = shap.DeepExplainer(model, [x_test_o[:1000],x_test_his[:1000],x_test[:1000]])

In [None]:
shap.initjs()

# SHAP values for the first 500 test rows of each model input.
# NOTE(review): the inputs are passed as a flat list of three arrays, while the
# model was built with nested inputs [tabular, [icd ids, complaint ids]] - confirm
# the explainer accepts this layout.
shap_values = explainer.shap_values([x_test_o[:500],x_test_his[:500],x_test[:500]])

In [None]:
shap.plots.beeswarm(shap_values)

In [None]:
# lazy XGBoost for admitted
import xgboost as xgb

In [None]:
xgb_model = xgb.XGBClassifier()

In [None]:
# NOTE(review): `x_train_o` is (re)built from the length-of-stay split in section 4,
# whereas `train_labels` comes from the earlier index-based admission split, so the
# two may not describe the same rows (or even have the same length) - confirm
# before trusting this fit.
xgb_model.fit(x_train_o, train_labels)

In [None]:
# Rebuild the admission feature frame for XGBoost; unlike the earlier version, the
# target column y_var is kept here (it is split off in a later cell).
xgb_drop_cols = [
    'Unnamed: 0', 'hadm_id', 'intime', 'intime_h', 'outtime', 'race', 'disposition',
    'y_var_adm_text', 'key', 'DateTime_in', 'DateTime_out', 'stay',
    'historical_stay_length', 'chiefcomplaint',
    'subjects_entering', 'subjects_leaving', 'in_date', 'out_date',
]
edstay_ad1 = edstay_ad.drop(columns=xgb_drop_cols)

In [None]:
# Cast the string-valued features to pandas categoricals.
for cat_col in ['gender', 'arrival_transport', 'race_class', 'historical_stay_status']:
    edstay_ad1[cat_col] = pd.Categorical(edstay_ad1[cat_col])
edstay_ad1.info()

In [None]:
# Design matrix for the "lazy" XGBoost admission model.
y_labels = edstay_ad1['y_var']

X = edstay_ad1.drop(['subject_id','stay_id','y_var'], axis=1)
X = pd.get_dummies(X)

# Fill missing values with the column mean (one vectorised call instead of a
# Python loop over the columns).
X = X.fillna(X.mean())

# random_state makes the split - and every score and plot below - reproducible.
# NOTE(review): test_size=0.7 trains on 30% and evaluates on 70% of the rows;
# confirm this is intended and not a swapped 0.3.
X_train, X_test, y_train, y_test  = train_test_split(X, y_labels, test_size=0.7, stratify=y_labels, random_state=42)
X.info()

In [None]:
from sklearn.metrics import accuracy_score,f1_score

from sklearn.ensemble import RandomForestClassifier

# Default-parameter XGBoost baseline for the admission target.
xgb_cl = xgb.XGBClassifier()
# ran = RandomForestClassifier()
xgb_cl.fit(X_train, y_train)

# Hard class predictions on the held-out rows, then accuracy and binary F1.
preds = xgb_cl.predict(X_test)
test_accuracy = accuracy_score(y_test, preds)
test_f1 = f1_score(y_test, preds)
print('Acc:',test_accuracy)
print('F1:',test_f1)

In [None]:
# confusion_matrix expects (y_true, y_pred). With the arguments swapped the matrix
# was transposed, so the plot's "True label" / "Predicted label" axes were mislabelled.
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, preds), display_labels = ["Home","Admitted"])
disp.plot()
plt.show()

In [None]:
explainer = shap.Explainer(xgb_cl)

In [None]:
shap_values = explainer(X_train)

In [None]:
shap.plots.beeswarm(shap_values, max_display=20)

In [None]:
shap.plots.bar(shap_values, max_display=20)

In [None]:
# lazy xgb for length of stay 3 class
# Rebuilds the wide length-of-stay frame exactly as in the earlier LOS cell.
edstay_ad2 = edstay_ad.drop(
    ['Unnamed: 0', 'hadm_id', 'intime', 'intime_h', 'outtime', 'race', 'y_var_adm_text',
     'key', 'DateTime_in', 'DateTime_out', 'historical_stay_length', 'chiefcomplaint',
     'subjects_entering', 'subjects_leaving'],
    axis=1,
)
from datetime import datetime

# whole days between in_date and out_date
for date_col in ('in_date', 'out_date'):
    edstay_ad2[date_col] = pd.to_datetime(edstay_ad2[date_col])
diff = edstay_ad2['out_date'].sub(edstay_ad2['in_date'])
edstay_ad2['nights'] = diff.dt.days

def reclass_nights(x):
  """Classify a row-like `x` by its 'nights' value.

  > 7 -> 'Beyond a Week', == 0 -> 'Within Today', anything else -> 'Within the Week'.
  (Same rule as the definition in the earlier length-of-stay cell.)
  """
  n = x['nights']
  if n > 7:
    label = 'Beyond a Week'
  elif n == 0:
    label = 'Within Today'
  else:
    label = 'Within the Week'
  return label

# Length-of-stay class per stay, as a categorical with a fixed category list.
los_class = [reclass_nights({'nights': n}) for n in edstay_ad2['nights']]
edstay_ad2['Y'] = pd.Categorical(los_class, categories = ['Beyond a Week', 'Within the Week', 'Within Today'])

# only admitted / transferred stays are modelled
is_admitted = edstay_ad2['disposition'].isin(['ADMITTED','TRANSFER'])
edstay_ad2_filt = edstay_ad2[is_admitted]

In [None]:

# edstay_ad2_filt.info()
edstay_ad2_filt.Y.value_counts()

In [None]:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score,f1_score
from sklearn.ensemble import RandomForestClassifier

# Integer-encode the three length-of-stay classes.
y_labels = edstay_ad2_filt['Y']
le = preprocessing.LabelEncoder()
le.fit(y_labels)
y_labels = le.transform(y_labels)

# Features: drop identifiers, both targets and everything the target was derived from.
X = edstay_ad2_filt.drop(['subject_id','stay_id','y_var', 'disposition', 'Y','nights','in_date','out_date','stay'], axis=1)
X = pd.get_dummies(X)

# Fill missing values with the column mean (vectorised).
X = X.fillna(X.mean())

# random_state makes the split and the reported scores reproducible.
# NOTE(review): test_size=0.7 trains on 30% and evaluates on 70% of the rows;
# confirm this is intended and not a swapped 0.3.
X_train, X_test, y_train, y_test  = train_test_split(X, y_labels, test_size=0.7, stratify=y_labels, random_state=42)

# Init classifier
xgb_cl = xgb.XGBClassifier()
# ran = RandomForestClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

# Score
print('Acc:',accuracy_score(y_test, preds))
print('F1:',f1_score(y_test, preds, average = 'weighted'))

# confusion_matrix expects (y_true, y_pred); the swapped call transposed the matrix.
# le.classes_ is in the same order as the integer codes, so the tick labels match.
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, preds), display_labels=le.classes_)
disp.plot()
plt.show()

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped, the rows
# and columns of the plot were exchanged. Tick labels come from the fitted encoder
# so they always follow the integer class codes.
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, preds), display_labels=le.classes_)
disp.plot()
plt.show()

In [None]:
# Sanity check of the label encoding: show the first 10 predictions as class names
# and as the integer codes they were decoded from.
decoded = le.inverse_transform(preds)
print(decoded[:10])
print(preds[:10])

In [None]:
# SHAP values of the length-of-stay XGBoost model on the held-out rows, shown as a summary plot.
shap_values = shap.TreeExplainer(xgb_cl).shap_values(X_test)
shap.summary_plot(shap_values, X_test)