In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the path of every file found under the Kaggle input mount.
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_dir, input_file))
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

In [None]:
'''
    Helper functions
'''
def get_roc_auc_score(Model, X, y):
    """Return the ROC AUC of ``Model``'s predictions on ``X`` against ``y``.

    Parameters
    ----------
    Model : object with a ``predict(X)`` method returning per-sample scores,
        either as a single column or as one column per class.
    X : input accepted by ``Model.predict``.
    y : array-like of true labels.

    Returns
    -------
    float
        The value of ``roc_auc_score(y, scores)``.
    """
    scores = np.asarray(Model.predict(X))
    if scores.ndim == 1 or scores.shape[1] == 1:
        # Single-output model: argmax over one column is always 0, which
        # would pin the AUC at 0.5, so rank by the raw score instead.
        scores = scores.ravel()
    elif scores.shape[1] == 2:
        # Two-column output: use the positive-class column so the AUC sees
        # a ranking rather than thresholded labels.
        scores = scores[:, 1]
    else:
        # More than two columns: keep the previous hard-label behaviour.
        scores = scores.argmax(1)
    return roc_auc_score(y, scores)

def get_missing_vitals_df(df):
    """Count missing values per measurement across its d1/h1 min/max columns.

    Measurements are discovered from the columns of ``df`` named
    ``d1_<name>_max``; for each one the columns ``d1_<name>_min``,
    ``h1_<name>_max`` and ``h1_<name>_min`` must also exist.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Indexed by measurement name (index name ``vitals``) with the NaN
        counts in columns ``vitals_d1_max``, ``vitals_d1_min``,
        ``vitals_h1_max`` and ``vitals_h1_min``.

    Raises
    ------
    KeyError
        If one of the expected sibling columns is absent from ``df``.
    """
    # The string accessor lives on the column Index, not on the DataFrame.
    is_d1_max = df.columns.str.startswith('d1_') & df.columns.str.endswith('_max')
    vitals = [name[len('d1_'):-len('_max')] for name in df.columns[is_d1_max]]

    def na_counts(pattern):
        # NaN count of each measurement's column, in the order of `vitals`.
        return df[[pattern.format(v) for v in vitals]].isna().sum().values

    return pd.DataFrame({
        'vitals': vitals,
        'vitals_d1_max': na_counts('d1_{}_max'),
        'vitals_d1_min': na_counts('d1_{}_min'),
        'vitals_h1_max': na_counts('h1_{}_max'),
        'vitals_h1_min': na_counts('h1_{}_min'),
    }).set_index('vitals')

def gelu(x):
    """Tanh-based GELU activation built from TensorFlow ops.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """
    scale = tf.sqrt(2 / np.pi)
    inner = x + 0.044715 * tf.pow(x, 3)
    return 0.5 * x * (1 + tf.tanh(scale * inner))

def _fill_median(df, columns):
    """Replace NaNs in each of ``columns`` with that column's own median."""
    for column in columns:
        # Assign back instead of fillna(inplace=True) on a column selection:
        # the in-place form is chained assignment and is not guaranteed to
        # write through to ``df``.
        df[column] = df[column].fillna(df[column].median())


def _fill_from(df, target, source):
    """Replace NaNs in ``df[target]`` with the same-row value of ``df[source]``."""
    df[target] = np.where(df[target].isna(), df[source], df[target])


def clean_data(df, column_dtypes=None):
    """Drop identifier columns, stringify categoricals and impute missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified; the work happens on the frame
        returned by ``drop``.
    column_dtypes : mapping of column name -> data-type label, optional
        Non-object columns whose label is ``'binary'`` are treated as
        categorical. Defaults to the module-level ``Column_Dt_Mapping``.
        Columns absent from the mapping are treated as non-binary.

    Returns
    -------
    (list of str, pandas.DataFrame)
        The categorical column names and the cleaned frame.
    """
    if column_dtypes is None:
        column_dtypes = Column_Dt_Mapping

    df = df.drop(columns=['Unnamed: 0', 'encounter_id', 'hospital_id', 'icu_id'])

    # Categorical columns become strings, with a dedicated token for missing.
    col_cat = []
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('Na').astype('str')
            col_cat.append(col)
        elif column_dtypes.get(col) == 'binary':
            df[col] = df[col].fillna('2').astype('str')
            col_cat.append(col)
    for col in ['apache_3j_diagnosis', 'apache_2_diagnosis']:
        if col not in col_cat:
            col_cat.append(col)
        df[col] = df[col].fillna('Na').astype('str')

    # Body measurements: impute with the mean of rows sharing the same gender
    # ('Na' is the token written above for a missing gender).
    for col in ['height', 'weight', 'bmi']:
        for gender in ('M', 'F', 'Na'):
            same_gender = df['gender'] == gender
            gender_mean = df.loc[same_gender, col].mean()
            df[col] = np.where(df[col].isna() & same_gender, gender_mean, df[col])

    # Measurements that have invasive / noninvasive variants are discovered
    # from the h1_<name>_invasive_min columns.
    is_h1_invasive_min = (df.columns.str.startswith('h1_')
                          & df.columns.str.endswith('_invasive_min'))
    invasive_vitals = [name[len('h1_'):-len('_invasive_min')]
                       for name in df.columns[is_h1_invasive_min]]

    for vital in invasive_vitals:
        for window in ('h1', 'd1'):
            for stat in ('max', 'min'):
                combined = f'{window}_{vital}_{stat}'
                _fill_median(df, [combined])
                # Only the missing readings are taken from the combined
                # column; observed invasive / noninvasive values are kept.
                for kind in ('invasive', 'noninvasive'):
                    _fill_from(df, f'{window}_{vital}_{kind}_{stat}', combined)

    # d1 gets its median, h1 and the APACHE value fall back to d1.
    for col in ['albumin', 'bilirubin', 'creatinine', 'glucose', 'hematocrit',
                'resprate', 'sodium', 'temp', 'wbc', 'bun']:
        _fill_median(df, [f'd1_{col}_max', f'd1_{col}_min'])
        _fill_from(df, f'h1_{col}_max', f'd1_{col}_max')
        _fill_from(df, f'h1_{col}_min', f'd1_{col}_min')
        _fill_from(df, f'{col}_apache', f'd1_{col}_max')

    # Same d1 -> h1 fallback for measurements handled without an APACHE column here.
    for col in ['spo2', 'calcium', 'hco3', 'hemaglobin', 'inr', 'lactate',
                'platelets', 'potassium', 'arterial_pco2', 'arterial_ph',
                'arterial_po2', 'heartrate', 'pao2fio2ratio']:
        _fill_median(df, [f'd1_{col}_max', f'd1_{col}_min'])
        _fill_from(df, f'h1_{col}_max', f'd1_{col}_max')
        _fill_from(df, f'h1_{col}_min', f'd1_{col}_min')

    # Remaining APACHE columns: plain median imputation.
    _fill_median(df, [f'{col}_apache' for col in
                      ['fio2', 'gcs_eyes', 'gcs_motor', 'gcs_verbal', 'heart_rate',
                       'map', 'urineoutput', 'paco2', 'paco2_for_ph', 'pao2', 'ph']])

    return col_cat, df

In [None]:
# Directory holding the competition CSVs, relative to the notebook.
Base_dir = '../input/widsdatathon2021/'

# Join directory and file name as separate arguments so the paths do not
# depend on Base_dir ending with a slash.
train = pd.read_csv(os.path.join(Base_dir, 'TrainingWiDS2021.csv'))
test = pd.read_csv(os.path.join(Base_dir, 'UnlabeledWiDS2021.csv'))
DataDictionary = pd.read_csv(os.path.join(Base_dir, 'DataDictionaryWiDS2021.csv'))

## Imputing Missing Values

In [None]:
# Work on copies so the raw frames stay intact and this cell can be re-run.
test_clean = test.copy()
train_clean = train.copy()

# Column name -> data-type label from the data dictionary; read by clean_data().
Column_Dt_Mapping = dict(zip(DataDictionary['Variable Name'], DataDictionary['Data Type']))

# NOTE(review): these two test rows are blanked by hand, presumably because
# their diagnosis codes never occur in training and would make
# LabelEncoder.transform fail below. Confirm against the data.
test_clean.at[8360, 'apache_3j_diagnosis'] = np.nan
test_clean.at[8494, 'apache_3j_diagnosis'] = np.nan

# Impute age in both frames with the training median, so a missing test age
# cannot reach the model as NaN.
age_median = train_clean['age'].median()
train_clean['age'] = train_clean['age'].fillna(age_median)
test_clean['age'] = test_clean['age'].astype('float64').fillna(age_median)

# The categorical column list is taken from the test frame's pass.
_, train_clean = clean_data(train_clean)
col_cat, test_clean = clean_data(test_clean)

# Fit each encoder on the training values and apply it to both frames.
for col in col_cat:
    le = LabelEncoder().fit(train_clean[col])
    train_clean[col] = le.transform(train_clean[col])
    test_clean[col] = le.transform(test_clean[col])

In [None]:
# Collect, for each frame, the columns that still contain NaNs and how many.
missing_columns_tr = {}
for column_name, na_count in train_clean.isna().sum().items():
    if na_count:
        missing_columns_tr[column_name] = na_count

missing_columns_te = {}
for column_name, na_count in test_clean.isna().sum().items():
    if na_count:
        missing_columns_te[column_name] = na_count

print(f'No. of columns having missing items in training columns: {len(missing_columns_tr)}, in test columns: {len(missing_columns_te)}')

In [None]:
#vt = VarianceThreshold(threshold=0.001)
#vt.fit_transform(train_clean).shape

In [None]:
# List the non-float64 training columns that hold a single distinct value.
for col in train_clean.columns:
    if train_clean[col].nunique() != 1:
        continue
    if train_clean[col].dtype != 'float64':
        print(col)

In [None]:
# Remove 'readmission_status' from both frames and from the categorical list.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; errors='ignore' and the list rebuilds keep the cell re-runnable.
train_clean = train_clean.drop(columns='readmission_status', errors='ignore')
test_clean = test_clean.drop(columns='readmission_status', errors='ignore')
col_cat = [col for col in col_cat if col != 'readmission_status']

# Continuous features: everything that is neither categorical nor the target.
col_cont = [col for col in train_clean.columns
            if col not in col_cat and col != 'diabetes_mellitus']

In [None]:
#scaler = StandardScaler()
#X[col_cont] = scaler.fit_transform(X[col_cont])
#X_valid[col_cont] = scaler.fit_transform(X_valid[col_cont])
#test_data = test_clean.copy()
#test_data[col_cont] = scaler.transform(test_data[col_cont])


#X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42, stratify = y)

#scaler = StandardScaler()
#X_train[col_cont] = scaler.fit_transform(X_train[col_cont])
#X_valid[col_cont] = scaler.fit_transform(X_valid[col_cont])
#test_data = test_clean.copy()
#test_data[col_cont] = scaler.transform(test_data[col_cont])

#X_train = [np.absolute(X_train.loc[:,f]) for f in col_cat]+[X_train[col_cont]]
#X_valid = [np.absolute(X_valid.loc[:,f]) for f in col_cat]+[X_valid[col_cont]]
#test_data = [np.absolute(test_data.loc[:,f]) for f in col_cat]+[test_data[col_cont]]

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Model, load_model
from keras.layers import Dropout, Dense, BatchNormalization, Embedding, Input, Concatenate, SpatialDropout1D, Reshape, Flatten, concatenate, Activation, LeakyReLU
from keras.metrics import AUC
from tensorflow.keras import utils
from tensorflow.keras.utils import get_custom_objects
from keras.optimizers import Adam, SGD
from keras.losses import binary_crossentropy
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Register the custom activations under string names so layers can request
# them through `activation=<name>`.
get_custom_objects().update({
    'gelu': Activation(gelu),
    'leaky-relu': Activation(LeakyReLU(alpha=0.2)),
})


# Hyper-parameters read by the model builder and the training loop.
CFG = dict(
    feature_selection_dropout=0.2,
    categorical_dropout=0.1,
    first_dense=256,
    second_dense=256,
    dense_dropout=0.2,
    activation_type='leaky-relu',
    activation='sigmoid',
    epochs=200,
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),  # one optimizer instance, created here
    mon_metrics='val_auc',
    num_folds=5,
)

In [None]:
def dnn_embedding(feature_selection_dropout = CFG['feature_selection_dropout'],
                  categorical_dropout = CFG['categorical_dropout'],
                  first_dense = CFG['first_dense'],
                  second_dense = CFG['second_dense'],
                  dense_dropout = CFG['dense_dropout'],
                  activation_type = CFG['activation_type'],
                  activation = CFG['activation']):
    """Build the (uncompiled) embedding + dense network.

    Every column in the global ``col_cat`` gets its own one-value input and
    an embedding sized from its number of distinct values in ``train_clean``.
    The ``len(col_cont)`` continuous features share one batch-normalised
    input. All branches are concatenated and passed through four dense
    blocks and a single-unit output layer.

    Parameters
    ----------
    feature_selection_dropout : currently unused by the body.
    categorical_dropout : rate of the SpatialDropout1D after each embedding.
    first_dense : units of the first dense layer.
    second_dense : units of the three following dense layers.
    dense_dropout : dropout rate after every dense layer.
    activation_type : activation of the hidden dense layers.
    activation : activation of the output layer.

    Returns
    -------
    Model
        Inputs are ordered as ``col_cat`` followed by the numeric input.
    """
    model_inputs = []
    branches = []

    # One embedding branch per categorical column.
    for cat_col in col_cat:
        cat_in = Input(shape=[1], name=cat_col)
        cardinality = int(train_clean[cat_col].nunique())
        # Half the cardinality, rounded up, capped at 100 dimensions.
        width = int(min(np.ceil(cardinality / 2), 100))
        cat_vec = Embedding(cardinality + 1, width, name=cat_col + "_embed")(cat_in)
        cat_vec = SpatialDropout1D(categorical_dropout)(cat_vec)
        cat_vec = Reshape(target_shape=(width,))(cat_vec)
        model_inputs.append(cat_in)
        branches.append(cat_vec)

    # Single branch carrying all continuous features.
    num_in = Input(shape=(len(col_cont),))
    num_normed = BatchNormalization()(num_in)
    model_inputs.append(num_in)
    branches.append(num_normed)

    x = concatenate(branches)

    # (units, batch-norm after the dense layer?) for each hidden block.
    hidden_blocks = [
        (first_dense, False),
        (second_dense, True),
        (second_dense, False),
        (second_dense, False),
    ]
    for units, with_batchnorm in hidden_blocks:
        x = Dense(units, activation=activation_type)(x)
        if with_batchnorm:
            x = BatchNormalization()(x)
        x = Dropout(dense_dropout)(x)

    y = Dense(1, activation=activation)(x)
    return Model(inputs = model_inputs, outputs = y)

In [None]:
# Build one model with the default arguments and print its layer summary.
dnn_embedding().summary()

In [None]:
# Out-of-fold predictions for the training rows, and the running sum of the
# per-fold test predictions (divided by the fold count after the loop).
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

y = train_clean['diabetes_mellitus'].astype(int)
# `columns=` instead of the positional axis argument, which pandas 2.0 rejects.
X = train_clean.drop(columns='diabetes_mellitus')

skf = StratifiedKFold(n_splits=CFG['num_folds'], shuffle = True, random_state=42)

for folds, (tdx, vdx) in enumerate(skf.split(X, y.values)):
    # Positional selection for both X and y; copies so the scaled columns can
    # be written back without touching X.
    X_train, X_valid = X.iloc[tdx].copy(), X.iloc[vdx].copy()
    y_train, y_valid = y.iloc[tdx], y.iloc[vdx]

    # Fit the scaler on the training fold only, then apply it everywhere.
    scaler = StandardScaler()
    X_train[col_cont] = scaler.fit_transform(X_train[col_cont])
    X_valid[col_cont] = scaler.transform(X_valid[col_cont])
    test_data = test_clean.copy()
    test_data[col_cont] = scaler.transform(test_data[col_cont])

    # Model inputs: one array per categorical column, then the numeric block.
    test_data = [np.absolute(test_data[col]) for col in col_cat] + [test_data[col_cont]]
    X_train = [np.absolute(X_train[col]) for col in col_cat] + [X_train[col_cont]]
    X_valid = [np.absolute(X_valid[col]) for col in col_cat] + [X_valid[col_cont]]

    model = dnn_embedding()

    # Fresh optimizer per fold, rebuilt from the configured one, so no
    # optimizer state is shared between the folds' models.
    optimizer = type(CFG['optimizer']).from_config(CFG['optimizer'].get_config())

    # Name the AUC metric explicitly: Keras auto-numbers unnamed metrics
    # (auc, auc_1, ...) across models, which would leave the monitored
    # 'val_auc' missing from the second fold onwards.
    model.compile(loss = CFG['loss'], optimizer = optimizer,
                  metrics = ['accuracy', AUC(name='auc')])

    es = EarlyStopping(monitor = CFG['mon_metrics'], min_delta = 0.001, patience = 20,
                       verbose = 1, mode = 'max', baseline = None, restore_best_weights = True)

    # Early stopping is the only callback. The ReduceLROnPlateau object that
    # used to be built here was never handed to fit(), so it has been dropped.
    model.fit(X_train,
              y_train,
              validation_data = (X_valid, y_valid),
              verbose = 1,
              batch_size = 5024,
              callbacks=[es],
              epochs=CFG['epochs']
             )

    valid_fold_preds = model.predict(X_valid).ravel()
    test_fold_preds = model.predict(test_data).ravel()
    oof_preds[vdx] = valid_fold_preds
    test_preds += test_fold_preds
    print(f'Fold: {str(folds)}, AUC: {roc_auc_score(y_valid, valid_fold_preds)}')

In [None]:
# Score the out-of-fold predictions against the training labels.
AUC_FINAL = roc_auc_score(y.values, oof_preds)
# Average the summed test predictions over the configured number of folds
# rather than over a loop variable left behind by the training cell.
test_predictions = test_preds / CFG['num_folds']
print(f'Overall AUC ROC: {AUC_FINAL}')

In [None]:
# Build the submission from the unlabeled file's ids plus the averaged
# predictions. `assign` returns a new frame, which avoids writing a column
# into a slice of Unlabeled1.
Unlabeled1 = pd.read_csv(os.path.join(Base_dir,'UnlabeledWiDS2021.csv'))
submit = Unlabeled1[['encounter_id']].assign(diabetes_mellitus=test_predictions)
submit.to_csv('ann_embeded_complete_epoch_02_00.csv',index=False)
submit.head()