In [35]:
import pandas as pd
import numpy as np
import tensorflow as tf
import warnings
warnings.simplefilter("ignore")
from sklearn.utils import resample

url_1 = 'https://raw.githubusercontent.com/takanju/wids_datathon_2021/master/TrainingWiDS2021.csv'
train_local_path = '../data/TrainingWiDS2021.csv'
url_2 = 'https://raw.githubusercontent.com/takanju/wids_datathon_2021/master/UnlabeledWiDS2021.csv'
test_local_path = '../data/UnlabeledWiDS2021.csv'

In [36]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Disable GPU
tf.config.list_physical_devices('GPU')

# Simpler model = CPU faster than GPU

[]

In [37]:
# Splitted data to make same shape of both test and train so that we can apply pre processing on both
# Ref : https://www.kaggle.com/siavrez/2020fatures
medical_data = pd.read_csv(train_local_path, error_bad_lines=False, index_col=0)
test_df = pd.read_csv(test_local_path, error_bad_lines=False, index_col=0)
y = medical_data["diabetes_mellitus"]
medical_data = medical_data.drop(["diabetes_mellitus"], axis=1)

In [38]:
medical_data.shape

(130157, 179)

In [39]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

def preProcessing1(df: pd.DataFrame, y:pd.Series) -> pd.DataFrame:
  columns=df.columns

  #dropping column which are either irrelevant or around 50% of missing values
  df=df.drop(['encounter_id', 'hospital_id', 'icu_id', 'albumin_apache','bilirubin_apache','fio2_apache','paco2_apache','paco2_for_ph_apache','pao2_apache','ph_apache', 'urineoutput_apache'], axis=1)

  #separating categorical & numerical features
  df_cat = df.select_dtypes("object")
  df_num = df.select_dtypes("number")
  cat_cal = df_cat.columns
  num_cal = df_num.columns
  
  #imputing categorical features by mode
  impute_size1=SimpleImputer(strategy="most_frequent") 
  df_cat = impute_size1.fit_transform(df_cat)
  df_cat = pd.DataFrame(df_cat, columns = cat_cal)

  #imputing numerical features by mean
  impute_size2=SimpleImputer(missing_values=np.nan, strategy="mean") 
  df_num = impute_size2.fit_transform(df_num)

  df_num = pd.DataFrame(df_num, columns = num_cal)
  #concatenating both categorical & numerical features
#   df = df_cat.join(df_num)
#   return df, [impute_size1, impute_size2 ]
  
  #Standardization
  scaler = StandardScaler()
  scaled_df = scaler.fit_transform(df_num)
  df_num_scale = pd.DataFrame(data=scaled_df, columns=df_num.columns)

  #One Hot Encoding
  df_cat = pd.get_dummies(df_cat)
  df = df_cat.join(df_num_scale)

#   return df, [impute_size1, impute_size2, scaler, {"OHE":(cat_cal, df_cat_encod, onehotencoder)}]
  return df, [impute_size1, impute_size2, scaler, list(cat_cal)]



In [40]:
#!pip install tqdm
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from scipy import stats, special
import pandas as pd
import numpy as np
import os
import warnings
warnings.simplefilter("ignore")

# Ref: https://www.kaggle.com/shashankasubrahmanya/missing-data-imputation-using-regression

# filling missing values based on linear regression and the most correlated variables
# linear regression -> F-test (which columns are mostly related with the given target column, other than "diabetes")
# Multi-colinearity
# target = diabetes
# Remove features which has more than 50% percentage of missing values
# Return reduced dataset

# Linear Regression
def fillna_using_linear_model(df: pd.DataFrame):
    fea_cols=[]
    for col in df.columns:
        if df[col].dtype=='float64':
            fea_cols.append(col)

    correl = df[fea_cols].corr()

    for col in tqdm(fea_cols):
        nan_ratio = df[col].isnull().sum() / df.shape[0]
        if nan_ratio > 0:
            best_nan_ratio = nan_ratio
            best_col = None
            for id in correl.loc[(correl[col] > 0.7) | (correl[col] < -0.7), col].index:
                nan_temp_ratio = df[id].isnull().sum() / df.shape[0]
                if best_nan_ratio > nan_temp_ratio:
                    best_nan_ratio = nan_temp_ratio
                    best_col = id
            if best_col != None:
                sub = df[[col, best_col]].copy()
                sub = sub.dropna()
                reg = LinearRegression(fit_intercept=True).fit(np.expand_dims(sub[best_col], axis=1), sub[col])
                print(reg.score(np.expand_dims(sub[best_col], axis=1), sub[col]))
                if reg.score(np.expand_dims(sub[best_col], axis=1), sub[col])>0.7:
                    if df.loc[(~df[best_col].isnull()) & (df[col].isnull()), col].shape[0] > 0:
                        df.loc[(~df[best_col].isnull()) & (df[col].isnull()), col] = \
                        reg.predict(np.expand_dims(df.loc[(~df[best_col].isnull()) & (df[col].isnull()), best_col], axis=1))

    return df


def preProcessing2(df: pd.DataFrame, y:pd.Series) -> pd.DataFrame:
  columns=df.columns

# Replace values such as +,- ininity with nan
  df = df.replace([np.inf, -np.inf], np.nan)

# Linear regression
  linReg = fillna_using_linear_model(df)

# Simple Imputing
  imputer = SimpleImputer(strategy="mean")
  imputed_df = pd.DataFrame(imputer.fit_transform(linReg.values))

# Standardization
  scaler = StandardScaler()
  scaled_df = scaler.fit_transform(imputed_df)
  df= pd.DataFrame(data=scaled_df, columns=columns)
  
  return df, [imputer, scaler]

# dd = preProcessing2(medical_data.copy(), y)
# dd


In [41]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # MICE 
from sklearn.preprocessing import StandardScaler
# from discretization import MDLP
# from mdlp.discretization import MDLP

def preProcessing3(df: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
  columns = df.columns

  # Log transform for skewing data
  # https://stats.stackexchange.com/questions/267078/why-is-skewed-data-not-preferred-for-modelling
  # https://stats.stackexchange.com/questions/299154/the-benefit-of-unskewing-skewed-data
  logs_transform_list = ['d1_bilirubin_min', 'd1_bilirubin_max', 'd1_glucose_max', 'h1_bilirubin_max', 'h1_bilirubin_min', 'h1_bun_max', 'h1_bun_min']
  df[logs_transform_list] = np.log2(df[logs_transform_list])

  # Simple Imputing
  imputer = SimpleImputer(strategy="mean")
  imputed_df = imputer.fit_transform(df.values)

  # Standardization
  scaler = StandardScaler()
  imputed_scaled_df = scaler.fit_transform(imputed_df)

  df = pd.DataFrame(columns=columns, data=imputed_scaled_df)

  return df, [{"columns": logs_transform_list, "transform": np.log2}, imputer, scaler]

In [42]:
#ref: https://www.kaggle.com/lhagiimn/7th-place-solution-wids-2021
#ref:https://www.kaggle.com/letianyu/wids-2021-notebook

def remove_NaN_Values(df, threshold):
    # store the name of columns
    # drop the same for the test set (later)
    NaN_cols = []
    for col in df.columns:
        NaN_ratio = df[col].isnull().sum() / df.shape[0]
        if NaN_ratio >= threshold:
            NaN_cols.append(col)
    df = df.drop(NaN_cols, axis=1)
    return df,NaN_cols

def preProcessing4(df: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
    columns = df.columns

    Removed_NaN_df,NaN_cols = remove_NaN_Values(df,0.5)

    return Removed_NaN_df, NaN_cols


# Combine all pre processings 

In [43]:
df1 = medical_data.iloc[:, :45]
prep_df1, tf1 = preProcessing1(df1, y)
prep_df1.index = medical_data.index

df2 = medical_data.iloc[:, 45:90]
prep_df2, tf2 = preProcessing2(df2, y)
prep_df2.index = medical_data.index

df3 = medical_data.iloc[:, 90:135]
prep_df3, tf3 = preProcessing3(df3, y)
prep_df3.index = medical_data.index

df4 = medical_data.iloc[:, 135:]
prep_df4, tf4 = preProcessing4(df4, y)
prep_df4.index = prep_df1.index

prep_df = pd.concat([prep_df1, prep_df2, prep_df3, prep_df4], axis=1)

print(f'Percent of Nans: {round(prep_df.copy().isna().sum().sum()/len(prep_df.copy()), 2)}')

 29%|██▉       | 13/45 [00:00<00:00, 121.93it/s]

0.563520769020351
0.9933743343176455
0.9932092949406127
0.5556868832041206
0.6232299891476356
0.5630410484129751
0.6303842387953887
0.5152979908671131
0.9928207761805053
0.9913514557124004
0.5060654310356865
0.9694702204192752
0.9658512634599362


100%|██████████| 45/45 [00:00<00:00, 139.25it/s]


0.6155171814780587
0.49954907647888736
0.716253639395113
0.7618525388005946
0.972967652946108
0.9928344337233268
0.6277670731828182
Percent of Nans: 0.0


In [44]:
prep_df.head()

Unnamed: 0,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown,gender_F,gender_M,hospital_admit_source_Acute Care/Floor,hospital_admit_source_Chest Pain Center,...,h1_calcium_max,h1_calcium_min,h1_creatinine_max,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
1,0,0,1,0,0,0,0,1,0,0,...,4.610996e-15,9.092629e-15,-6.395302e-16,0,0,0,0,0,0,0
2,0,0,1,0,0,0,1,0,0,0,...,0.8457641,0.8572557,-1.391968,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,0,0,0,...,4.610996e-15,9.092629e-15,-6.395302e-16,0,0,0,0,0,0,0
4,0,0,1,0,0,0,1,0,0,0,...,4.610996e-15,9.092629e-15,-6.395302e-16,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,1,0,0,...,4.610996e-15,9.092629e-15,-6.395302e-16,0,0,0,0,0,0,0


In [45]:
from sklearn.model_selection import GridSearchCV, train_test_split
from scipy.sparse import csc_matrix

cat_cols = []
for each in prep_df.columns:
    for origin in tf1[-1]:
        if origin in each:
            cat_cols.append(each)

cat_df = prep_df[cat_cols]
num_df = prep_df.drop(cat_cols, axis=1)
prep_df = pd.concat([num_df, cat_df], axis=1)

X = csc_matrix(prep_df.values)
y_ = y.values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y_, stratify=y_, train_size=0.75, random_state=42)

In [49]:
# Grid Search
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# https://scikit-learn.org/stable/modules/model_evaluation.html#multimetric-scoring
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html#sphx-glr-auto-examples-model-selection-plot-multi-metric-evaluation-py

# Ensemble Methods
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html#sklearn.ensemble.StackingClassifier

# GridSearch + CV on non-sklearn models
# https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, precision_score
from sklearn import metrics, model_selection

# scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'F1-score': make_scorer(f1_score), 'Recall': make_scorer(recall_score), 'Precision': make_scorer(precision_score)}
scoring = {'AUC': 'roc_auc'}

def grid(estimator, scoring, cv, X, y, verbose):
    return GridSearchCV(
    estimator = estimator,
    param_grid = params,
    scoring = scoring,
    cv = cv,
    verbose = verbose,
    return_train_score = True,
    refit="AUC",
    n_jobs = -3 # If GPU, set to 1
    ).fit(X, y)


In [50]:
# acc = accuracy_score(prep_test, y_pred)
# f1 = f1_score(prep_test, y_pred)
# rec = recall_score(prep_test, y_pred)
# prec = precision_score(prep_test, y_pred)

In [51]:
# [Jaewoong] Convert to CLF
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/
# https://machinelearningmastery.com/use-keras-deep-learning-models-scikit-learn-python/
# https://stackoverflow.com/questions/63381301/how-many-neurons-should-be-in-the-last-layer-of-the-neural-network
# https://www.dlology.com/blog/how-to-choose-last-layer-activation-and-loss-function/
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from tensorflow.keras.models import * 
from tensorflow.keras.layers import * 
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD, Adagrad, Adam, Nadam

def return_optimizer(name):
    if name == "SGD":
        return SGD
    elif name == "Adagrad":
        return Adagrad
    elif name == "Adam":
        return Adam
    elif name == "Nadam":
        return Nadam

def buildmodel(optimizer, learn_rate, init_mode, activation, dropout_rate, neurons):
    K.clear_session()
    model = Sequential()
    model.add(Dense(neurons, input_dim=X_train.shape[1], kernel_initializer=init_mode, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons//2, kernel_initializer=init_mode, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons//3, kernel_initializer=init_mode, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons//4, kernel_initializer=init_mode, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer=init_mode, activation='sigmoid'))
    opt_object = return_optimizer(optimizer)
    model.compile(loss='binary_crossentropy', optimizer=opt_object(lr=learn_rate), metrics=['AUC'])
    return model
    
# params = {
#     "epochs": [100, 200, 300],
#     "optimizer": ['SGD', 'Adagrad', 'Adam',],
#     "learn_rate": [0.1, 0.01],
#     "init_mode": ['glorot_normal', 'he_normal', 'lecun_uniform'],
#     "activation": ['relu', 'tanh', 'sigmoid'],
#     "dropout_rate": [0.1, 0.2, 0.3],
#     "neurons": [10, 50, 100, 200],
#     "batch_size": [1000, 5000, 10000]
# }
params = {
    "epochs": [300],
    "optimizer": ['SGD', 'Adam',],
    "learn_rate": [0.1, 0.01],
    "init_mode": ['glorot_normal', 'he_normal'],
    "activation": ['relu'],
    "dropout_rate": [0.1, 0.3],
    "neurons": [50, 100],
    "batch_size": [5000]
}
# params = {
#     "epochs": [1],
#     "optimizer": ['SGD'],
#     "learn_rate": [0.1],
#     "init_mode": ['glorot_normal'],
#     "activation": ['relu'],
#     "dropout_rate": [0.1],
#     "neurons": [10],
#     "batch_size": [10000]
# }

In [52]:
# Check if n_jobs impacts GPU
estimator= KerasClassifier(build_fn=buildmodel, verbose=1)
estimator._estimator_type = "classifier"

In [53]:
keras_grid_search = grid(estimator, scoring, 3, X_train, y_train, 1)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300

In [54]:
keras_grid_search.best_score_

0.8403553637407537

In [55]:
keras_grid_search.best_params_

{'activation': 'relu',
 'batch_size': 5000,
 'dropout_rate': 0.1,
 'epochs': 300,
 'init_mode': 'glorot_normal',
 'learn_rate': 0.1,
 'neurons': 100,
 'optimizer': 'SGD'}

In [56]:
from sklearn.metrics import roc_auc_score
roc_auc_score(keras_grid_search.best_estimator_.predict(X_test), y_test)



0.7359420279684008

In [None]:
# Preprocess on test-set -> Evaluate
# scaler = scaler()
# prep_train = scaler.fit_transform(train)

# prep_test = scaler.transform(test)
# prep_test = scaler.fit_transform(test) # X -> Data Leakage

In [None]:
# Clustering
# WSS, BSS, Entropy, Purity
# Kmeans -> 3~5
# df.desrcibe() for each cluster

In [None]:
# #Preprocessing Test dataset

# test_df1 = test_df.iloc[:, :45]
# prep_tf1,tff = preProcessing1(test_df1, y)
# prep_tf1.index = test_df.index

# test_df2 = test_df.iloc[:, 45:90]
# prep_tf2,tff = preProcessing2(test_df2, y)
# prep_tf2.index = test_df.index

# test_df3 = test_df.iloc[:, 90:135]
# prep_tf3,tff = preProcessing3(test_df3, y)
# prep_tf3.index = test_df.index

# test_df4 = test_df.iloc[:,135:]
# prep_tf4,tff = preProcessing4(test_df4, y)
# prep_tf4.index = test_df.index

# test_df1 = test.iloc[:, :45]
# def test_prep1(df: pd.DataFrame, tf: list) -> pd.DataFrame:
#   for each in tf:
#     df = each.transform(df)

#   return df

# prep_tf = pd.concat([prep_tf1, prep_tf2, prep_tf3, prep_tf4], axis=1)
# prep_tf

# [Jaewoong]
# extract numberical columns
# discretize
# feature selection
# -> optimal subset of features


# [Uma]-Done
# Oversampling or Undersampling (=Resampling)
# negative class = 75%
# positive class = 25%

In [None]:
# #Preprocessing Test dataset

# test_df1 = test_df.iloc[:, :45]
# prep_tf1,tff = preProcessing1(test_df1, y)
# prep_tf1.index = test_df.index

# test_df2 = test_df.iloc[:, 45:90]
# prep_tf2,tff = preProcessing2(test_df2, y)
# prep_tf2.index = test_df.index

# test_df3 = test_df.iloc[:, 90:135]
# prep_tf3,tff = preProcessing3(test_df3, y)
# prep_tf3.index = test_df.index

# test_df4 = test_df.iloc[:,135:]
# prep_tf4,tff = preProcessing4(test_df4, y)
# prep_tf4.index = test_df.index

# prep_tf = pd.concat([prep_tf1, prep_tf2, prep_tf3, prep_tf4], axis=1)
# prep_tf

# df1 = df.iloc[:, :45]
# def train_prep1(df: pd.DataFrame) -> (pd.DataFrame, list):
#   # Examples
#   transformer1 = SimpleImputer()
#   df = transformer1.fit_transform(df)
#   transformer2 = StandardScaler()
#   df = transformer2.fit_transform(df)
#   # MDLP
#   # LinearRegression.fit_transform
#   # ...

#   return df, [transformer1, transformer2]
# prep_df1, tf1 = train_prep1(df1)

# # Do the same for 2~4...

# prep_df = pd.concat([prep_df1, prep_df2, prep_df3, prep_df4], axis=1)


# test_df1 = test.iloc[:, :45]
# def test_prep1(df: pd.DataFrame, tf: list) -> pd.DataFrame:
#   for each in tf:
#     df = each.transform(df)

#   return df
# prep_test_df1 = test_prep1(test_df1, tf1)
# # Do the same for 2~4...
# prep_test_df = pd.concat([prep_test_df1, prep_test_df2, prep_test_df3, prep_test_df4], axis=1)