In [6]:
import warnings
# Silence library warnings before the heavier third-party imports run.
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Remote location of the labelled training CSV (not referenced by the visible cells).
url_1 = 'https://raw.githubusercontent.com/takanju/wids_datathon_2021/master/TrainingWiDS2021.csv'
# Local copy of the training CSV; this is the path the loading cell reads.
train_local_path = './data/TrainingWiDS2021.csv'
# Remote location of the unlabeled CSV (not referenced by the visible cells).
url_2 = 'https://raw.githubusercontent.com/takanju/wids_datathon_2021/master/UnlabeledWiDS2021.csv'
# Local copy of the unlabeled CSV; this is the path the loading cell reads.
test_local_path = './data/UnlabeledWiDS2021.csv'

In [7]:
# Load the unlabeled (test) and labelled (train) CSVs from their local copies,
# using the first CSV column as the index.
# The target is split off from the training frame so that the same
# preprocessing can be applied to both train and test.
# Ref : https://www.kaggle.com/siavrez/2020fatures
test = pd.read_csv(test_local_path, index_col=0, error_bad_lines=False)
medical_data = pd.read_csv(train_local_path, index_col=0, error_bad_lines=False)
# pop() removes the target column from medical_data and returns it in one step.
y = medical_data.pop("diabetes_mellitus")

In [14]:
medical_data.shape

(130157, 179)

In [15]:
# Percentage of missing values 
# Is it showing 179 columns?
# No.. it gives for 60 as passed in argument ..
# pd.DataFrame(medical_data.isna().sum()*100/len(medical_data))

# Pre processing by Aishwarya

In [16]:
def preProcessing1(df):
    """Drop identifier columns and fill missing demographic/admission values.

    Removes 'encounter_id', 'hospital_id', 'icu_id' and 'urineoutput_apache',
    fills 'age', 'bmi', 'height' and 'weight' with their column mean, and
    fills the six categorical columns with their most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named above. It is not modified;
        `drop` returns a new frame that the fills are applied to.

    Returns
    -------
    tuple
        ``(filled_frame, [])``. The list is the (empty) collection of fitted
        transformers, matching the return shape of the other preProcessingN
        functions in this notebook.
    """
    df = df.drop(['encounter_id', 'hospital_id', 'icu_id', 'urineoutput_apache'], axis=1)

    # Preprocessing for age, height, weight, bmi.
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: the in-place form acts on a temporary Series and is not
    # guaranteed to reach the frame (it has no effect under copy-on-write).
    for col in ['age', 'bmi', 'height', 'weight']:
        df[col] = df[col].fillna(df[col].mean())

    # Preprocessing of categorical variables: fill with the most frequent value.
    for col in ['ethnicity', 'gender', 'hospital_admit_source',
                'icu_admit_source', 'icu_stay_type', 'icu_type']:
        modes = df[col].mode()
        # mode() is empty when the whole column is missing; leave it untouched
        # in that case rather than failing on .iloc[0].
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # TODO: One Hot Encoding
    # TODO: Scaler
    # TODO: apache_2_diagnosis <- impute..., all other numerical features

    return df, []

# d=preProcessing1(medical_data_reduced.copy())
# d

# [What kind of numerical columns you used]

In [17]:
cat_cols = ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']

def preProcessing1(df: pd.DataFrame) -> (pd.DataFrame, list):
    """Mean-impute and standardize the numerical columns of the first slice.

    Drops the identifier columns and 'urineoutput_apache', splits the rest
    into the six categorical columns and everything else, mean-imputes and
    standardizes the non-categorical part, and re-attaches the categorical
    columns unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the dropped columns and the six categorical columns.

    Returns
    -------
    tuple
        ``(prepared_frame, [imputer, scaler])``. The frame has a fresh
        0..n-1 index, numerical columns first and categorical columns last.
        The fitted transformers are returned so they can be re-applied later.
    """
    df = df.drop(['encounter_id', 'hospital_id', 'icu_id', 'urineoutput_apache'], axis=1)
    # Kept local on purpose: the module-level cat_cols list is extended by a
    # later cell, which must not change what this function treats as categorical.
    cat_cols = ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']

    num_df = df.loc[:, ~df.columns.isin(cat_cols)]
    print(num_df.shape)
    num_cols = num_df.columns
    cat_df = df.loc[:, cat_cols]
    print(cat_df.shape)

    # Simple Imputing
    imputer = SimpleImputer(strategy="mean")
    imputed_df = imputer.fit_transform(num_df.values)
    print('Impute Completed')
    print(imputed_df.shape)

    # Standardization
    scaler = StandardScaler()
    imputed_scaled_df = scaler.fit_transform(imputed_df)
    print('Standardization Completed')
    print(imputed_scaled_df.shape)

    # Build the numerical frame from the *scaled* values. The previous version
    # used the unscaled array here, so the returned features were never
    # standardized even though the fitted scaler was handed back.
    num_df = pd.DataFrame(columns=num_cols, data=imputed_scaled_df)
    print(num_df.shape)

    # num_df already has a 0..n-1 index; give cat_df the same one so the
    # column-wise concat pairs rows by position.
    cat_df = cat_df.reset_index(drop=True)
    df = pd.concat([num_df, cat_df], axis=1)
    print(df.shape)

    return df, [imputer, scaler]

# Pre processing by Anjali

In [18]:
#!pip install tqdm
from sklearn.linear_model import LinearRegression, LassoCV
from tqdm import tqdm
import pickle
from scipy import stats, special
import pandas as pd
import numpy as np
import os
import warnings
warnings.simplefilter("ignore")

# Ref: https://www.kaggle.com/shashankasubrahmanya/missing-data-imputation-using-regression

# filling missing values based on linear regression and the most correlated variables
# linear regression -> F-test (which columns are mostly related with the given target column, other than "diabetes")
# Multi-colinearity
# target = diabetes
# Remove features which has more than 50% percentage of missing values
# Return reduced dataset

# def drop_NaN_Values(df, threshold):
#     NaN_cols = []
#     for col in df.columns:
#         NaN_ratio = df[col].isnull().sum() / df.shape[0]
#         if NaN_ratio >= threshold:
#             NaN_cols.append(col)
#     df = df.drop(NaN_cols, axis=1)
#     return df

# medical_data_reduced = drop_NaN_Values(medical_data.copy(), 0.5)

# medical_data_reduced.shape

# Doesn't fill 100%. We need to fill so or drop null rows
def fillna_using_linear_model(df):
    """Fill missing float64 values from a strongly correlated column.

    For each float64 column with missing values, the function looks for
    another float64 column whose correlation with it is above 0.7 or below
    -0.7 and that has a strictly smaller share of missing values. A
    one-variable linear regression is fitted on the rows where both are
    present; if its R^2 exceeds 0.7, the missing values are replaced by the
    regression prediction wherever the predictor is available.

    Not every gap is filled: columns without a suitable predictor, or rows
    where the predictor is missing too, are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    tuple
        ``(df, regs)`` where ``regs`` holds the last regression that was
        fitted, or is empty when no column qualified for a fit.
    """
    fea_cols = [col for col in df.columns if df[col].dtype == 'float64']
    correl = df[fea_cols].corr()
    n_rows = df.shape[0]

    # Stays None when no regression is ever fitted; previously `reg` was
    # simply unbound in that case and the return statement raised.
    reg = None

    for col in tqdm(fea_cols):
        nan_ratio = df[col].isnull().sum() / n_rows
        if not nan_ratio > 0:
            continue

        # Among the strongly correlated columns, pick the one with the fewest gaps.
        best_nan_ratio = nan_ratio
        best_col = None
        strongly_correlated = correl.loc[(correl[col] > 0.7) | (correl[col] < -0.7), col].index
        for candidate in strongly_correlated:
            candidate_ratio = df[candidate].isnull().sum() / n_rows
            if best_nan_ratio > candidate_ratio:
                best_nan_ratio = candidate_ratio
                best_col = candidate
        if best_col is None:
            continue

        sub = df[[col, best_col]].dropna()
        predictor = np.expand_dims(sub[best_col], axis=1)
        reg = LinearRegression(fit_intercept=True).fit(predictor, sub[col])
        if reg.score(predictor, sub[col]) > 0.7:
            # Rows where the target is missing but the predictor is known.
            fillable = (~df[best_col].isnull()) & (df[col].isnull())
            if fillable.any():
                df.loc[fillable, col] = reg.predict(
                    np.expand_dims(df.loc[fillable, best_col], axis=1)
                )

    return df, ([reg] if reg is not None else [])

# def new_features(df):
#   df["d1_heartrate_max_min_diff"] = df["d1_heartrate_max"] - df["d1_heartrate_min"]
#   return df


def preProcessing2(df: pd.DataFrame, y:pd.Series) -> pd.DataFrame:
    """Regression-impute `df`, then standardize every column.

    Calls fillna_using_linear_model on `df` and fits a StandardScaler on the
    result. `y` is accepted but not used. Returns the standardized frame
    (with a fresh default index) together with the list of transformers:
    whatever the imputation step returned, followed by the fitted scaler.
    """
    feature_names = df.columns
    filled, transformers = fillna_using_linear_model(df)

    # Standardization
    scaler = StandardScaler()
    standardized = pd.DataFrame(data=scaler.fit_transform(filled), columns=feature_names)
    transformers.append(scaler)

    # df_imp = new_features(df)
    return standardized, transformers

# dd = preProcessing2(medical_data_reduced.copy())
# dd
# [numerical columns]

In [19]:
def preProcessing2(df: pd.DataFrame) -> (pd.DataFrame, list):
    """Mean-impute and standardize every column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose values can all be handled by SimpleImputer(strategy="mean").

    Returns
    -------
    tuple
        ``(prepared_frame, [imputer, scaler])``. The frame keeps the input
        column names and gets a default 0..n-1 index. The fitted transformers
        are returned so they can be re-applied later.
    """
    columns = list(df.columns)

    # Simple Imputing
    imputer = SimpleImputer(strategy="mean")
    imputed_df = imputer.fit_transform(df.values)
    print('Impute Completed')

    # Standardization
    scaler = StandardScaler()
    imputed_scaled_df = scaler.fit_transform(imputed_df)
    print('Standardization Completed')

    # Return the *scaled* values. The previous version wrapped the unscaled
    # array, so the features were never standardized even though the fitted
    # scaler was handed back.
    df = pd.DataFrame(columns=columns, data=imputed_scaled_df)

    return df, [imputer, scaler]

# Pre processing by Jae Woong

In [20]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # MICE 
from sklearn.preprocessing import StandardScaler
from discretization import MDLP
# from mdlp.discretization import MDLP

def preProcessing3(df: pd.DataFrame) -> (pd.DataFrame, list):
    """Log-transform the skewed lab columns, then mean-impute and standardize.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the seven columns listed in `logs_transform_list`.
        It is not modified; the function works on a copy.

    Returns
    -------
    tuple
        ``(prepared_frame, steps)``. The frame keeps the input column names
        and gets a default 0..n-1 index. ``steps`` is a dict describing the
        log transform (column names and the function applied), followed by
        the fitted imputer and the fitted scaler.
    """
    columns = list(df.columns)

    # Log transform for skewed data
    # https://stats.stackexchange.com/questions/267078/why-is-skewed-data-not-preferred-for-modelling
    # https://stats.stackexchange.com/questions/299154/the-benefit-of-unskewing-skewed-data
    logs_transform_list = ['d1_bilirubin_min', 'd1_bilirubin_max', 'd1_glucose_max', 'h1_bilirubin_max', 'h1_bilirubin_min', 'h1_bun_max', 'h1_bun_min']
    # Copy first: the caller passes a slice of the raw frame, and writing the
    # logs into it would transform the same values again on every re-run.
    df = df.copy()
    df[logs_transform_list] = np.log2(df[logs_transform_list])
    print('Log transformation Completed')

    # Simple Imputing
    imputer = SimpleImputer(strategy="mean")
    imputed_df = imputer.fit_transform(df.values)
    print('Impute Completed')

    # Standardization
    scaler = StandardScaler()
    imputed_scaled_df = scaler.fit_transform(imputed_df)
    print('Standardization Completed')

    # Return the *scaled* values. The previous version wrapped the unscaled
    # array, so the features were never standardized even though the fitted
    # scaler was handed back.
    df = pd.DataFrame(columns=columns, data=imputed_scaled_df)

    return df, [{"columns": logs_transform_list, "transform": np.log2}, imputer, scaler]

# Pre processing by Uma

In [21]:
#ref: https://www.kaggle.com/lhagiimn/7th-place-solution-wids-2021
#ref:https://www.kaggle.com/letianyu/wids-2021-notebook
#generating new features to improve the score
import pandas as pd
# OHE for categorical columns - Not performing One Hot Encoding as all features are in binary 0/1.
#No numerical features


def remove_NaN_Values(df, threshold=0.5):
    """Return `df` without the columns that are too sparse.

    A column is dropped when its share of missing values is greater than or
    equal to `threshold`. The input frame is left unchanged.

    Note: the names of the dropped columns are not returned, so the caller
    has to work them out separately to drop the same columns from the test set.
    """
    missing_share = df.isnull().sum() / df.shape[0]
    sparse_cols = missing_share[missing_share >= threshold].index.tolist()
    return df.drop(sparse_cols, axis=1)

def preProcessing4(df: pd.DataFrame, y: pd.Series) -> (pd.DataFrame, list):
    """Drop the columns of `df` that are mostly missing.

    Delegates to remove_NaN_Values with its default threshold. `y` is
    accepted but not used.

    Returns
    -------
    tuple
        ``(reduced_frame, [remove_NaN_Values])``: the reduced frame and a
        one-element list holding the function that was applied.
    """
    # Operate on the argument. The previous version read the notebook-level
    # variable `df4`, which only worked when the caller happened to have
    # defined that exact name beforehand.
    Removed_NaN_df = remove_NaN_Values(df)

    return Removed_NaN_df, [remove_NaN_Values]

# df4 = medical_data.iloc[:, 135:]
# target = 'diabetes_mellitus'
# df4.drop(target, axis=1)
# prep_df4, tf4 = preProcessing4(df4, target)

# prep_df4, tf4 = prep4(df4)

# prep_df = pd.concat([prep_df1, prep_df2, prep_df3, prep_df4], axis=1)


# Combine all pre processings 

In [22]:
# Aishwarya
df1 = medical_data.iloc[:, :45]
prep_df1, tf1 = preProcessing1(df1)

(130157, 35)
(130157, 6)
Impute Completed
(130157, 35)
(130157, 35)
(130157, 41)


In [23]:
# Anjali
df2 = medical_data.iloc[:, 45:90]
prep_df2, tf2 = preProcessing2(df2)

Impute Completed


In [24]:
# Jaewoong
df3 = medical_data.iloc[:, 90:135]
prep_df3, tf3 = preProcessing3(df3)

Log transformation Completed
Impute Completed


In [25]:
# Uma
df4 = medical_data.iloc[:, 135:]
prep_df4, tf4 = preProcessing4(df4, y)

In [43]:
# Register the columns of prep_df4 as categorical. Only names that are not
# already present are added, so re-running this cell does not append
# duplicates (the previous `+=` grew the list on every run).
new_cat_cols = [c for c in prep_df4.columns if c not in cat_cols]
cat_cols.extend(new_cat_cols)

In [26]:
prep_df1.shape

(130157, 41)

In [27]:
prep_df2.shape

(130157, 45)

In [28]:
prep_df3.shape

(130157, 45)

In [29]:
prep_df4.shape

(130157, 7)

In [30]:
# Give every part a plain 0..n-1 index, derived from its own length rather
# than a hard-coded row count, so this keeps working when the data size changes.
for each in [prep_df1, prep_df2, prep_df3, prep_df4]:
    each.index = range(len(each))

In [31]:
prep_df = pd.concat([prep_df1, prep_df2, prep_df3, prep_df4], axis=1)
prep_df.shape

(130157, 138)

In [32]:
# Drop every row of the combined frame that still contains a missing value,
# then restrict the target and the four parts to the surviving row labels.
# NOTE(review): y is selected by label, so this assumes y's index labels match
# the 0..n-1 labels assigned to the parts -- confirm against the CSV index.
# NOTE(review): this cell mutates prep_df and rebinds y and the parts, so it
# depends on the state left by the preceding cells.
prep_df.dropna(inplace=True)
kept_rows = prep_df.index
y = y.loc[kept_rows]
prep_df1, prep_df2, prep_df3, prep_df4 = [
    part.loc[kept_rows] for part in (prep_df1, prep_df2, prep_df3, prep_df4)
]
prep_df.shape

(95559, 138)

In [33]:
y.shape

(95559,)

In [34]:
prep_df1.shape

(95559, 41)

In [35]:
prep_df4.shape

(95559, 7)

In [36]:
all(prep_df.index == y.index)

True

# binning, and then feature selection
# or use feature importance from random forest
# or dimensionality reduction?

In [44]:
# Numerical columns = the non-categorical columns of part 1 plus all columns
# of parts 2 and 3.
prep_df1_num_columns = set(prep_df1.columns) - set(cat_cols)
# Walk prep_df1's columns in frame order instead of listing the set directly:
# set iteration order for strings can change between kernel sessions, which
# made the order of numerical_columns irreproducible.
numerical_columns = (
    [c for c in prep_df1.columns if c in prep_df1_num_columns]
    + list(prep_df2.columns)
    + list(prep_df3.columns)
)
len(numerical_columns)

125

In [45]:
len(cat_cols)

13

In [46]:
prep_df.shape[1]

138

In [104]:
# Feature selection with fcbf() against the target y. Judging by the outputs
# of the next two cells, `feature_set` is a list of (feature, score) pairs and
# `history` maps each kept feature to the features compared against it.
# NOTE(review): `prep_discretized_df2` is not defined in any visible cell, so
# this cell fails on a fresh kernel -- confirm where the discretized frame is built.
from fcbf import fcbf
feature_set, history = fcbf(prep_discretized_df2, y, threshold=0, base=2, is_debug=True)

ethnicity
gender
hospital_admit_source
icu_admit_source
icu_stay_type
icu_type
aids
cirrhosis
hepatic_failure
immunosuppression
leukemia
lymphoma
solid_tumor_with_metastasis
original features =  [('icu_stay_type', 0.0007429549601396547), ('icu_type', 0.0003386539843056739), ('hospital_admit_source', 0.00016717473775491575), ('icu_admit_source', 0.00011462636317424939), ('solid_tumor_with_metastasis', 0.00010795488941162129), ('aids', 7.380556352327899e-05), ('immunosuppression', 4.4247773043782035e-05), ('ethnicity', 3.434455366694478e-05), ('cirrhosis', 3.042511006912982e-05), ('hepatic_failure', 1.9298580778469002e-05), ('leukemia', 1.798427075723074e-05), ('gender', 1.2275541790275845e-05), ('lymphoma', 1.6333667428365178e-07)]
	 Fj =  icu_stay_type
icu_type
		 Fi =  icu_type
			 (Redundant) SUij =  0.00538165094703889 SUic 0.0003386539843056739
hospital_admit_source
		 Fi =  hospital_admit_source
			 (Redundant) SUij =  0.00332776046027142 SUic 0.00016717473775491575
icu_admit_sour

In [105]:
feature_set

[('icu_stay_type', 0.0007429549601396547),
 ('solid_tumor_with_metastasis', 0.00010795488941162129)]

In [106]:
history

{'icu_stay_type': [{'icu_type': 0.00538165094703889},
  {'hospital_admit_source': 0.00332776046027142},
  {'icu_admit_source': 0.01272207418438055},
  {'aids': 0.0002885737592956732},
  {'immunosuppression': 0.0002506784746738779},
  {'ethnicity': 0.00165734533279555},
  {'cirrhosis': 8.837129786647152e-05},
  {'leukemia': 0.00015226968805869912},
  {'gender': 3.612542080280297e-05},
  {'lymphoma': 0.0004759941463592165}],
 'solid_tumor_with_metastasis': [{'hepatic_failure': 0.00022697795477858422}]}

In [None]:
# Pseudo code
# df1 = df.iloc[:, :45]
# prep_df1, tf1 = prep1(df1)

# df2 = df.iloc[:, 45:90]
# prep_df2, tf2 = prep2(df2)

# df3 = df.iloc[:, 90:135]
# prep_df3, tf3 = prep3(df3)

# df4 = df.iloc[:, 135:]
# prep_df4, tf4 = prep4(df4)

# prep_df = pd.concat([prep_df1, prep_df2, prep_df3, prep_df4], axis=1)


# test_df1 = test_df.iloc[:, :45]
# prep_test_df1 = test_prep1(test_df1, tf1)

# test_df2 = test_df.iloc[:, 45:90]
# prep_test_df2 = test_prep2(test_df2, tf2)

# test_df3 = test_df.iloc[:, 90:135]
# prep_test_df3 = test_prep3(test_df3, tf3)

# test_df4 = test_df.iloc[:, 135:]
# prep_test_df4 = test_prep4(test_df4, tf4)

# prep_test_df = pd.concat([prep_test_df1, prep_test_df2, prep_test_df3, prep_test_df4], axis=1)

In [None]:
df1 = df.iloc[:, :45]
def train_prep1(df: pd.DataFrame) -> (pd.DataFrame, list):
    """Example train-time preprocessing: fit an imputer and a scaler on `df`.

    Returns
    -------
    tuple
        ``(prepared_frame, [imputer, scaler])``. The frame keeps the input
        index, and the input column names when the transformers preserved the
        number of columns. The fitted transformers are returned so that the
        test set can be transformed with them.
    """
    # Examples
    transformer1 = SimpleImputer()
    transformer2 = StandardScaler()
    values = transformer2.fit_transform(transformer1.fit_transform(df))
    # MDLP
    # LinearRegression.fit_transform
    # ...

    # The transformers return a bare array; wrap it back into a DataFrame so
    # the result matches the annotation and can be passed to pd.concat.
    # Column names are only reused when the column count is unchanged.
    columns = df.columns if values.shape[1] == len(df.columns) else None
    prepared = pd.DataFrame(values, index=df.index, columns=columns)

    return prepared, [transformer1, transformer2]
prep_df1, tf1 = train_prep1(df1)

# Do the same for 2~4...



In [None]:
prep_df = pd.concat([prep_df1, prep_df2, prep_df3, prep_df4], axis=1)


test_df1 = test.iloc[:, :45]
def test_prep1(df: pd.DataFrame, tf: list) -> pd.DataFrame:
  for each in tf:
    df = each.transform(df)

  return df
prep_test_df1 = test_prep1(test_df1, tf1)
# Do the same for 2~4...
prep_test_df = pd.concat([prep_test_df1, prep_test_df2, prep_test_df3, prep_test_df4], axis=1)

In [None]:
# For test data
# Before imputing, train and test must have the same columns, so the target is
# deleted from train and kept separately as y.
# That split is done at the top of the notebook. (@Jaewoong: your preprocessing
# used the target value -- please adjust the target name in your function.)
# NOTE(review): every preProcessingN defined in this notebook returns a
# (frame, transformers) tuple, so `test1.copy()` below is called on a tuple --
# confirm and unpack the frame first.
# NOTE(review): these calls fit new imputers/scalers on the test data instead
# of applying the ones fitted on train -- confirm this is not intended.

test1 = preProcessing1(test.copy())
test2 = preProcessing2(test1.copy())
test3 = preProcessing3(test2.copy())
#test4 = preProcessing4(test3.copy())

In [None]:
target = 'diabetes_mellitus'
# Plot the class balance from `y`: the target column was removed from
# medical_data when the data was loaded, so medical_data[target] no longer exists.
y.value_counts().plot(kind="pie", explode=[0,0.1], autopct="%.2f", labels=["No","Yes"])
# NOTE(review): `plt` is not imported in any visible cell -- confirm that
# matplotlib.pyplot is imported as plt before this cell runs.
plt.show()

In [None]:
# The dataset is skewed towards class 0, so balance it by upsampling class 1.
# The target column was removed from medical_data at load time and lives in
# `y`; join it back by index label. The inner join keeps only the rows that
# still have a label in `y`.
labelled = medical_data.join(y, how="inner")
df_majority = labelled[labelled['diabetes_mellitus']==0]
df_minority = labelled[labelled['diabetes_mellitus']==1]

# Upsampling
df_minority_upsampled = resample(df_minority,
                                 replace=True,                 # sample with replacement
                                 n_samples=len(df_majority),   # match the majority class, whatever its size
                                 random_state= 303)            # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.diabetes_mellitus.value_counts()

In [None]:
# [Jaewoong]
# extract numberical columns
# discretize
# feature selection
# -> optimal subset of features


# [Uma]-Done
# Oversampling or Undersampling (=Resampling)
# negative class = 75%
# positive class = 25%

In [None]:
# [Aishwarya, Anjali]
# Training our models
# (algorithms(+hparams), CV, gridsearch)

In [None]:
# Preprocess on test-set -> Evaluate
# scaler = scaler()
# prep_train = scaler.fit_transform(train)

# prep_test = scaler.transform(test)
# prep_test = scaler.fit_transform(test) # X -> Data Leakage

In [None]:
# Clustering
# WSS, BSS, Entropy, Purity