In [15]:
import numpy as np
import pandas as pd
import os

# Location of the training CSV. Set the WIDS_TRAIN_CSV environment variable to
# run the notebook on another machine; the fallback is the original local path.
TRAIN_CSV_PATH = os.environ.get(
    "WIDS_TRAIN_CSV",
    "/Users/sakshamjain/Desktop/Projects/JAIN-WIN/widsdatathon2021/TrainingWiDS2021.csv",
)

# The first CSV column is consumed as the row index, not as a feature.
data = pd.read_csv(TRAIN_CSV_PATH, index_col=0)

# Widen pandas' display limits so long column/row listings are not truncated.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.width', 1000)
# Floats are shown right-aligned in a 20-character field, with thousands
# separators and two decimals.
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [16]:
# Per-column share of missing entries, expressed as a percentage of all rows
n_rows = len(data)
null_counts = data.isnull().sum()
missing_percentages = null_counts * 100 / n_rows

# Most incomplete columns first
missing_percentages_sorted = missing_percentages.sort_values(ascending=False)
print(missing_percentages_sorted)

h1_bilirubin_max                             92.09
h1_bilirubin_min                             92.09
h1_albumin_max                               91.43
h1_albumin_min                               91.43
h1_lactate_max                               91.02
h1_lactate_min                               91.02
h1_pao2fio2ratio_min                         87.12
h1_pao2fio2ratio_max                         87.12
h1_arterial_ph_max                           82.86
h1_arterial_ph_min                           82.86
h1_arterial_pco2_min                         82.72
h1_arterial_pco2_max                         82.72
h1_arterial_po2_max                          82.55
h1_arterial_po2_min                          82.55
h1_hco3_max                                  81.74
h1_hco3_min                                  81.74
h1_wbc_max                                   81.43
h1_wbc_min                                   81.43
h1_calcium_min                               81.38
h1_calcium_max                 

In [17]:
# Identify columns with more than 70% missing values and drop them
MISSING_THRESHOLD_PCT = 70
columns_to_drop = missing_percentages[missing_percentages > MISSING_THRESHOLD_PCT].index
data_cleaned = data.drop(columns=columns_to_drop)

# Persist the pruned table (row index not written) for the cells that read '1.csv'
data_cleaned.to_csv('1.csv', index=False)
print(data_cleaned.columns)

Index(['encounter_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source',
       ...
       'd1_arterial_po2_max', 'd1_arterial_po2_min', 'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', 'diabetes_mellitus'], dtype='object', length=125)


In [24]:
# Columns stored with dtype object are treated as the categorical features
categorical_columns = list(data.select_dtypes(include=['object']).columns)

# Number of distinct (non-missing) levels for each categorical feature of interest
cardinality = data[
    ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
].nunique()
for col, n_levels in cardinality.items():
    print(f"{col}: {n_levels}")

ethnicity: 6
gender: 2
hospital_admit_source: 15
icu_admit_source: 5
icu_stay_type: 3
icu_type: 8


Mean Imputer

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the column-pruned data set
data = pd.read_csv('1.csv')

# Integer-encode 'gender' through its category codes.
# NOTE(review): pandas assigns code -1 to missing values, so any row with a
# missing gender becomes an extra "-1" level that the imputer below never
# treats as missing -- confirm this is intended.
data['gender'] = data['gender'].astype('category').cat.codes

# One-hot encoding for other categorical columns
categorical_columns = ['ethnicity', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
data = pd.get_dummies(data, columns=categorical_columns)

# Splitting the data into train and test sets (identifier columns are not features)
X = data.drop(['diabetes_mellitus', 'encounter_id', 'hospital_id'], axis=1)
y = data['diabetes_mellitus'].astype(int)  # Ensure the target is integer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Mean imputation: the column means are learned on the training split only and
# then applied unchanged to the test split, so no test information leaks in.
# The original row index is kept so the frames stay aligned with y_train / y_test.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Scaling (fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed), columns=X_train_imputed.columns, index=X_train_imputed.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imputed), columns=X_test_imputed.columns, index=X_test_imputed.index
)

# Training the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_scaled)
y_prob = rf_classifier.predict_proba(X_test_scaled)[:, 1]  # probabilities for AUC

# Metrics
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(classification_rep)
print('AUC ROC:', roc_auc)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89     30574
           1       0.67      0.34      0.45      8474

    accuracy                           0.82     39048
   macro avg       0.75      0.65      0.67     39048
weighted avg       0.80      0.82      0.80     39048

AUC ROC: 0.8247452093504967


MICE imputer

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.experimental import enable_iterative_imputer  # This line enables the experimental features
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.tree import DecisionTreeClassifier


# Load the column-pruned data set
data = pd.read_csv('1.csv')

# Integer-encode 'gender' through its category codes.
# NOTE(review): pandas assigns code -1 to missing values, so any row with a
# missing gender becomes an extra "-1" level that the imputer below never
# treats as missing -- confirm this is intended.
data['gender'] = data['gender'].astype('category').cat.codes

# One-hot encoding for other categorical columns
categorical_columns = ['ethnicity', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
data = pd.get_dummies(data, columns=categorical_columns)

# Splitting the data into train and test sets (identifier columns are not features)
X = data.drop(['diabetes_mellitus', 'encounter_id', 'hospital_id'], axis=1)
y = data['diabetes_mellitus'].astype(int)  # Ensure the target is integer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MICE imputation: the imputer is fitted on the training split only and then
# applied unchanged to the test split, so no test information leaks in.
# The original row index is kept so the frames stay aligned with y_train / y_test.
mice_imputer = IterativeImputer(random_state=42, max_iter=10, n_nearest_features=10)
X_train_imputed = pd.DataFrame(mice_imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(mice_imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Scaling (fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed), columns=X_train_imputed.columns, index=X_train_imputed.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imputed), columns=X_test_imputed.columns, index=X_test_imputed.index
)

# Training the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_scaled)
y_prob = rf_classifier.predict_proba(X_test_scaled)[:, 1]  # probabilities for AUC

# Metrics
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(classification_rep)
print('AUC ROC:', roc_auc)



              precision    recall  f1-score   support

           0       0.84      0.95      0.89     30574
           1       0.66      0.35      0.46      8474

    accuracy                           0.82     39048
   macro avg       0.75      0.65      0.67     39048
weighted avg       0.80      0.82      0.80     39048

AUC ROC: 0.8235569985397327


SICE imputer (mean of MICE)

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Load the column-pruned data set
data = pd.read_csv('1.csv')

# Integer-encode 'gender' through its category codes.
# NOTE(review): pandas assigns code -1 to missing values, so any row with a
# missing gender becomes an extra "-1" level that the imputer below never
# treats as missing -- confirm this is intended.
data['gender'] = data['gender'].astype('category').cat.codes

# One-hot encoding for other categorical columns
categorical_columns = ['ethnicity', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
data = pd.get_dummies(data, columns=categorical_columns)

# Splitting the data into train and test sets (identifier columns are not features)
X = data.drop(['diabetes_mellitus', 'encounter_id', 'hospital_id'], axis=1)
y = data['diabetes_mellitus'].astype(int)  # Ensure the target is integer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the number of imputations
m = 10  # number of multiple imputations

# Running sums of the imputed matrices. Accumulating in place needs one
# (rows, cols) buffer per split instead of an (m, rows, cols) stack, and the
# resulting mean is the same.
sum_imputed_X_train = np.zeros(X_train.shape)
sum_imputed_X_test = np.zeros(X_test.shape)

# Run MICE m times; each run is fitted on the training split only and then
# applied to the test split.
# NOTE(review): with sample_posterior=False the runs differ only through the
# seed-dependent choice of n_nearest_features; scikit-learn's documentation
# recommends sample_posterior=True when IterativeImputer is used for multiple
# imputations -- confirm the current setting is intended.
for i in tqdm(range(m), desc="Imputing Data", unit="imputation"):
    imputer = IterativeImputer(random_state=i, max_iter=10, sample_posterior=False, n_nearest_features=10)
    sum_imputed_X_train += imputer.fit_transform(X_train)
    sum_imputed_X_test += imputer.transform(X_test)

# Calculate the mean of the imputations
mean_imputed_X_train = sum_imputed_X_train / m
mean_imputed_X_test = sum_imputed_X_test / m

# Convert back to DataFrame, keeping the original row index so the frames stay
# aligned with y_train / y_test
X_train_imputed = pd.DataFrame(mean_imputed_X_train, columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(mean_imputed_X_test, columns=X_test.columns, index=X_test.index)

# Scaling (fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed), columns=X_train_imputed.columns, index=X_train_imputed.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imputed), columns=X_test_imputed.columns, index=X_test_imputed.index
)

# Training the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_scaled)
y_prob = rf_classifier.predict_proba(X_test_scaled)[:, 1]  # probabilities for AUC

# Metrics
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(classification_rep)
print('AUC ROC:', roc_auc)

Imputing Data: 100%|██████████| 10/10 [03:41<00:00, 22.18s/imputation]


              precision    recall  f1-score   support

           0       0.84      0.95      0.89     30574
           1       0.67      0.35      0.46      8474

    accuracy                           0.82     39048
   macro avg       0.75      0.65      0.68     39048
weighted avg       0.80      0.82      0.80     39048

AUC ROC: 0.8219820252480511


MICE with SMOTE

In [46]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.experimental import enable_iterative_imputer  # This line enables the experimental features
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the column-pruned data set
data = pd.read_csv('1.csv')

# Integer-encode 'gender' through its category codes.
# NOTE(review): pandas assigns code -1 to missing values, so any row with a
# missing gender becomes an extra "-1" level that the imputer below never
# treats as missing -- confirm this is intended.
data['gender'] = data['gender'].astype('category').cat.codes

# One-hot encoding for other categorical columns
categorical_columns = ['ethnicity', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
data = pd.get_dummies(data, columns=categorical_columns)

# Splitting the data into train and test sets (identifier columns are not features)
X = data.drop(['diabetes_mellitus', 'encounter_id', 'hospital_id'], axis=1)
y = data['diabetes_mellitus'].astype(int)  # Ensure the target is integer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MICE imputation: the imputer is fitted on the training split only and then
# applied unchanged to the test split, so no test information leaks in.
# The original row index is kept so the frames stay aligned with y_train / y_test.
mice_imputer = IterativeImputer(random_state=42, max_iter=10, n_nearest_features=10)
X_train_imputed = pd.DataFrame(mice_imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(mice_imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Scaling (fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed), columns=X_train_imputed.columns, index=X_train_imputed.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imputed), columns=X_test_imputed.columns, index=X_test_imputed.index
)

# Oversample the minority class with SMOTE -- training split only, so the test
# split keeps its natural class balance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Training the Random Forest Classifier on the resampled training data
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Predictions
y_pred = rf_classifier.predict(X_test_scaled)
y_prob = rf_classifier.predict_proba(X_test_scaled)[:, 1]  # probabilities for AUC

# Metrics
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(classification_rep)
print('AUC ROC:', roc_auc)



              precision    recall  f1-score   support

           0       0.88      0.88      0.88     30574
           1       0.57      0.56      0.56      8474

    accuracy                           0.81     39048
   macro avg       0.72      0.72      0.72     39048
weighted avg       0.81      0.81      0.81     39048

AUC ROC: 0.8290821798712168


GAIN code

In [4]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()


import numpy as np
from tqdm import tqdm

from utils import normalization, renormalization, rounding
from utils import xavier_init
from utils import binary_sampler, uniform_sampler, sample_batch_index


def gain(data_x, gain_parameters):
  '''Impute missing values in data_x with a generator/discriminator (GAIN) network.

  A generator fills in the missing entries and a discriminator tries to tell,
  per entry, which values were observed and which were generated. After
  training, the generator output replaces the missing entries only; observed
  entries are passed through.

  Args:
    - data_x: 2-D float array (rows x features); missing values are NaN
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate (passed to binary_sampler for the hint mask)
      - alpha: Weight of the reconstruction (MSE) term in the generator loss
      - iterations: Number of training iterations

  Returns:
    - imputed_data: imputed data, after the `renormalization` and `rounding`
      helpers from utils have been applied
  '''
  # Mask matrix: 1 where a value is observed, 0 where it is missing
  data_m = 1-np.isnan(data_x)

  # System parameters
  batch_size = gain_parameters['batch_size']
  hint_rate = gain_parameters['hint_rate']
  alpha = gain_parameters['alpha']
  iterations = gain_parameters['iterations']

  # Other parameters
  no, dim = data_x.shape

  # Hidden state dimensions
  h_dim = int(dim)

  # Normalization
  norm_data, norm_parameters = normalization(data_x)
  # Zero-fill the missing entries. `nan` must be passed by keyword: the second
  # positional parameter of np.nan_to_num is `copy`, so `nan_to_num(norm_data, 0)`
  # would request an in-place conversion instead of setting the fill value.
  norm_data_x = np.nan_to_num(norm_data, nan=0.0)

  ## GAIN architecture
  # Input placeholders
  # Data vector
  X = tf.placeholder(tf.float32, shape = [None, dim])
  # Mask vector
  M = tf.placeholder(tf.float32, shape = [None, dim])
  # Hint vector
  H = tf.placeholder(tf.float32, shape = [None, dim])

  # Discriminator variables
  D_W1 = tf.Variable(xavier_init([dim*2, h_dim])) # Data + Hint as inputs
  D_b1 = tf.Variable(tf.zeros(shape = [h_dim]))

  D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  D_b2 = tf.Variable(tf.zeros(shape = [h_dim]))

  D_W3 = tf.Variable(xavier_init([h_dim, dim]))
  D_b3 = tf.Variable(tf.zeros(shape = [dim]))  # Multi-variate outputs

  theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

  # Generator variables
  # Data + Mask as inputs (Random noise is in missing components)
  G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))
  G_b1 = tf.Variable(tf.zeros(shape = [h_dim]))

  G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  G_b2 = tf.Variable(tf.zeros(shape = [h_dim]))

  G_W3 = tf.Variable(xavier_init([h_dim, dim]))
  G_b3 = tf.Variable(tf.zeros(shape = [dim]))

  theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

  ## GAIN functions
  # Generator
  def generator(x,m):
    # Concatenate Mask and Data
    inputs = tf.concat(values = [x, m], axis = 1)
    G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
    G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
    # Sigmoid keeps the output in (0, 1)
    G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
    return G_prob

  # Discriminator
  def discriminator(x, h):
    # Concatenate Data and Hint
    inputs = tf.concat(values = [x, h], axis = 1)
    D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
    D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
    D_logit = tf.matmul(D_h2, D_W3) + D_b3
    D_prob = tf.nn.sigmoid(D_logit)
    return D_prob

  ## GAIN structure
  # Generator
  G_sample = generator(X, M)

  # Combine with observed data: keep observed entries, take generated ones elsewhere
  Hat_X = X * M + G_sample * (1-M)

  # Discriminator
  D_prob = discriminator(Hat_X, H)

  ## GAIN loss
  # The 1e-8 terms keep the logarithms finite when a probability hits 0 or 1
  D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                + (1-M) * tf.log(1. - D_prob + 1e-8))

  G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))

  # Reconstruction error on the observed entries only
  MSE_loss = \
  tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

  D_loss = D_loss_temp
  G_loss = G_loss_temp + alpha * MSE_loss

  ## GAIN solver
  D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
  G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

  ## Iterations
  # The context manager closes the session (and releases its resources) even
  # if training raises.
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for _ in tqdm(range(iterations)):

      # Sample batch
      batch_idx = sample_batch_index(no, batch_size)
      X_mb = norm_data_x[batch_idx, :]
      M_mb = data_m[batch_idx, :]
      # Size the noise and hint samples from the rows actually drawn, so the
      # shapes still agree if the sampler returns fewer than batch_size rows
      # (e.g. when data_x has fewer rows than batch_size).
      mb_rows = X_mb.shape[0]
      # Sample random vectors
      Z_mb = uniform_sampler(0, 0.01, mb_rows, dim)
      # Sample hint vectors
      H_mb_temp = binary_sampler(hint_rate, mb_rows, dim)
      H_mb = M_mb * H_mb_temp

      # Combine random vectors with observed vectors
      X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

      # One discriminator step, then one generator step, on the same mini-batch
      sess.run(D_solver, feed_dict = {M: M_mb, X: X_mb, H: H_mb})
      sess.run(G_solver, feed_dict = {X: X_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

    imputed_data = sess.run(G_sample, feed_dict = {X: X_mb, M: M_mb})

  # Observed entries keep their (normalized) values; only missing ones are filled
  imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data

  # Renormalization
  imputed_data = renormalization(imputed_data, norm_parameters)

  # Rounding
  imputed_data = rounding(imputed_data, data_x)

  return imputed_data

In [42]:
import pandas as pd
import numpy as np

# Load the column-pruned data set and drop the identifier columns
data = pd.read_csv('1.csv')
data = data.drop(['encounter_id', 'hospital_id'], axis=1)

# Integer-encode 'gender' through its category codes.
# NOTE(review): pandas assigns code -1 to missing values, so a missing gender
# is no longer NaN after this line and does not count as "missing" in the
# split below -- confirm this is intended.
data['gender'] = data['gender'].astype('category').cat.codes
categorical_columns = ['ethnicity', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
data = pd.get_dummies(data, columns=categorical_columns)

# get_dummies may emit boolean indicator columns; cast them to 0/1 integers.
# A column-wise dtype cast replaces the element-wise applymap (slow on a frame
# this size and deprecated in recent pandas).
bool_columns = data.select_dtypes(include='bool').columns
data = data.astype({column: int for column in bool_columns})

# Flag rows that contain at least one missing value
has_missing = data.isna().any(axis=1)

# Rows with no missing values are saved as test data
clean_data = data[~has_missing]
clean_data.to_csv('test_data_no_missing_values.csv', index=False)

# Every remaining row (at least one missing value) is saved as training data.
# The boolean mask selects the same rows, in the same order, as an anti-join
# against clean_data, without merging on every column.
rest_data = data[has_missing]
assert len(clean_data) + len(rest_data) == len(data)
rest_data.to_csv('training_data.csv', index=False)

In [7]:
import pandas as pd
import numpy as np
# Rows that still contain missing values, written by the split cell
data = pd.read_csv('training_data.csv')

# Keep the label aside: only the feature matrix is handed to GAIN
target = data['diabetes_mellitus']
features = data.drop(columns=['diabetes_mellitus'])
data_x = features.to_numpy()

# GAIN hyper-parameters
gain_parameters = dict(
    batch_size=128,
    hint_rate=0.9,
    alpha=100,
    iterations=10000,
)

# Fill in the missing feature values with GAIN
imputed_data = gain(data_x, gain_parameters)

# Rebuild a labelled DataFrame: imputed features first, target appended last
imputed_df = pd.DataFrame(imputed_data, columns=features.columns).assign(diabetes_mellitus=target)

# Write the imputed training set to disk
imputed_df.to_csv('training_data_imputed.csv', index=False)


2024-08-24 16:10:25.865380: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-24 16:10:25.865400: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-08-24 16:10:25.878031: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
  0%|          | 0/10000 [00:00<?, ?it/s]2024-08-24 16:10:25.978396: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-24 16:10:26.115845: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_ty

: 