Using a GAN (GAIN) to impute missing data in the training set; the test set consists of non-missing values only.

In [12]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()


import numpy as np
from tqdm import tqdm

from utils import normalization, renormalization, rounding
from utils import xavier_init
from utils import binary_sampler, uniform_sampler, sample_batch_index


def gain(data_x, gain_parameters, model_path):
  '''Train GAIN on data_x, save the trained networks and impute data_x.

  Args:
    - data_x: 2-D numpy array (samples x features); missing entries are NaN
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Weight of the MSE reconstruction term in the generator loss
      - iterations: Number of training iterations
    - model_path: Checkpoint path passed to tf.train.Saver.save after training

  Returns:
    - imputed_data: data_x with its missing entries filled in
    - norm_parameters: second return value of normalization(data_x), needed
      to scale other data the same way before reusing the saved model
  '''
  # Mask matrix: 1 where a value is observed, 0 where it is missing
  data_m = 1 - np.isnan(data_x)

  # System parameters
  batch_size = gain_parameters['batch_size']
  hint_rate = gain_parameters['hint_rate']
  alpha = gain_parameters['alpha']
  iterations = gain_parameters['iterations']

  # Other parameters
  no, dim = data_x.shape

  # Hidden state dimensions
  h_dim = int(dim)

  # Normalization; missing entries become 0. The fill value is passed by
  # keyword because the second positional argument of nan_to_num is `copy`.
  norm_data, norm_parameters = normalization(data_x)
  norm_data_x = np.nan_to_num(norm_data, nan=0.0)

  ## GAIN architecture
  # None of the variables below is given an explicit name, so TensorFlow
  # names them 'Variable', 'Variable_1', ... in creation order. Building in
  # a private graph keeps that numbering identical on every call:
  #   discriminator -> 'Variable'   .. 'Variable_5'
  #   generator     -> 'Variable_6' .. 'Variable_11'
  # These are the names stored in the checkpoint.
  graph = tf.Graph()
  with graph.as_default():
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape = [None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape = [None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape = [None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim*2, h_dim])) # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape = [h_dim]))

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape = [h_dim]))

    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape = [dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape = [h_dim]))

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape = [h_dim]))

    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape = [dim]))

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
      # Concatenate Mask and Data
      inputs = tf.concat(values = [x, m], axis = 1)
      G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
      G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
      # Sigmoid keeps the output in the [0, 1] range of the normalized data
      G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
      return G_prob

    # Discriminator
    def discriminator(x, h):
      # Concatenate Data and Hint
      inputs = tf.concat(values = [x, h], axis = 1)
      D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
      D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
      D_logit = tf.matmul(D_h2, D_W3) + D_b3
      D_prob = tf.nn.sigmoid(D_logit)
      return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)

    # Combine with observed data
    Hat_X = X * M + G_sample * (1-M)

    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss (1e-8 keeps the logarithms finite)
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))

    G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))

    # Reconstruction error on the observed entries only
    MSE_loss = \
    tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    # Saves every variable of this graph (both networks and optimizer state)
    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

  # The context manager releases the session's resources on exit,
  # including when training raises.
  with tf.Session(graph=graph) as sess:
    sess.run(init_op)

    # Start Iterations
    for _ in tqdm(range(iterations)):

      # Sample batch
      batch_idx = sample_batch_index(no, batch_size)
      X_mb = norm_data_x[batch_idx, :]
      M_mb = data_m[batch_idx, :]
      # Sample random vectors
      Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
      # Sample hint vectors
      H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
      H_mb = M_mb * H_mb_temp

      # Combine random vectors with observed vectors
      X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

      # One discriminator step followed by one generator step
      batch_feed = {X: X_mb, M: M_mb, H: H_mb}
      sess.run(D_solver, feed_dict = batch_feed)
      sess.run(G_solver, feed_dict = batch_feed)

    saver.save(sess, model_path)

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

    imputed_data = sess.run(G_sample, feed_dict = {X: X_mb, M: M_mb})

  # Keep observed values, use generator output only where data was missing
  imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data

  # Renormalization
  imputed_data = renormalization(imputed_data, norm_parameters)

  # Rounding
  imputed_data = rounding(imputed_data, data_x)

  return imputed_data, norm_parameters

In [19]:
def impute_with_trained_model(test_data, model_path, norm_parameters):
    '''Impute missing values in test_data with a generator saved by gain().

    Args:
      - test_data: 2-D numpy array (samples x features); missing entries are
        NaN. The feature count must match the data the model was trained on.
      - model_path: checkpoint path that was passed to gain()
      - norm_parameters: dict with 'min_val' and 'max_val', as returned by
        gain()

    Returns:
      - final_data: test_data with missing entries replaced by the
        generator's output; observed entries are returned unchanged (apart
        from whatever rounding() does to them)

    Raises:
      - ValueError: the checkpoint holds fewer auto-named variables than the
        generator needs
    '''
    # Dimensions of data
    no, dim = test_data.shape
    h_dim = int(dim)

    # Rebuild the graph
    tf.reset_default_graph()

    # Define placeholders
    X = tf.placeholder(tf.float32, shape=[None, dim])
    M = tf.placeholder(tf.float32, shape=[None, dim])

    # Generator variables, created in the same order as during training
    G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    generator_vars = [G_W1, G_b1, G_W2, G_b2, G_W3, G_b3]

    def generator(x, m):
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    G_sample = generator(X, M)

    def _auto_index(name):
        # Position encoded in an auto-generated variable name
        # ('Variable' -> 0, 'Variable_7' -> 7); None for any other name,
        # e.g. optimizer slots such as 'Variable_7/Adam'.
        if name == 'Variable':
            return 0
        prefix, _, suffix = name.partition('_')
        if prefix == 'Variable' and suffix.isdigit():
            return int(suffix)
        return None

    # gain() leaves its variables unnamed and creates the discriminator
    # BEFORE the generator, so in the checkpoint 'Variable'..'Variable_5' are
    # discriminator weights. The variables rebuilt above carry exactly those
    # names (and, since h_dim == dim, the same shapes), so a default Saver()
    # would silently load the discriminator into the generator. Map the
    # generator onto the LAST six auto-named checkpoint entries instead.
    auto_named = {}
    for ckpt_name, _ in tf.train.list_variables(model_path):
        index = _auto_index(ckpt_name)
        if index is not None:
            auto_named[index] = ckpt_name
    ordered_names = [auto_named[index] for index in sorted(auto_named)]
    if len(ordered_names) < len(generator_vars):
        raise ValueError(
            'checkpoint %r does not contain the generator variables' % (model_path,))
    generator_names = ordered_names[-len(generator_vars):]
    saver = tf.train.Saver(dict(zip(generator_names, generator_vars)))

    # Normalize test data with the training-set parameters.
    # NOTE(review): assumes 'max_val' is the raw column maximum. If
    # utils.normalization stores something else there (e.g. the range after
    # subtracting the minimum), this scaling differs from training - confirm.
    epsilon = 1e-10
    value_range = norm_parameters['max_val'] - norm_parameters['min_val'] + epsilon
    norm_data_x = np.nan_to_num(
        (test_data - norm_parameters['min_val']) / value_range, nan=0.0)
    missing = np.isnan(test_data)
    data_m = 1 - missing

    # Small random noise goes into the missing positions, as in training
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    X_mb = data_m * norm_data_x + (1 - data_m) * Z_mb

    with tf.Session() as sess:
        # Restore the generator weights
        saver.restore(sess, model_path)
        imputed_data = sess.run(G_sample, feed_dict={X: X_mb, M: data_m})

    # The generator output is in normalized units: bring it back to the
    # original scale first, THEN splice in the observed values. Renormalizing
    # after mixing in the raw observations would rescale them a second time.
    final_data = renormalization(imputed_data, norm_parameters)
    final_data = np.where(missing, final_data, test_data)
    final_data = rounding(final_data, test_data)

    return final_data


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load and preprocess data
df = pd.read_csv('1.csv').drop(columns=['hospital_id'])

# Select features and target columns
X = df.drop(columns=['diabetes_mellitus', 'encounter_id'])  # Dropping the target and identifier from the features
y = df[['diabetes_mellitus', 'encounter_id']]

# Series.cat.codes encodes a missing category as -1. Turn those back into
# NaN so that GAIN sees gender as missing and imputes it, instead of
# treating -1 as an observed third category.
gender_codes = X['gender'].astype('category').cat.codes.astype('float64')
X['gender'] = gender_codes.where(gender_codes >= 0)

categorical_columns = ['ethnicity', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
X = pd.get_dummies(X, columns=categorical_columns)

# Cast boolean columns (the dummies, on recent pandas) to integers in one
# column-wise pass rather than calling a Python lambda on every cell.
bool_columns = X.select_dtypes(include='bool').columns
X = X.astype({column: int for column in bool_columns})


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# NOTE(review): a single iteration looks like a smoke-test setting; the
# networks are barely updated before they are saved and used. Confirm the
# intended number of training iterations.
gain_parameters = {
    'batch_size': 128,
    'hint_rate': 0.9,
    'alpha': 100,
    'iterations': 1
}

# Convert to numpy array for GAIN
train_data = X_train.to_numpy()

# Path to save model weights
model_path = 'trained_model.ckpt'

imputed_data, norm_parameters = gain(train_data, gain_parameters, model_path)

# A DataFrame built from a numpy array already has a 0..n-1 index
imputed_df = pd.DataFrame(data=imputed_data, columns=X_train.columns)

# Give the targets the same 0..n-1 index so the assignments below align row by row
y_train = y_train.reset_index(drop=True)

# Add the target and identifier columns back
imputed_df['diabetes_mellitus'] = y_train['diabetes_mellitus']
imputed_df['encounter_id'] = y_train['encounter_id']

# Save the DataFrame with added target column to a CSV file
imputed_df.to_csv('imputed_data_train.csv', index=False)

2024-08-24 15:49:05.532555: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-24 15:49:05.532572: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-08-24 15:49:05.541394: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
  0%|          | 0/1 [00:00<?, ?it/s]2024-08-24 15:49:05.613346: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-24 15:49:05.745116: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type G

In [21]:
# Impute the held-out features with the generator trained above
imputed_test_data = impute_with_trained_model(
    X_test.to_numpy(),
    model_path,
    norm_parameters,
)

# A frame built from a numpy array starts with a fresh 0..n-1 index
imputed_test_df = pd.DataFrame(imputed_test_data, columns=X_test.columns)

# Drop the shuffled index of the targets so rows line up positionally
y_test.reset_index(drop=True, inplace=True)

# Re-attach the target and the identifier
for label_column in ('diabetes_mellitus', 'encounter_id'):
    imputed_test_df[label_column] = y_test[label_column]

# Persist the imputed test set together with its labels
imputed_test_df.to_csv('imputed_data_test.csv', index=False)

INFO:tensorflow:Restoring parameters from trained_model.ckpt


2024-08-24 15:49:12.488295: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-24 15:49:12.488318: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-08-24 15:49:12.491075: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-24 15:49:12.650337: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
