In [1]:
import keras.backend as K
import tensorflow as tf
import math
import numpy as np
import pandas as pd
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tensorflow.keras.layers import Input, Dense, Reshape, Concatenate, Layer, Dropout
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, Flatten,LeakyReLU,ReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import RMSprop, Adam
from functools import partial
from gumbel_softmax import GumbelSoftmax

from IPython.core.interactiveshell import InteractiveShell
# Raise the pandas display limit so frames of up to 2000 rows are shown in full.
pd.options.display.max_rows = 2000

In [2]:
# Enable on-demand GPU memory allocation.
# The original indexed `[0]` directly, which raises IndexError on a machine
# without a visible GPU; iterating makes the cell a no-op in that case and
# applies the same setting to every visible GPU.
gpus = tf.config.experimental.get_visible_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device = device, enable = True)

# `gpu` keeps its original meaning (the first visible GPU); None when there is none.
gpu = gpus[0] if gpus else None

# Load the data

In [3]:
# --- Arrays stored as .npy files ---
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects; only use it on files from a trusted source.
y_train_seq = np.load('Data/y_train_seq.npy', allow_pickle=True)
y_test_seq = np.load('Data/y_test_seq.npy', allow_pickle=True)
y_train_SC_seq = np.load('Data/y_train_SC_seq.npy', allow_pickle=True)
y_test_SC_seq = np.load('Data/y_test_SC_seq.npy', allow_pickle=True)

# --- Tables stored as CSV files ---
y_train_nseq = pd.read_csv('Data/y_train_nseq.csv')
y_test_nseq = pd.read_csv('Data/y_test_nseq.csv')
y_train_SC_nseq = pd.read_csv('Data/y_train_SC_nseq.csv')
y_test_SC_nseq = pd.read_csv('Data/y_test_SC_nseq.csv')

# Wide-format conditioning tables (consumed by wide_to_long / convert_to_embedding).
x_train_cond = pd.read_csv('Data/x_train_cond.csv')
x_test_cond = pd.read_csv('Data/x_test_cond.csv')

## Qualitative attributes in the raw small-scale complete data
x_train_cond_R = pd.read_csv('Data/train_complete_qualitative.csv')

# Data preprocessing

* Define function and parameters

In [4]:
## Attribute cardinalities, column offsets and embedding widths

# Number of categories per attribute: the six qualitative columns of the raw
# frame (columns[1:7]) followed by five attributes with six levels each
# (the columns that wide_to_long names TP_0 .. TP_4).
n_col = [x_train_cond_R[name].nunique() for name in x_train_cond_R.columns[1:7]] + [6] * 5

# Cumulative offsets: attribute i occupies wide-format columns
# n_uni_col[i] : n_uni_col[i+1].
n_uni_col = np.cumsum([0] + n_col)

# Output width of the Embedding layer built for each attribute.
emb_col = [4,2,2,9,2,4,4,4,4,4,4]

# Wide-format column labels that wide_to_long samples from.
col_pop = x_test_cond.columns
BATCH_SIZE = 256


def wide_to_long(samples_pop):
    """Sample one category label per attribute from wide-format probabilities.

    Every row of ``samples_pop`` is split into consecutive column groups
    delimited by the module-level offsets ``n_uni_col``. For each group one
    label is drawn from the matching slice of ``col_pop`` with
    ``np.random.choice``, using the row's values in that group as the
    probability vector. Draws happen row by row, attribute by attribute, so
    the sequence of random calls is the same as in the previous implementation.

    Parameters
    ----------
    samples_pop : numpy.ndarray or pandas.DataFrame
        2-D table with one row per sample and ``n_uni_col[-1]`` columns.
        Each column group must be a valid probability vector, as required by
        ``np.random.choice``.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one categorical column per attribute: the
        names of ``x_train_cond_R.columns[1:7]`` followed by TP_0 .. TP_4.
        An empty input yields an empty frame with those columns.
    """
    # Convert once up front: evaluating `.values` inside the row loop repeats
    # the frame-to-array conversion for every single row.
    if isinstance(samples_pop, np.ndarray):
        values = samples_pop
    else:
        values = samples_pop.values

    n_attr = len(n_uni_col) - 1

    # Collect plain Python lists and build the frame once, instead of growing
    # NumPy arrays with np.append / np.concatenate on every iteration.
    rows = []
    for sam in values:
        picked = []
        for i in range(n_attr):
            idx = range(n_uni_col[i], n_uni_col[i + 1])
            picked.append(np.random.choice(col_pop[idx], p=sam[idx], size=1)[0])
        rows.append(picked)

    columns = x_train_cond_R.columns[1:7].to_list() + ["TP_0","TP_1","TP_2","TP_3","TP_4"]
    resamples = pd.DataFrame(rows, columns=columns)
    # Column-wise conversion: each column gets its own set of categories.
    return resamples.astype('category')

* Transform the data into Label form

In [5]:
## Training Embedding Networks
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Draw one label per attribute from each wide-format conditioning table.
# wide_to_long samples with np.random.choice and no seed is set in the visible
# cells, so the resulting frames differ between runs. The two calls share the
# global NumPy RNG, so their order determines which draws each frame receives.
x_Bert_train = wide_to_long(x_train_cond)
x_Bert_test = wide_to_long(x_test_cond)


def Multi_LabelEncoder(x_Bert_train,x_Bert_test):
    """Integer-encode every column of two frames with train-fitted encoders.

    For each column of ``x_Bert_test`` a fresh ``LabelEncoder`` is fit on the
    same-named column of ``x_Bert_train`` and then used to transform that
    column in both frames. The inputs are not modified; encoded copies are
    returned as ``(train, test)``.
    """
    encoded_train = x_Bert_train.copy()
    encoded_test = x_Bert_test.copy()

    for name in x_Bert_test.columns:
        encoder = LabelEncoder()
        encoder.fit(x_Bert_train[name])
        encoded_train[name] = encoder.transform(x_Bert_train[name])
        encoded_test[name] = encoder.transform(x_Bert_test[name])

    return encoded_train, encoded_test

# Integer-encode both sampled frames column by column; the encoders are fit on
# the training frame only and reused for the test frame.
x_Bert_train_lab,x_Bert_test_lab = Multi_LabelEncoder(x_Bert_train,x_Bert_test)

* Define the function for BERT networks

In [6]:
# NOTE(review): neither constant is referenced in the visible cells. The
# trailing values (256, 30000) look like the settings of a text-BERT setup that
# these replaced — confirm before relying on them.
MAX_LEN = 11 #256
VOCAB_SIZE = 16 #30000


def get_masked_input_and_labels(x_Bert_train_lab, mask_tokens=None):
    """Build BERT-style masked inputs, targets and sample weights.

    About 15% of the cells are selected at random. Of the selected cells about
    90% are overwritten with the mask token of their column; the rest keep
    their original value. The targets are a copy of the unmasked input, and
    the sample weights are 1.0 on selected cells and 0.0 elsewhere.

    Parameters
    ----------
    x_Bert_train_lab : array-like
        2-D table of integer labels (rows = samples, columns = attributes).
    mask_tokens : sequence of int, optional
        Mask value per column. Defaults to ``n_col[i] - 1`` for column ``i``,
        the value the original implementation wrote.
        NOTE(review): that default appears to coincide with the largest id the
        matching Embedding layer accepts — confirm it does not clash with a
        real category.

    Returns
    -------
    tuple of numpy.ndarray
        ``(masked_inputs, y_labels, sample_weights)``, each with the shape of
        the input.
    """
    x_Bert_train_lab = np.array(x_Bert_train_lab)
    if mask_tokens is None:
        mask_tokens = [n - 1 for n in n_col]

    # 15% BERT masking: cells that contribute to the loss.
    inp_mask = np.random.rand(*x_Bert_train_lab.shape) < 0.15

    # Of the selected cells, 90% are replaced by the mask token and 10% are
    # left unchanged. (Same order of random draws as before.)
    inp_mask_2mask = inp_mask & (np.random.rand(*x_Bert_train_lab.shape) < 0.90)

    # Write the per-column mask token straight from the boolean mask. The
    # previous version first wrote the sentinel 16 and then searched for the
    # value 16, which also rewrote genuine labels equal to 16.
    x_Bert_train_lab_masked = np.copy(x_Bert_train_lab)
    for i in range(x_Bert_train_lab_masked.shape[1]):
        x_Bert_train_lab_masked[inp_mask_2mask[:, i], i] = mask_tokens[i]

    # Sample weights to pass to .fit(): only selected cells count.
    sample_weights = inp_mask.astype(np.float64)

    # Targets are the original, unmasked labels.
    y_labels = np.copy(x_Bert_train_lab)

    return x_Bert_train_lab_masked, y_labels, sample_weights

# Modeling

* Define the Masked Language Model (BERT)

In [7]:
# Define Masked Language Model
# Loss created with Reduction.NONE, i.e. the losses are returned unreduced;
# MaskedLanguageModel.train_step calls it with the optional sample weights.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)
# Module-level running mean of the loss, updated in train_step and exposed
# through MaskedLanguageModel.metrics.
loss_tracker = tf.keras.metrics.Mean(name="loss")


class MaskedLanguageModel(tf.keras.Model):
    """Keras model with a custom training step that honours sample weights.

    Relies on the module-level ``loss_fn`` and ``loss_tracker`` objects.
    """

    def train_step(self, inputs):
        """Run one optimisation step and return the running mean loss.

        ``inputs`` is either ``(features, labels, sample_weight)`` or
        ``(features, labels)``; in the latter case no weighting is applied.
        """
        sample_weight = None
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs

        # Forward pass recorded on the tape so gradients can be taken.
        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            loss = loss_fn(labels, predictions, sample_weight=sample_weight)

        # Backward pass and weight update.
        weights = self.trainable_variables
        grads = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        # Track the loss with the shared metric object.
        loss_tracker.update_state(loss, sample_weight=sample_weight)

        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        """Metric objects Keras should reset automatically.

        Listing ``loss_tracker`` here lets Keras reset it at the start of each
        epoch and of ``evaluate()``; otherwise it would have to be reset by
        hand.
        """
        return [loss_tracker]

* Create the BERT model

In [8]:
def create_embed_model():
    """Build and compile the embedding-based masked model.

    For every column of ``x_Bert_train_lab`` the model has one integer input
    of shape ``(1,)`` feeding an ``Embedding`` layer (``n_col[i]`` ids,
    ``emb_col[i]`` output units, named ``embedding_<i>``). The embeddings are
    concatenated, passed through two 32-unit ReLU layers, and fanned out into
    one softmax head per column (``n_col[i]`` units, named ``out<i>``).

    Returns
    -------
    tf.keras.Model
        Model named ``masked_model``, compiled with Adam and sparse
        categorical cross-entropy.
    """
    n_attr = x_Bert_train_lab.shape[1]

    inputs = []
    embeddings = []
    for i in range(n_attr):
        k_embedding_input = Input((1,),dtype=tf.int64)
        inputs.append(k_embedding_input)

        # Built-in str() replaces np.str, an alias deprecated in NumPy 1.20
        # and removed in 1.24; the layer names are unchanged.
        k_embedding = Embedding(input_dim = n_col[i],
                                output_dim = emb_col[i],
                                name = 'embedding_' + str(i))(k_embedding_input)
        embeddings.append(k_embedding)

    embeddings = Concatenate()(embeddings)

    dense1 = Dense(32,activation='relu')(embeddings)
    dense2 = Dense(32,activation='relu')(dense1)

    # One softmax head per column, sized to that column's number of ids.
    outputs = []
    for i in range(n_attr):
        k_output = Dense(n_col[i],activation='softmax',name='out' + str(i))(dense2)
        outputs.append(k_output)

    mlm_model = tf.keras.Model(inputs,outputs,name="masked_model")
    optimizer = tf.keras.optimizers.Adam()
    mlm_model.compile(optimizer=optimizer,loss="SparseCategoricalCrossentropy")

    return mlm_model

#bert_masked_model = create_masked_language_bert_model()
# Build and compile the embedding-based model; nothing is trained by this call.
bert_masked_model = create_embed_model()

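* Training the BERT model (no training cell is included in this notebook; the cells below load the saved model from `MLM_Embed_indiv.h5`)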

# Evaluate the BERT model (Meaningless)

In [10]:
from sklearn import metrics
x_masked_test, y_masked_labels, sample_weights = get_masked_input_and_labels(
    x_Bert_test_lab
)

# Take the sizes from the data instead of hard-coding 6005 rows / 11 columns,
# so the cell keeps working when the test set changes.
n_test_rows, n_attributes = x_masked_test.shape

# One 1-D array per attribute, matching the model's list of inputs/outputs.
x_test = [x_masked_test.transpose()[i] for i in range(n_attributes)]
y_labels = [y_masked_labels.transpose()[i] for i in range(n_attributes)]
weights = [sample_weights.transpose()[i] for i in range(n_attributes)]


# Load pretrained bert model
mlm_model = tf.keras.models.load_model(
    "MLM_Embed_indiv.h5")
gen_imgs = mlm_model.predict(x_test)

# Flatten each head's prediction to (rows, classes) before stacking them
# side by side into the wide format expected by wide_to_long.
for i in range(len(gen_imgs)):
    gen_imgs[i] = np.reshape(gen_imgs[i],(n_test_rows,-1))

gen_imgs = np.concatenate(gen_imgs,axis=1)
resamples = wide_to_long(gen_imgs)


# Mean ROC AUC over the first six attributes, computed only on the cells that
# were selected for masking (weight == 1).
# NOTE(review): `cat.codes` index the categories present in x_Bert_test, while
# the label encoding is fit on the training frame; the two agree only if both
# frames contain the same categories — confirm.
roc_auc_list = []
for i in range(6):
    r1 = gen_imgs[:,range(n_uni_col[i],n_uni_col[i+1])]
    r1 = r1[weights[i]==1]
    s1 = x_Bert_test.iloc[:,i][weights[i]==1].cat.codes
    if s1.max() > 1:
        # More than two observed classes: one-vs-rest AUC over all columns.
        roc_auc_list.append(metrics.roc_auc_score(s1,r1,multi_class="ovr"))
    else:
        # Binary case: score with the probability of class 1.
        roc_auc_list.append(metrics.roc_auc_score(s1,r1[:,1]))
print(np.mean(roc_auc_list))

0.7252875014053181


In [11]:
# Load pretrained bert model
mlm_model = tf.keras.models.load_model(
    "MLM_Embed_indiv.h5")

# NOTE(review): positional slice — assumes layers 11..21 of the loaded model
# are the eleven Embedding layers (presumably preceded by eleven input layers,
# as in create_embed_model); confirm against the saved file.
embedding_layers =  mlm_model.layers[11:22]
# Wide-format test conditioning table as a NumPy array, built from a copy of the frame.
samples = np.array(x_test_cond.copy())


def convert_to_embedding(samples):
    """Project wide-format columns through the stored embedding matrices.

    For each layer in the module-level ``embedding_layers`` the matching
    column group of ``samples`` (delimited by ``n_uni_col``) is multiplied by
    that layer's first weight array; the per-attribute results are then
    concatenated along the column axis.
    """
    projected = []
    for k, layer in enumerate(embedding_layers):
        table = layer.get_weights()[0]
        start, stop = n_uni_col[k], n_uni_col[k + 1]
        block = samples[:, start:stop]
        projected.append(np.dot(block, table))

    return np.concatenate(projected, axis=1)

# Embedding-space representation of the wide-format test conditioning table.
samples_emb = convert_to_embedding(samples)