In [10]:
import numpy as np
import sys
import tensorflow as tf
from keras.losses import Loss
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
import pandas as pd
import datetime
import time
import io
from sklearn.model_selection import train_test_split
import scipy.io as sio

In [11]:
def load_data(dataset, kappa, exp_id, original_X=False, extra_str="",
              data_dir='/content/drive/MyDrive/Causal_network_matching/data/',
              test_size=0.2, random_state=None):
    """Load one simulated replication from a .mat file and split it into train/test dicts.

    The file is read from ``<data_dir>/<dataset><kappa>/<dataset><exp_id>.mat`` and must
    contain the keys ``'Y1'``, ``'Y0'``, ``'T'`` and ``'X_100'`` (or ``'Attributes'`` when
    ``original_X`` is true).

    Args:
        dataset: Dataset name (string) used in both the folder and the file name.
        kappa: Value appended to the dataset name to form the folder name.
        exp_id: Replication index appended to the dataset name to form the file name.
        original_X: If true use ``data['Attributes']`` as covariates, else ``data['X_100']``.
        extra_str: Currently unused; kept so existing callers do not break.
        data_dir: Root folder that contains the ``<dataset><kappa>`` folders.
        test_size: Fraction of units held out, forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; ``None`` keeps the previous
            behaviour of drawing the split from NumPy's global random state.

    Returns:
        ``(data_train, data_test)``.
        ``data_train`` has float32 entries ``'x'``, ``'y'``, ``'t'``, ``'mu_0'``, ``'mu_1'``
        (all but ``'x'`` shaped ``(n, 1)``), the fitted ``'y_scaler'`` and the standardised
        outcome ``'ys'``.
        ``data_test`` has ``'x'``, ``'mu_0'``, ``'mu_1'`` and the same ``'y_scaler'``.
    """
    import os  # local import: the notebook's import cell does not bring in `os`

    path = os.path.join(data_dir, dataset + str(kappa), str(dataset) + str(exp_id) + '.mat')
    print(path)
    data = sio.loadmat(path)

    X = data['Attributes'] if original_X else data['X_100']
    # Convert to a plain ndarray. `.todense()` would return the discouraged np.matrix type
    # and would fail outright if the covariates happen to be stored densely.
    X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    T = data['T'].flatten()
    mu_1 = data['Y1'].flatten()
    mu_0 = data['Y0'].flatten()

    # Factual outcome: Y1 for treated units (T == 1), Y0 for everyone else.
    Y_observed = np.where(T == 1, mu_1, mu_0)

    (X_train, X_test,
     Y_factual_train, _,
     T_train, _,
     mu_0_train, mu_0_test,
     mu_1_train, mu_1_test) = train_test_split(
        X, Y_observed, T, mu_0, mu_1, test_size=test_size, random_state=random_state)

    def _as_column(values):
        # float32 column vector of shape (n, 1)
        return values.astype('float32').reshape(-1, 1)

    data_train = {
        'x': X_train.astype('float32'),
        'y': _as_column(Y_factual_train),
        't': _as_column(T_train),
        'mu_0': _as_column(mu_0_train),
        'mu_1': _as_column(mu_1_train),
    }

    # Standardise y (zero mean, unit variance on the training outcomes); the scaler is kept
    # so predictions can be mapped back to the original outcome scale.
    data_train['y_scaler'] = StandardScaler().fit(data_train['y'])
    data_train['ys'] = data_train['y_scaler'].transform(data_train['y'])

    data_test = {
        'x': X_test.astype('float32'),
        'mu_0': _as_column(mu_0_test),
        'mu_1': _as_column(mu_1_test),
    }
    # The test split reuses the scaler fitted on the training outcomes.
    data_test['y_scaler'] = data_train['y_scaler']

    return data_train, data_test

In [12]:
def pdist2sq(x,y):
    """Pairwise squared Euclidean distances between the rows of x and the rows of y.

    Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, so entry (i, j) of the
    result is the squared distance between x[i] and y[j]. Both inputs must be rank-2
    (the transposes use an explicit two-axis permutation).
    """
    row_norms_x = tf.reduce_sum(tf.pow(x, 2), axis=-1, keepdims=True)
    row_norms_y = tf.reduce_sum(tf.pow(y, 2), axis=-1, keepdims=True)
    # The factor 2 is applied to x before the matrix product.
    doubled_cross = tf.matmul(2. * x, tf.transpose(y, perm=(1, 0)))
    return row_norms_x + tf.transpose(row_norms_y, perm=(1, 0)) - doubled_cross

class CFRNet_Loss(Loss):
    """Factual regression loss plus an RBF-kernel MMD^2 balancing penalty.

    Expected layouts (taken from the column slicing below):
      * ``concat_true``: column 0 is the outcome, column 1 the treatment indicator.
      * ``concat_pred``: column 0 is the y0 prediction, column 1 the y1 prediction,
        and the remaining columns are the representation phi.
    """

    def __init__(self, alpha=1., sigma=1.):
        """
        Args:
            alpha: Weight of the MMD^2 penalty relative to the regression loss.
            sigma: Bandwidth of the Gaussian (RBF) kernel.
        """
        super().__init__(name='cfrnet_loss')
        self.alpha = alpha
        self.rbf_sigma = sigma

    def split_pred(self, concat_pred):
        """Split the concatenated model output into ``y0_pred``, ``y1_pred`` and ``phi``."""
        preds = {}
        preds['y0_pred'] = concat_pred[:, 0]
        preds['y1_pred'] = concat_pred[:, 1]
        preds['phi'] = concat_pred[:, 2:]
        return preds

    def rbf_kernel(self, x, y):
        """Gaussian kernel matrix ``exp(-||x_i - y_j||^2 / sigma^2)``."""
        return tf.exp(-pdist2sq(x, y) / tf.square(self.rbf_sigma))

    def calc_mmdsq(self, Phi, t):
        """Unbiased estimate of MMD^2 between control and treated representations.

        Args:
            Phi: Representation, one row per unit.
            t: Treatment indicator per unit (0 = control, 1 = treated).

        Returns:
            The scalar MMD^2 estimate broadcast to the shape of ``t``.
        """
        # Flatten rather than squeeze: squeeze would turn a batch of one into a scalar.
        arm = tf.cast(tf.reshape(t, [-1]), tf.int32)
        Phic, Phit = tf.dynamic_partition(Phi, arm, 2)

        Kcc = self.rbf_kernel(Phic, Phic)
        Kct = self.rbf_kernel(Phic, Phit)
        Ktt = self.rbf_kernel(Phit, Phit)

        m = tf.cast(tf.shape(Phic)[0], Phi.dtype)
        n = tf.cast(tf.shape(Phit)[0], Phi.dtype)

        # The 1/(m(m-1)) normaliser belongs to the sum over i != j. The kernel's diagonal
        # is k(x, x) = exp(0) = 1, so subtracting m (resp. n) removes those self-pairs.
        # divide_no_nan makes a term 0 instead of NaN when an arm has fewer than two
        # units (or is absent) in the batch.
        within_control = tf.math.divide_no_nan(tf.reduce_sum(Kcc) - m, m * (m - 1.0))
        within_treated = tf.math.divide_no_nan(tf.reduce_sum(Ktt) - n, n * (n - 1.0))
        between = tf.math.divide_no_nan(2.0 * tf.reduce_sum(Kct), m * n)

        mmd = within_control + within_treated - between
        return mmd * tf.ones_like(t)

    def mmdsq_loss(self, concat_true, concat_pred):
        """MMD^2 penalty for one batch (scalar)."""
        t_true = concat_true[:, 1]
        p = self.split_pred(concat_pred)
        mmdsq_loss = tf.reduce_mean(self.calc_mmdsq(p['phi'], t_true))
        return mmdsq_loss

    def regression_loss(self, concat_true, concat_pred):
        """Squared error of each head on the units that received that head's treatment."""
        y_true = concat_true[:, 0]
        t_true = concat_true[:, 1]
        p = self.split_pred(concat_pred)
        loss0 = tf.reduce_mean((1. - t_true) * tf.square(y_true - p['y0_pred']))
        loss1 = tf.reduce_mean(t_true * tf.square(y_true - p['y1_pred']))
        return loss0 + loss1

    def cfr_loss(self, concat_true, concat_pred):
        """Regression loss plus ``alpha`` times the MMD^2 penalty."""
        lossR = self.regression_loss(concat_true, concat_pred)
        lossIPM = self.mmdsq_loss(concat_true, concat_pred)
        return lossR + self.alpha * lossIPM

    def call(self, concat_true, concat_pred):
        """Keras entry point; returns the combined CFR loss."""
        return self.cfr_loss(concat_true, concat_pred)

In [13]:
import tensorflow as tf
import numpy as np

from keras.layers import Input
from keras.layers import Dense
from keras.layers import Concatenate
from keras import regularizers
from keras import Model
from keras.losses import binary_crossentropy
from keras.metrics import binary_accuracy
from keras.losses import Loss

def make_tarnet(input_dim, reg_l2):
    """Build the two-headed outcome network on top of a shared representation.

    Args:
        input_dim: Number of input features.
        reg_l2: L2 penalty applied to the kernels of every outcome-head layer.

    Returns:
        A Keras ``Model`` whose output concatenates, along axis 1, the y0 prediction,
        the y1 prediction and the 256-unit representation (in that order).
    """
    def _head_hidden(units, layer_name):
        # ELU hidden layer of an outcome head, L2-regularised.
        return Dense(units=units, activation='elu',
                     kernel_regularizer=regularizers.l2(reg_l2), name=layer_name)

    def _head_output(layer_name):
        # Single linear unit producing a potential-outcome prediction.
        return Dense(units=1, activation=None,
                     kernel_regularizer=regularizers.l2(reg_l2), name=layer_name)

    inputs = Input(shape=(input_dim,), name='input')

    # Shared representation: two ELU layers, no regulariser.
    representation = inputs
    for width, layer_name in ((512, 'phi_1'), (256, 'phi_2')):
        representation = Dense(units=width, activation='elu',
                               kernel_initializer='RandomNormal',
                               name=layer_name)(representation)

    # Outcome heads; layers are created in the same interleaved order as before
    # (y0 then y1 at each depth).
    # NOTE(review): the y0 head is 256 units wide and the y1 head 128 - confirm the
    # asymmetry is intended.
    control_head = _head_hidden(256, 'y0_hidden_1')(representation)
    treated_head = _head_hidden(128, 'y1_hidden_1')(representation)

    control_head = _head_hidden(256, 'y0_hidden_2')(control_head)
    treated_head = _head_hidden(128, 'y1_hidden_2')(treated_head)

    y0_out = _head_output('y0_predictions')(control_head)
    y1_out = _head_output('y1_predictions')(treated_head)

    merged = Concatenate(axis=1)([y0_out, y1_out, representation])
    return Model(inputs=inputs, outputs=merged)

In [14]:
#https://towardsdatascience.com/implementing-macro-f1-score-in-keras-what-not-to-do-e9f1aa04029d
class Eval_metrics_train():
    """In-sample treatment-effect metrics for a model that outputs ``[y0, y1, phi...]``.

    ``data`` must provide ``'t'``, ``'y'``, ``'mu_0'``, ``'mu_1'`` (NumPy arrays, one row
    per unit) and ``'y_scaler'`` (an object with ``inverse_transform``).
    """

    def __init__(self,data):
        # Shallow copy so the index entries added below do not leak into the caller's dict.
        self.data = dict(data)
        self.data['o_idx']=tf.range(self.data['t'].shape[0])
        treatment = self.data['t'].squeeze()
        self.data['c_idx']=self.data['o_idx'][treatment==0] #indices of the control units
        self.data['t_idx']=self.data['o_idx'][treatment==1] #indices of the treated units

    def split_pred(self,concat_pred):
        """Split the model output and map both outcome heads back to the original y scale.

        ``concat_pred`` is expected to be a NumPy array (its column slices are reshaped
        with ``.reshape``): column 0 is y0, column 1 is y1, the rest is phi.
        """
        preds={}
        preds['y0_pred'] = self.data['y_scaler'].inverse_transform(concat_pred[:, 0].reshape(-1, 1))
        preds['y1_pred'] = self.data['y_scaler'].inverse_transform(concat_pred[:, 1].reshape(-1, 1))
        # The network has no propensity or epsilon output, so phi starts at column 2.
        preds['phi'] = concat_pred[:, 2:]
        return preds

    def ATE_absolute_error(self,concat_pred):
        """Absolute error of the estimated average treatment effect.

        Each unit's effect combines its observed outcome with the predicted
        counterfactual: ``y - y0_pred`` for treated units, ``y1_pred - y`` for controls.
        """
        p = self.split_pred(concat_pred)
        ATT_pred = tf.gather(params=self.data['y'], indices=self.data['t_idx']) - tf.gather(params=p['y0_pred'], indices=self.data['t_idx'])
        ATU_pred = tf.gather(params=p['y1_pred'], indices=self.data['c_idx']) - tf.gather(params=self.data['y'], indices=self.data['c_idx'])
        ATE_pred = tf.reduce_mean(tf.concat([ATT_pred,ATU_pred], axis=0))
        ATE_actual = tf.reduce_mean(self.data['mu_1']-self.data['mu_0'])
        return tf.abs(ATE_actual- ATE_pred)

    def ITE_RMSE_error(self,concat_pred):
        """RMSE between estimated and true individual effects (needs mu_0/mu_1, i.e. simulated data).

        The estimated effect uses the observed outcome for the factual arm and the
        model prediction for the counterfactual arm.
        """
        p = self.split_pred(concat_pred)

        # Treated units: factual y is the treated outcome, y0 is predicted.
        y_1_treated_group = tf.gather(params=self.data['y'], indices=self.data['t_idx'])
        y_0_treated_group = tf.gather(params=p['y0_pred'], indices=self.data['t_idx'])
        mu_1_treated_group = tf.gather(params=self.data['mu_1'], indices=self.data['t_idx'])
        mu_0_treated_group = tf.gather(params=self.data['mu_0'], indices=self.data['t_idx'])
        treat_grp_error = (y_1_treated_group - y_0_treated_group) - (mu_1_treated_group - mu_0_treated_group)

        # Control units: factual y is the control outcome, y1 is predicted.
        y_1_control_group = tf.gather(params=p['y1_pred'], indices=self.data['c_idx'])
        y_0_control_group = tf.gather(params=self.data['y'], indices=self.data['c_idx'])
        mu_1_control_group = tf.gather(params=self.data['mu_1'], indices=self.data['c_idx'])
        mu_0_control_group = tf.gather(params=self.data['mu_0'], indices=self.data['c_idx'])
        control_grp_error = (y_1_control_group - y_0_control_group) - (mu_1_control_group - mu_0_control_group)

        ITE_error = tf.concat([treat_grp_error, control_grp_error], axis=0)
        ITE_RMSE_error = tf.sqrt(tf.reduce_mean(tf.square(ITE_error)))
        return ITE_RMSE_error

In [15]:
#https://towardsdatascience.com/implementing-macro-f1-score-in-keras-what-not-to-do-e9f1aa04029d
class Eval_metrics_test():
    """Out-of-sample treatment-effect metrics for a model that outputs ``[y0, y1, phi...]``.

    ``data`` must provide ``'mu_0'`` and ``'mu_1'`` (NumPy arrays, one row per unit) and
    ``'y_scaler'`` (an object with ``inverse_transform``).
    """

    def __init__(self,data):
        self.data=data

    def split_pred(self,concat_pred):
        """Split the model output and map both outcome heads back to the original y scale.

        ``concat_pred`` is expected to be a NumPy array (its column slices are reshaped
        with ``.reshape``): column 0 is y0, column 1 is y1, the rest is phi.
        """
        preds={}
        preds['y0_pred'] = self.data['y_scaler'].inverse_transform(concat_pred[:, 0].reshape(-1, 1))
        preds['y1_pred'] = self.data['y_scaler'].inverse_transform(concat_pred[:, 1].reshape(-1, 1))
        # The network has no propensity or epsilon output, so phi starts at column 2.
        preds['phi'] = concat_pred[:, 2:]
        return preds

    def PEHE(self,concat_pred):
        """Root mean squared error between true (mu_1 - mu_0) and predicted (y1 - y0) effects.

        Needs the noiseless potential outcomes, so it is only usable on simulated data.
        """
        p = self.split_pred(concat_pred)
        cate_err=tf.reduce_mean( tf.square( ( (self.data['mu_1']-self.data['mu_0']) - (p['y1_pred']-p['y0_pred']) ) ) )
        return tf.sqrt(cate_err)

    def ATE_absolute_error_outsample(self,concat_pred):
        """Absolute difference between the true and the predicted average treatment effect.

        Needs the noiseless potential outcomes, so it is only usable on simulated data.
        """
        p = self.split_pred(concat_pred)
        ATE_actual = tf.reduce_mean(self.data['mu_1']-self.data['mu_0'])
        ATE_pred = tf.reduce_mean(p['y1_pred']-p['y0_pred'])
        return tf.abs(ATE_actual- ATE_pred)

In [26]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau, TerminateOnNaN
from tensorflow.keras.optimizers import SGD, Adam
import io
from IPython.display import display, clear_output
# Load the TensorBoard notebook extension (IPython magic).
%load_ext tensorboard
# data=get_news_data(1)
# Run configuration.
val_split=0.20    # fraction of the training data passed to fit() as validation_split
batch_size=4000
verbose=1         # only forwarded to ReduceLROnPlateau below; fit() itself runs with verbose=0
i = 0             # seed value
# Seeds are set once, before the loop, so each replication continues from the RNG state
# left by the previous one rather than being re-seeded.
tf.random.set_seed(i)
np.random.seed(i)
# Clear any logs from previous runs (shell command).
!rm -rf ./logs_CFRnet/
# One row per replication:
# [in-sample ATE error, in-sample ITE RMSE, out-of-sample ATE error, out-of-sample PEHE, minutes]
sim_evals = []
for j in range(0,10):
    clear_output(wait=True)
    print(j+1)
    data_train, data_test = load_data(dataset="Flickr", kappa=2, exp_id=j)
    # Training target for the custom loss: column 0 = standardised outcome, column 1 = treatment.
    yt = np.concatenate([data_train['ys'], data_train['t']], 1)

    # The timer starts after data loading and stops right after fit(), so it covers
    # logging setup, model construction, compilation and training only.
    start_time = time.time()
    log_dir = "logs_CFRnet/fit/" +str(j)+"/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
    file_writer.set_as_default()
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0)

    # NOTE(review): EarlyStopping is created without restore_best_weights, so the model
    # evaluated below presumably carries the weights of the last epoch rather than the
    # best val_loss epoch - confirm this is intended.
    # ReduceLROnPlateau watches the training loss with patience=100, while early stopping
    # watches val_loss with patience=50.
    adam_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=50, min_delta=0.),
        ReduceLROnPlateau(monitor='loss', factor=0.5, patience=100, verbose=verbose, mode='auto',
                          min_delta=1e-8, cooldown=0, min_lr=0),
        tensorboard_callback
    ]


    # A fresh model and loss object per replication; .01 is the L2 penalty (reg_l2).
    cfrnet_model=make_tarnet(data_train['x'].shape[1],.01)
    cfrnet_loss=CFRNet_Loss(alpha=1.0)

    # Flickr
    # The loss object and its two components are also passed as metrics.
    cfrnet_model.compile(optimizer=Adam(learning_rate=0.001),
                         loss=cfrnet_loss,
                         metrics=[cfrnet_loss,cfrnet_loss.regression_loss,cfrnet_loss.mmdsq_loss])

    # # BlogCatalog
    # cfrnet_model.compile(optimizer=Adam(learning_rate=0.005),
    #                      loss=cfrnet_loss,
    #                      metrics=[cfrnet_loss,cfrnet_loss.regression_loss,cfrnet_loss.mmdsq_loss])

    # epochs=10000 is an upper bound; the callbacks above can end training earlier.
    cfrnet_model.fit(x=data_train['x'],y=yt,
                     callbacks=adam_callbacks,
                     validation_split=val_split,
                     epochs=10000,
                     batch_size=batch_size,
                     verbose=0)

    end_time = time.time()

    elapsed_time_seconds = end_time - start_time
    # Convert elapsed time to minutes
    elapsed_time_minutes = elapsed_time_seconds / 60

    # In-sample metrics on the full training split (it includes the rows fit() used for validation).
    Evaluation_metrics_insample = Eval_metrics_train(data_train)
    concat_pred_insample = cfrnet_model.predict(data_train['x'])
    ATE_abs_insample = Evaluation_metrics_insample.ATE_absolute_error(concat_pred_insample)
    ITE_RMSE_insample = Evaluation_metrics_insample.ITE_RMSE_error(concat_pred_insample)

    # Out-of-sample metrics on the held-out test split.
    Evaluation_metrics_outsample = Eval_metrics_test(data_test)
    concat_pred_outsample = cfrnet_model.predict(data_test['x'])
    ATE_abs_outsample = Evaluation_metrics_outsample.ATE_absolute_error_outsample(concat_pred_outsample)
    PEHE_outsample = Evaluation_metrics_outsample.PEHE(concat_pred_outsample)

    # Column order here fixes the column indices used when the results are summarised.
    metrics = [ATE_abs_insample.numpy(), ITE_RMSE_insample.numpy(), ATE_abs_outsample.numpy(), PEHE_outsample.numpy(), elapsed_time_minutes]
    sim_evals.append(metrics)


10
/content/drive/MyDrive/Causal_network_matching/data/Flickr2/Flickr9.mat
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step
48/48 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step


In [27]:
# Stack the per-replication rows into a 2-D array: one row per run, one column per metric.
sim_evals = np.asarray(sim_evals)

# Columns 0-3 are error metrics, reported as mean +- standard error of the mean
# (sample standard deviation with ddof=1, divided by sqrt of the number of runs).
for column, label in enumerate((
        " Insample ATE error for CFR-net(mean over 10) =",
        " Insample ITE_RMSE error for CFR-net(mean over 10) =",
        " Outsample ATE error for CFR-net(mean over 10) =",
        " Outsample PEHE error for CFR-net(mean over 10) =",
)):
    scores = sim_evals[:, column]
    std_error = np.std(scores, ddof=1) / np.sqrt(np.size(scores))
    print(label, round(np.mean(scores), 2), "+-", round(std_error, 2))

# Column 4 holds the elapsed time of each run in minutes; the total is reported.
print(" Wall time for CFR-net(sum over 10) =", np.sum(sim_evals[:, 4]))

 Insample ATE error for CFR-net(mean over 10) = 6.73 +- 1.36
 Insample ITE_RMSE error for CFR-net(mean over 10) = 75.01 +- 4.11
 Outsample ATE error for CFR-net(mean over 10) = 8.45 +- 2.05
 Outsample PEHE error for CFR-net(mean over 10) = 93.04 +- 7.07
 Wall time for CFR-net(sum over 10) = 4.911457216739654
