In [None]:
import sys
import torch
import scipy
import numpy as np
import scipy.io as sio
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances
from scipy.sparse import coo_matrix

In [None]:
def _to_dense_float32(matrix):
    """Return `matrix` as a dense float32 ndarray, accepting sparse or dense input."""
    # loadmat yields a scipy sparse matrix for sparse MAT variables and a plain
    # ndarray for dense ones; only the former has (and needs) a densify step.
    if sp.issparse(matrix):
        matrix = matrix.toarray()
    return np.array(matrix, dtype=np.float32)


def load_data(dataset, kappa, exp_id, original_X=False, extra_str="",
              data_dir='/content/drive/MyDrive/Causal_network_matching/data/'):
    """Load one simulated experiment from a MATLAB ``.mat`` file.

    The file read is ``<data_dir>/<dataset><kappa>/<dataset><exp_id>.mat`` and
    must contain the variables ``Network``, ``Y1``, ``Y0``, ``T`` and either
    ``X_100`` or ``Attributes``.

    Parameters
    ----------
    dataset : str
        Dataset name; used both in the folder name and the file name.
    kappa : object
        Suffix appended (via ``str``) to ``dataset`` to form the folder name.
    exp_id : object
        Experiment identifier appended (via ``str``) to the file name.
    original_X : bool, default False
        If False the ``X_100`` variable is used as covariates, otherwise
        ``Attributes``.
    extra_str : str, default ""
        Currently unused; kept so existing callers keep working.
    data_dir : str
        Root directory of the data. Defaults to the original Colab Drive path.

    Returns
    -------
    X : np.ndarray, float32
        Covariates with every row divided by its row sum (all-zero rows are
        left as zeros).
    A : np.ndarray, float32
        Dense version of the ``Network`` variable.
    T, Y_observed, mu_1, mu_0 : np.ndarray
        First row of ``T``, the observed outcome (``mu_1`` where ``T > 0``,
        else ``mu_0``), and the first rows of ``Y1`` and ``Y0``.
    """
    import os

    path = os.path.join(data_dir, dataset + str(kappa), str(dataset) + str(exp_id) + '.mat')
    print(path)
    data = sio.loadmat(path)

    covariate_key = 'Attributes' if original_X else 'X_100'

    # The MAT variables are stored 2-D; the first row holds the per-unit values.
    T = data['T'][0]
    mu_1 = data['Y1'][0]
    mu_0 = data['Y0'][0]

    # Factual outcome: treated units reveal Y1, control units reveal Y0.
    Y_observed = np.where(T > 0, mu_1, mu_0)

    X = _to_dense_float32(data[covariate_key])
    A = _to_dense_float32(data['Network'])

    # Row-normalise the covariates; a zero row sum is replaced by 1 so that
    # all-zero rows stay zero instead of producing NaNs.
    row_sums = X.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0.0] = 1.0
    X = X / row_sums

    return X, A, T, Y_observed, mu_1, mu_0


In [None]:
def create_hash(features, hash_length):
  """Hash each row of `features` to a bipolar code of length `hash_length`.

  The rows are multiplied by a freshly drawn standard-normal matrix
  (via ``torch.sparse.mm``) and each projected coordinate is mapped to
  +1 when it is strictly positive and to -1 otherwise.
  """
  n_features = features.size(1)
  projection = torch.randn(n_features, hash_length)
  is_positive = torch.sparse.mm(features, projection) > 0
  return convert_binary_to_bipolar(is_positive.float())

def convert_binary_to_bipolar(HD_vecs):
  """Map binary values to bipolar ones elementwise: 0 -> -1 and 1 -> +1."""
  doubled = HD_vecs * 2
  return doubled - 1

def get_separate_treated_and_control(Q,T):
  """Split `Q` along its first axis by the treatment indicator `T`.

  `T` is cast to bool; entries of `Q` where the mask is True are returned
  first (treated), the remaining entries second (control).
  """
  is_treated = T.astype(bool)
  return Q[is_treated], Q[~is_treated]

In [None]:
def _random_bipolar_key(hash_dim):
  """Draw one random {-1, +1} row vector of shape (1, hash_dim)."""
  return convert_binary_to_bipolar(torch.randint(0, 2, size=(1, hash_dim)))


def get_latent_HD_reprensentation(X,A, hash_dim):
  """Build a `hash_dim`-wide representation per row from `X` and `A`.

  Steps:
    1. ``N``   = bipolar random-projection hash of the rows of ``X``.
    2. ``H_1`` = ``A @ N`` and ``H_2`` = ``A @ H_1`` (one and two
       multiplications by ``A``).
    3. Each of ``N``, ``H_1``, ``H_2`` is multiplied elementwise by its own
       random bipolar key (the same key for every row) and the three
       products are summed.

  Parameters
  ----------
  X : np.ndarray
      2-D array; one row per unit.
  A : np.ndarray
      2-D square array multiplied against the hashes (presumably the
      adjacency matrix of the network -- confirm against the caller).
  hash_dim : int
      Width of the hash / of the returned representation.

  Returns
  -------
  np.ndarray of dtype float32 with shape ``(X.shape[0], hash_dim)``.
  """
  features = torch.from_numpy(X)
  adj_sparse_torch = torch.from_numpy(A).to_sparse()

  N = create_hash(features.to_sparse(), hash_length=hash_dim)
  H_1 = torch.sparse.mm(adj_sparse_torch, N)
  H_2 = torch.sparse.mm(adj_sparse_torch, H_1)

  # One key per term, drawn in the same order as before so that seeded runs
  # consume the RNG identically.
  phi_0 = _random_bipolar_key(hash_dim)
  phi_1 = _random_bipolar_key(hash_dim)
  phi_2 = _random_bipolar_key(hash_dim)

  # The (1, hash_dim) keys broadcast over the rows, so there is no need to
  # tile them into three full (n, hash_dim) int64 tensors.
  Z = N * phi_0 + H_1 * phi_1 + H_2 * phi_2

  return Z.numpy().astype(np.float32, copy=False)

In [None]:
def do_matching(covariates, treatment_assignment, outcomes, n_neighbors=5):
  """Fit one distance-weighted k-NN outcome regressor per treatment arm.

  Parameters
  ----------
  covariates : np.ndarray
      One row of features per unit.
  treatment_assignment : np.ndarray
      Per-unit treatment indicator; cast to bool to split the units.
  outcomes : np.ndarray
      Observed outcome per unit.
  n_neighbors : int, default 5
      Neighbours used by each regressor. Capped at the size of the arm it is
      fitted on so that prediction does not fail on very small arms.

  Returns
  -------
  (y1_predictor, y0_predictor)
      ``KNeighborsRegressor`` fitted on the treated units and on the control
      units respectively.

  Raises
  ------
  ValueError
      If there is no treated unit or no control unit.
  """
  from sklearn.neighbors import KNeighborsRegressor

  X_trtd, X_cntrl = get_separate_treated_and_control(covariates, treatment_assignment)
  Y_trtd, Y_cntrl = get_separate_treated_and_control(outcomes, treatment_assignment)

  if len(X_trtd) == 0 or len(X_cntrl) == 0:
    raise ValueError("do_matching needs at least one treated and one control unit")

  y1_predictor = KNeighborsRegressor(n_neighbors=min(n_neighbors, len(X_trtd)), weights='distance')
  y1_predictor.fit(X_trtd, Y_trtd)

  y0_predictor = KNeighborsRegressor(n_neighbors=min(n_neighbors, len(X_cntrl)), weights='distance')
  y0_predictor.fit(X_cntrl, Y_cntrl)

  return y1_predictor, y0_predictor


def evaluate(Y1_predictions, Y0_predictions, mu_1_actual, mu_0_actual):
  """Score predicted treatment effects against the actual ones.

  Returns a tuple ``(ITE_RMSE, ATE_error)``:
    * ``ITE_RMSE``  -- root mean squared difference between the predicted
      and the actual per-unit effects.
    * ``ATE_error`` -- absolute difference between the mean predicted effect
      and the mean actual effect.
  """
  predicted_effect = Y1_predictions - Y0_predictions
  actual_effect = mu_1_actual - mu_0_actual

  effect_residual = predicted_effect - actual_effect
  ITE_RMSE = np.sqrt(np.mean(np.square(effect_residual)))
  ATE_error = np.abs(np.mean(predicted_effect) - np.mean(actual_effect))

  return ITE_RMSE, ATE_error

In [None]:
import time

# Experiment configuration (previously magic numbers scattered in the loop).
N_SIMULATIONS = 10     # number of simulated experiment files to evaluate
TRAIN_FRACTION = 0.8   # share of units used to fit the matching models
HASH_DIM = 10000       # width of the hyperdimensional representation
SEED = 0               # base seed; replication i uses SEED + i

PEHE_in_list = []
ATE_in_list = []

PEHE_out_list = []
ATE_out_list = []

# Declared for compatibility with the original cell; not populated here.
elapsed_time_list = []
sim_evals = []

# Record the start time
start_time = time.time()
for i in range(N_SIMULATIONS):
  # Seed both RNGs so the train/test split (NumPy) and the random
  # projections / keys (torch) are reproducible on Restart & Run All.
  np.random.seed(SEED + i)
  torch.manual_seed(SEED + i)

  X, A, T, Y_observed, mu_1, mu_0 = load_data(dataset="BlogCatalog", kappa="_random", exp_id=i)

  n = X.shape[0]
  n_train = int(n * TRAIN_FRACTION)

  idx = np.random.permutation(n)
  idx_train, idx_test = idx[:n_train], idx[n_train:]
  print(X.shape)

  # The representation is computed for every unit; only the training rows
  # (and their observed outcomes) are used to fit the two outcome models.
  Z = get_latent_HD_reprensentation(X,A, hash_dim=HASH_DIM)

  Y1_model, Y0_model = do_matching(Z[idx_train], T[idx_train], Y_observed[idx_train])

  Y_1pred = Y1_model.predict(Z)
  Y_0pred = Y0_model.predict(Z)

  PEHE_in, ATE_in = evaluate(Y_1pred[idx_train], Y_0pred[idx_train], mu_1[idx_train], mu_0[idx_train])

  PEHE_out, ATE_out = evaluate(Y_1pred[idx_test], Y_0pred[idx_test], mu_1[idx_test], mu_0[idx_test])

  PEHE_in_list.append(PEHE_in)
  ATE_in_list.append(ATE_in)

  PEHE_out_list.append(PEHE_out)
  ATE_out_list.append(ATE_out)

end_time = time.time()
# Average wall time per replication.
elapsed_time = (end_time - start_time) / N_SIMULATIONS

/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog0.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog1.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog2.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog3.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog4.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog5.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog6.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog7.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog8.mat
(5196, 2182)
/content/drive/MyDrive/Causal_network_matching/data/BlogCatalog_random/BlogCatalog9.mat
(51

In [None]:
# Print mean +- standard error of the mean (sample std with ddof=1 divided by
# sqrt of the number of values) for each metric collected by the experiment
# loop, followed by the average wall time per simulation.
for metric_label, metric_values in (
    (" Insample ATE error for HD-net(mean over 10) =", ATE_in_list),
    (" Insample ITE_RMSE error for HD-net(mean over 10) =", PEHE_in_list),
    (" Outsample ATE error for HD-net(mean over 10) =", ATE_out_list),
    (" Outsample PEHE error for HD-net(mean over 10) =", PEHE_out_list),
):
  metric_mean = np.mean(metric_values)
  metric_sem = np.std(metric_values, ddof=1) / np.sqrt(np.size(metric_values))
  print(metric_label, round(metric_mean, 2), "+-", round(metric_sem, 2))
print(" Wall time in seconds for HD-net for 1 simulation(avg over 10) =", elapsed_time)

 Insample ATE error for HD-net(mean over 10) = 0.27 +- 0.06
 Insample ITE_RMSE error for HD-net(mean over 10) = 1.87 +- 0.03
 Outsample ATE error for HD-net(mean over 10) = 0.2 +- 0.05
 Outsample PEHE error for HD-net(mean over 10) = 1.73 +- 0.14
 Wall time in seconds for HD-net for 1 simulation(avg over 10) = 7.817298746109008
