## Load the Data

In [1]:
import sys

# Extend the import path with the project's data directory so the next cell's
# `pairs.generate_pairs` import can be resolved (presumably it lives there).
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries on sys.path.
DATA_ROOT = "/vol/bitbucket/ad6013/Research/gp-causal/data"
if DATA_ROOT not in sys.path:
    sys.path.append(DATA_ROOT)

In [2]:
from pairs.generate_pairs import TubingenPairs

In [7]:
# Directory handed to the pair loader; it holds the cause-effect pair files.
pairs_dir = '/vol/bitbucket/ad6013/Research/gp-causal/data/pairs/files'
data_gen = TubingenPairs(path=pairs_dir)

Load cause-effect pairs: 100%|██████████| 100/100 [00:00<00:00, 236.47it/s]


In [8]:
# Split every item yielded by the generator into three parallel lists:
# element 0 -> x, element 1 -> y, element 2 -> weight.
x = []
y = []
weight = []
for pair in data_gen.pairs_generator():
    x.append(pair[0])
    y.append(pair[1])
    weight.append(pair[2])

NameError: name 'f' is not defined

## Plot the data

Check for linearity etc.

In [5]:
import matplotlib.pyplot as plt
import os

In [None]:
# Output directory for the per-pair scatter plots.
savepath = "/Users/anish.dhir/Documents/Research/gp_causal/tuebingen_plots"
# exist_ok=True creates the directory (and parents) only when it is missing,
# replacing the separate exists-then-create check and its race window.
os.makedirs(savepath, exist_ok=True)

In [None]:
# Save one scatter plot per pair whose x has a single feature column.
for pair_idx, pair_x in enumerate(x):
    # Pairs with more than one (or zero) feature columns are not plotted.
    if pair_x.shape[-1] != 1:
        continue
    plt.scatter(pair_x, y[pair_idx])
    plt.title(f"Plot {pair_idx}")
    plt.savefig(f"{savepath}/Tuebingen: {pair_idx}")
    # Reset the current figure so the next pair starts from an empty canvas.
    plt.clf()

## Try and train a GP

The basic premise here is to use the properties of the marginal likelihood to compare the complexities of cause|effect vs. effect|cause.

In [2]:
import gpflow
import numpy as np
import tensorflow as tf


# Seed NumPy (through a dedicated RandomState) and TensorFlow.
# numpy is imported in this cell because `np` is used below but is otherwise
# only imported further down the notebook, which breaks a top-to-bottom run.
rng = np.random.RandomState(0)
tf.random.set_seed(0)

TypeError: Type subscription requires python >= 3.9

In [11]:
import gpytorch


class LogKernel(gpytorch.kernels.Kernel):
    """Kernel returning ``log(d + 1)`` where ``d`` is the squared distance between inputs."""

    is_stationary = True

    def forward(self, x1, x2, **params):
        """Evaluate the kernel between ``x1`` and ``x2``.

        Args:
            x1: first batch of inputs, passed straight to ``covar_dist``.
            x2: second batch of inputs, passed straight to ``covar_dist``.
            **params: extra keyword arguments forwarded to ``covar_dist``.

        Returns:
            Tensor holding ``log(d + 1)`` for every squared distance ``d``.
        """
        diff = self.covar_dist(x1, x2, square_dist=True, **params)
        # Replace exact zeros with a tiny positive value. The previous
        # `diff.where(diff == 0, ...)` call discarded its result, and its
        # condition would have kept the zeros while overwriting every non-zero
        # distance. Presumably the intent was to floor zero distances.
        diff = torch.where(diff == 0, torch.full_like(diff, 1e-20), diff)
        kern = torch.log(diff + 1)
        return kern

In [12]:
from gpytorch.models import ApproximateGP
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import VariationalStrategy


class GPModel(ApproximateGP):
    """Variational GP with a constant mean and a Matern(nu=2.5) + RQ kernel sum."""

    def __init__(self, inducing_points):
        """Build the variational strategy from ``inducing_points`` and attach mean/kernel.

        Args:
            inducing_points: tensor of initial inducing locations; its first
                dimension sets the size of the variational distribution.
        """
        num_inducing = inducing_points.size(0)
        q_u = CholeskyVariationalDistribution(num_inducing)
        strategy = VariationalStrategy(
            self,
            inducing_points,
            q_u,
            learn_inducing_locations=True,
        )
        super().__init__(strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        # Each component carries its own output scale.
        matern = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=2.5))
        rational_quadratic = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RQKernel())
        self.covar_module = matern + rational_quadratic
        # Alternative kernel (disabled):
        # self.covar_module = gpytorch.kernels.ScaleKernel(LogKernel())

    def forward(self, x):
        """Return the multivariate normal defined by the mean and kernel evaluated at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

In [13]:
import torch
from tqdm import trange
# from sklearn.cluster import KMeans


# Find optimal model hyperparameters
def train(x, y, num_inducing, training_iter=10000, lr=0.01):
    """Fit a variational GP regressing ``y[:, 0]`` on ``x`` and return the loss trace.

    Args:
        x: tensor of training inputs; rows are indexed along dimension 0.
        y: tensor of training targets; only its first column ``y[:, 0]`` is used.
        num_inducing: number of inducing points to initialise. When ``x`` has
            at most this many rows, every row of ``x`` is used instead.
        training_iter: maximum number of Adam steps (default 10000, the value
            that used to be hard-coded).
        lr: Adam learning rate (default 0.01, the value that used to be
            hard-coded).

    Returns:
        list[float]: the negative ELBO recorded at every executed step; the
        last entry is the final loss.
    """
    # Run on whatever device the inputs already live on, so the function does
    # not rely on a notebook-level DEVICE global.
    device = x.device

    # Inducing points are taken from the inputs being fitted (``x``); reading a
    # notebook global here would give the wrong locations whenever ``x`` is not
    # that global (e.g. the uniform-input fits).
    if len(x) < num_inducing + 1:
        inducing = x
    else:
        # Draw ``num_inducing`` distinct rows of ``x``.
        inducing_idx = torch.randperm(x.size(0), device=device)[:num_inducing]
        inducing = x[inducing_idx].float()
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPModel(inducing)
    # Set initial lengthscale value
    init_lengthscale = 1
    init_scale = 1
    # NOTE(review): the ``- 1`` leaves the last additive kernel component at its
    # default initialisation — confirm this is intentional.
    for kern_idx in range(len(model.covar_module.kernels) - 1):
        model.covar_module.kernels[kern_idx].base_kernel.lengthscale = init_lengthscale
        model.covar_module.kernels[kern_idx].outputscale = init_scale
    likelihood.noise = 2.
    model.train().to(device)
    likelihood.train().to(device)
    # Use the adam optimizer
    optimizer = torch.optim.Adam([
        {'params': model.parameters()},
        {'params': likelihood.parameters()},  # Includes GaussianLikelihood parameters
    ], lr=lr)

    # "Loss" for GPs - the variational lower bound on the marginal log likelihood
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=y.size(0))

    loss_list = []
    t = trange(training_iter, desc="Running Model", leave=True, position=0)
    for i in t:
        # Zero gradients from previous iteration
        optimizer.zero_grad()
        # Output from model
        output = model(x)
        # Calc loss and backprop gradients
        loss = - mll(output, y[:, 0])
        loss.backward()
        loss_list.append(loss.item())
        optimizer.step()
        if i > 1000:
            # Stop once the mean of the last 10 losses sits within one standard
            # deviation of the 40 losses preceding them.
            if np.abs(np.mean(loss_list[-10:]) - np.mean(loss_list[-50:-10])) < np.std(loss_list[-50:-10]):
                break
        if i % 25 == 0:
            t.set_description(f"Loss: {loss}")
            t.refresh()
    return loss_list

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


In [14]:
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is the replacement
# named in the deprecation warning.
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
import numpy as np

np.random.seed(0)
torch.manual_seed(0)

# Device for the tensors built below. It is defined here because no other cell
# defines it, so a fresh kernel would otherwise fail with a NameError.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

correct_idx = []
wrong_idx = []
num_inducing = 500

for i in tqdm(range(len(x)), desc="Epochs", leave=True, position=0):
    print(f'\n {i}')
    # Ignore the high dim
    if x[i].shape[-1] > 1:
        continue
    # Make sure data is standardised
    x_train = StandardScaler().fit_transform(x[i])
    y_train = StandardScaler().fit_transform(y[i])
    x_train = torch.from_numpy(x_train.astype(float)).float().to(DEVICE)
    y_train = torch.from_numpy(y_train.astype(float)).float().to(DEVICE)
    # x -> y score: fit x from uniform noise, then y from x.
    unif_samples = torch.rand(
        x_train.size(0), 1, device=DEVICE
    )
    loss_x = train(x=unif_samples, y=x_train, num_inducing=num_inducing)
    loss_x_y = train(x=x_train, y=y_train, num_inducing=num_inducing)
    # x <- y score: fit y from uniform noise, then x from y.
    unif_samples = torch.rand(
        y_train.size(0), 1, device=DEVICE
    )
    loss_y = train(x=unif_samples, y=y_train, num_inducing=num_inducing)
    loss_y_x = train(x=y_train, y=x_train, num_inducing=num_inducing)
    # Each direction is scored by the sum of the final losses of its two fits.
    score_x_y = loss_x[-1] + loss_x_y[-1]
    score_y_x = loss_y[-1] + loss_y_x[-1]
    # A lower x -> y score counts as correct; this presumes every pair is
    # stored with x as the cause — verify against the data loader.
    if score_x_y < score_y_x:
        correct_idx.append(i)
    else:
        wrong_idx.append(i)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(x)), desc="Epochs", leave=True, position=0):


Epochs:   0%|          | 0/100 [00:00<?, ?it/s]


 0


Running Model:   0%|          | 0/10000 [00:00<?, ?it/s]


NotPSDError: Matrix not positive definite after repeatedly adding jitter up to 1.0e-06.

In [None]:
# Look up the stored weight of every pair on each side of the decision.
correct_weight = list(map(weight.__getitem__, correct_idx))
wrong_weight = list(map(weight.__getitem__, wrong_idx))

In [None]:
# Weighted accuracy: weight of the correct pairs over the weight of all scored pairs.
correct_total = np.sum(correct_weight)
wrong_total = np.sum(wrong_weight)
accuracy = correct_total / (correct_total + wrong_total)

In [None]:
# Display the weighted accuracy.
accuracy

In [None]:
# Display the indices of the pairs recorded as correct.
correct_idx

In [None]:
# Display the indices of the pairs recorded as wrong.
wrong_idx

In [27]:
!pip3 install scikit-learn


OSError: "/usr/local/bin/fish" shell not found