# Set up

In [1]:
import torch
import gpytorch
import pandas as pd
import numpy as np
import tqdm as tqdm
from linear_operator import settings

import pyro
import math
import pickle
import time
from joblib import Parallel, delayed

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import pyro.distributions as dist
from pyro.infer import MCMC, NUTS
import arviz as az
import seaborn as sns

import os

from torch.utils.data import TensorDataset, DataLoader
import itertools

In [2]:
import GP_functions.Loss_function as Loss_function
import GP_functions.bound as bound
import GP_functions.Estimation as Estimation
import GP_functions.Training as Training
import GP_functions.Prediction as Prediction
import GP_functions.GP_models as GP_models
import GP_functions.Tools as Tools
import GP_functions.FeatureE as FeatureE

# Data

In [3]:
X_train = pd.read_csv('Data/X_train.csv', header=None, delimiter=',').values
X_test = pd.read_csv('Data/X_test.csv', header=None, delimiter=',').values

Y_train_21 = pd.read_csv('Data/Y_train_std_21.csv', header=None, delimiter=',').values
Y_test_21 = pd.read_csv('Data/Y_test_std_21.csv', header=None, delimiter=',').values

Y_train_std = pd.read_csv('Data/Y_train_std.csv', header=None, delimiter=',').values
Y_test_std = pd.read_csv('Data/Y_test_std.csv', header=None, delimiter=',').values


train_x = torch.tensor(X_train, dtype=torch.float32)
test_x = torch.tensor(X_test, dtype=torch.float32)

train_y_21 = torch.tensor(Y_train_21, dtype=torch.float32)
test_y_21 = torch.tensor(Y_test_21, dtype=torch.float32)

# train_y = torch.tensor(Y_train_std, dtype=torch.float32)
# test_y = torch.tensor(Y_test_std, dtype=torch.float32)


# torch.set_default_dtype(torch.float32)

# Model

In [None]:
class DGPHiddenLayer(gpytorch.models.deep_gps.DeepGPLayer):
    def __init__(
        self,
        input_dims,
        output_dims,
        num_inducing = 512,
        covar_type = "RBF",
        linear_mean = False,
        train_x_for_init = None
    ):
        self.input_dims = input_dims
        self.output_dims = output_dims
        batch_shape = torch.Size([output_dims])

        if train_x_for_init is not None:
            idx = torch.randperm(train_x_for_init.size(0))[:num_inducing]
            inducing_points = train_x_for_init[idx].clone()
            inducing_points = inducing_points.unsqueeze(0).expand(
                output_dims, -1, -1
            )  # B x M x D
        else:
            inducing_points = (
                torch.rand(output_dims, num_inducing, input_dims) * 4.9 + 0.1
            )

        variational_dist = gpytorch.variational.CholeskyVariationalDistribution(
            num_inducing_points=num_inducing,
            batch_shape=batch_shape,
        )
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self,
            inducing_points,
            variational_dist,
            learn_inducing_locations=True,
        )

        super().__init__(variational_strategy, input_dims, output_dims)
        
        self.mean_module = gpytorch.means.ZeroMean() if linear_mean else gpytorch.means.LinearMean(input_dims)
        
        if covar_type == 'Matern5/2':
            base_kernel = gpytorch.kernels.MaternKernel(nu=2.5,
                                                        batch_shape=batch_shape,
                                                        ard_num_dims=input_dims)
        elif covar_type == 'RBF':
            base_kernel = gpytorch.kernels.RBFKernel(batch_shape=batch_shape,
                                                     ard_num_dims=input_dims)
        elif covar_type == 'Matern3/2':
            base_kernel = gpytorch.kernels.MaternKernel(nu=1.5,
                                                        batch_shape=batch_shape,
                                                        ard_num_dims=input_dims)
        elif covar_type == 'RQ':
            base_kernel = gpytorch.kernels.RQKernel(batch_shape=batch_shape,
                                                    ard_num_dims=input_dims)
        elif covar_type == 'PiecewisePolynomial':
            base_kernel = gpytorch.kernels.PiecewisePolynomialKernel(q=2,
                                                                     batch_shape=batch_shape,
                                                                     ard_num_dims=input_dims)
        else:
            raise ValueError("RBF, Matern5/2, Matern3/2, RQ, PiecewisePolynomial")
        
        self.covar_module = gpytorch.kernels.ScaleKernel(base_kernel,
                                                         batch_shape=batch_shape, 
                                                         ard_num_dims=None)
    
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
    


class DeepGP2(gpytorch.models.deep_gps.DeepGP):
    def __init__(
        self,
        train_x,
        train_y,
        hidden_dim = 4,
        inducing_num = 512,
        covar_types = ["RBF", "RBF"],
    ):
        num_tasks = train_y.size(-1)

        layer1 = DGPHiddenLayer(
            input_dims=train_x.size(-1),
            output_dims=hidden_dim,
            num_inducing=inducing_num,
            covar_type=covar_types[0],
            train_x_for_init=train_x,
        )
        layer2 = DGPHiddenLayer(
            input_dims=hidden_dim,
            output_dims=num_tasks,
            num_inducing=inducing_num,
            covar_type=covar_types[1],
            linear_mean=True,
            train_x_for_init=train_x,
        )

        super().__init__()
        self.layers = torch.nn.ModuleList([layer1, layer2])
        self.likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=num_tasks)

    def forward(self, x):
        x = self.layers[0](x)
        return self.layers[1](x)
    
    def predict(self, test_x):
        # with gpytorch.settings.fast_pred_var():
        preds = self.likelihood(self(test_x)).to_data_independent_dist()

        return preds.mean.mean(0).squeeze(), preds.variance.mean(0).squeeze()

In [None]:
def train_dgp_minibatch(
    train_x,
    train_y,
    hidden_dim = 4,
    inducing_num = 512,
    num_iterations = 3000,
    patience = 100,
    batch_size = 256,
    eval_every = 200,
    eval_batch_size = 1024,
    lr = 0.05,
    device = "cuda"
):
    train_x, train_y = train_x.to(device), train_y.to(device)

    model = DeepGP2(
        train_x, train_y, hidden_dim, inducing_num
    ).to(device)

    model.train()


    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    mll = gpytorch.mlls.DeepApproximateMLL(
        gpytorch.mlls.VariationalELBO(
            likelihood=model.likelihood, model=model, num_data=train_y.size(0)
        )
    )


    best_loss = float("inf")
    best_state = model.state_dict()
    no_improve = 0

    loader = itertools.cycle(
        DataLoader(TensorDataset(train_x, train_y), batch_size, shuffle=True)
    )

    # --- jitter ---
    jitter_ctx = gpytorch.settings.variational_cholesky_jitter(1e-3)

    with tqdm.tqdm(total=num_iterations, desc="Training DGP") as pbar, jitter_ctx:
        for step in range(num_iterations):
            x_batch, y_batch = next(loader)
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            optimizer.step()

            if (step + 1) % eval_every == 0:
                model.eval()
                with torch.no_grad(), gpytorch.settings.fast_pred_var():
                    total_loss = 0.0
                    for i in range(0, train_x.size(0), eval_batch_size):
                        xb, yb = (
                            train_x[i : i + eval_batch_size],
                            train_y[i : i + eval_batch_size],
                        )
                        out = model(xb)
                        total_loss += -mll(out, yb).item() * yb.size(0)
                full_loss = total_loss / train_x.size(0)
                pbar.set_postfix(loss=f"{full_loss:.4f}")
                model.train()

                if full_loss < best_loss - 1e-4:
                    best_loss, best_state, no_improve = full_loss, model.state_dict(), 0
                else:
                    no_improve += 1
                    if no_improve >= patience:
                        print("Early stopping")
                        break
            pbar.update(1)

    model.load_state_dict(best_state)
    model.eval()
    return model

In [None]:
dgp_model= Training.train_dgp_minibatch(GP_models.DeepGP2, 
                                train_x, train_y_21,
                                hidden_dim = 10,
                                inducing_num = 300,
                                covar_types = ["RQ", "RBF"], 
                                num_iterations = 5000,
                                patience = 100,
                                batch_size = 256,
                                eval_every = 100,
                                eval_batch_size = 1024,
                                lr = 0.05,
                                device = "cuda")

In [None]:
checkpoint = {
    'model_state_dict': dgp_model.state_dict(),
    'model_params': {
        'num_hidden_dgp_dims': 10,
        'inducing_num': 300,
        'covar_types': ["RQ", "RBF"],
        'input_dim': train_x.size(1),
        'output_dim': train_y_21.size(1)
    }
}


save_path = 'final_dgp_2_checkpoint_21.pth'
torch.save(checkpoint, save_path)
print(f"save {save_path}")

# Read Model

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_x, train_y_21 = train_x.to(device), train_y_21.to(device)

ckpt_path = 'final_dgp_2_checkpoint_21.pth'
checkpoint = torch.load(ckpt_path, map_location=device)

state_dict   = checkpoint['model_state_dict']
model_params = checkpoint['model_params']

dgp_model = GP_models.DeepGP2(
    train_x, train_y_21, 
    hidden_dim = model_params['num_hidden_dgp_dims'], 
    inducing_num = model_params['inducing_num'], 
    covar_types = model_params['covar_types']
).to(device)

dgp_model.load_state_dict(state_dict, strict=False)   # strict=False 可防止版本微调导致的非关键键报错
dgp_model.eval()
dgp_model.likelihood.eval()

MultitaskGaussianLikelihood(
  (raw_task_noises_constraint): GreaterThan(1.000E-04)
  (raw_noise_constraint): GreaterThan(1.000E-04)
)

In [19]:
test_x = test_x.to('cuda')
test_y_21 = test_y_21.to('cuda')

In [6]:
dgp_model.eval()
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    mean, var = dgp_model.predict(test_x[0,:].unsqueeze(0))

  summing_matrix = cls(summing_matrix_indices, summing_matrix_values, size)
  summing_matrix = cls(summing_matrix_indices, summing_matrix_values, size)


In [14]:
dgp_model.predict(test_x[0,:].unsqueeze(0))

(tensor([-0.8652,  4.0993, -0.2189,  1.5201,  1.0206,  1.3828, -0.2697,  0.0336,
         -0.3237,  0.0910,  0.0044,  0.1388, -0.0638, -0.2080,  0.1221, -0.1245,
         -0.0471, -0.0145,  0.0510,  0.0481,  0.0224], device='cuda:0',
        grad_fn=<SqueezeBackward0>),
 tensor([0.0031, 0.0172, 0.0184, 0.0142, 0.0106, 0.0079, 0.0027, 0.0026, 0.0030,
         0.0024, 0.0023, 0.0027, 0.0025, 0.0025, 0.0025, 0.0020, 0.0025, 0.0021,
         0.0024, 0.0020, 0.0019], device='cuda:0', grad_fn=<SqueezeBackward0>))

In [8]:
test_y_21[0]

tensor([-0.9226,  4.2240, -0.2954,  1.5937,  1.0363,  1.3768, -0.2562,  0.0340,
        -0.3592,  0.1005,  0.0101,  0.1750, -0.0758, -0.2362,  0.1707, -0.1471,
        -0.0780, -0.0288,  0.0638,  0.0508,  0.0267])

In [None]:
(dgp_model.likelihood(dgp_model(test_x[0,:].unsqueeze(0))).to_data_independent_dist().covariance_matrix).shape

In [None]:
dgp_model.likelihood(dgp_model(test_x[0,:].unsqueeze(0)))

In [None]:
dgp_model.likelihood(dgp_model(test_x[0,:].unsqueeze(0))).to_data_independent_dist().covariance_matrix

In [15]:
def dgp_predict(dgpmodel, x, mc=10):
    with gpytorch.settings.fast_pred_var(), gpytorch.settings.num_likelihood_samples(mc):
        dist = dgpmodel.likelihood(dgpmodel(x)).to_data_independent_dist()
    mean = dist.mean.mean(0).squeeze(0)
    var  = dist.variance.mean(0).squeeze(0)
    return pyro.distributions.Independent(
               pyro.distributions.Normal(mean, var.sqrt()), 1)

In [16]:
def dgp_predict_cov(dgpmodel,
               x,
               num_mc: int = 10,
               jitter: float = 1e-6,
               device="cuda"):

    x = x.to(device)

    with gpytorch.settings.fast_pred_var(), gpytorch.settings.num_likelihood_samples(num_mc):
        post = dgpmodel.likelihood(dgpmodel(x)).to_data_independent_dist()
        # mean : [num_mc, 1, T]
        # cov_matrix : [num_mc, 1, T, T]
    mean_mc = post.mean.squeeze(1)                     # [num_mc, T]
    cov_mc  = post.covariance_matrix.squeeze(1)        # [num_mc, T, T]

    # 2. Moment matching across MC dimension
    mu_bar = mean_mc.mean(dim=0)                       # [T]
    centered = mean_mc - mu_bar                        # [num_mc, T]
    cov_mu   = (centered.unsqueeze(2)                  # [num_mc, T, 1]
                @ centered.unsqueeze(1))               # [num_mc, 1, T] → [num_mc, T, T]

    Sigma_bar = cov_mc.mean(dim=0) + cov_mu.mean(dim=0)  # [T, T]

    Sigma_bar = Sigma_bar + jitter * torch.eye(Sigma_bar.size(0),
                                               device=Sigma_bar.device)

    return pyro.distributions.MultivariateNormal(mu_bar, covariance_matrix=Sigma_bar)


In [None]:
dgp_predict(dgp_model, test_x[0,:].unsqueeze(0), mc=10)

In [None]:
dist = dgp_predict_cov(dgp_model, test_x[0,:].unsqueeze(0), num_mc=10)

In [None]:
dist.covariance_matrix.shape

In [42]:
def run_mcmc_Uniform_dgp(Pre_function, Models, row_idx, test_y, bounds, num_sampling=2000, warmup_step=1000, num_chains=1, device='cuda'):
    test_y = test_y.to(dtype=torch.float32, device=device)
    
    bounds = [
        (
            torch.tensor(b[0], dtype=torch.float32, device=device),
            torch.tensor(b[1], dtype=torch.float32, device=device)
        ) for b in bounds
    ]
    
    def model():
        params = []
        for i, (min_val, max_val) in enumerate(bounds):
            param_i = pyro.sample(f'param_{i}', dist.Uniform(min_val, max_val))
            params.append(param_i)
        
        theta = torch.stack(params)
        
        gp_pred = Pre_function(Models, theta.unsqueeze(0))
        
        y_obs = test_y[row_idx, :]
        pyro.sample('obs', gp_pred, obs=y_obs)
    
    nuts_kernel = pyro.infer.mcmc.RandomWalkKernel(model)
    mcmc = MCMC(nuts_kernel, num_samples=num_sampling, warmup_steps=warmup_step, num_chains=num_chains)
    
    mcmc.run()
    
    return mcmc

In [43]:
row_idx = 0

input_point = test_y_21[row_idx, :]
local_train_x, local_train_y = Tools.find_k_nearest_neighbors_GPU(input_point, train_x, train_y_21, k=100)
bounds = bound.get_bounds(local_train_x)

mcmc_result_Uniform = run_mcmc_Uniform_dgp(dgp_predict_cov, dgp_model, 
                                           row_idx, test_y_21, bounds, 
                                           num_sampling = 120, warmup_step = 12)


AttributeError: module 'pyro.infer.mcmc' has no attribute 'RandomWalkKernel'

In [29]:
idata = az.from_pyro(mcmc_result_Uniform)
az.plot_trace(idata)
plt.show()


summary = az.summary(idata, hdi_prob=0.95)
print(summary)

AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations