# Set up

In [13]:
import torch
import gpytorch
import pandas as pd
import numpy as np
import tqdm as tqdm
from linear_operator import settings

import pyro
import math
import pickle
import time
from joblib import Parallel, delayed

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import pyro.distributions as dist
from pyro.infer import MCMC, NUTS
import arviz as az
import seaborn as sns

import os

from torch.utils.data import TensorDataset, DataLoader
import itertools

In [14]:
import GP_functions.Loss_function as Loss_function
import GP_functions.bound as bound
import GP_functions.Estimation as Estimation
import GP_functions.Training as Training
import GP_functions.Prediction as Prediction
import GP_functions.GP_models as GP_models
import GP_functions.Tools as Tools
import GP_functions.FeatureE as FeatureE

# Data

In [15]:
X_train = pd.read_csv('Data/X_train.csv', header=None, delimiter=',').values
X_test = pd.read_csv('Data/X_test.csv', header=None, delimiter=',').values

Y_train_21 = pd.read_csv('Data/Y_train_std_21.csv', header=None, delimiter=',').values
Y_test_21 = pd.read_csv('Data/Y_test_std_21.csv', header=None, delimiter=',').values

Y_train_std = pd.read_csv('Data/Y_train_std.csv', header=None, delimiter=',').values
Y_test_std = pd.read_csv('Data/Y_test_std.csv', header=None, delimiter=',').values


train_x = torch.tensor(X_train, dtype=torch.float32)
test_x = torch.tensor(X_test, dtype=torch.float32)

train_y_21 = torch.tensor(Y_train_21, dtype=torch.float32)
test_y_21 = torch.tensor(Y_test_21, dtype=torch.float32)

# train_y = torch.tensor(Y_train_std, dtype=torch.float32)
# test_y = torch.tensor(Y_test_std, dtype=torch.float32)


# torch.set_default_dtype(torch.float32)

# Model

In [None]:
class DGPHiddenLayer(gpytorch.models.deep_gps.DeepGPLayer):
    def __init__(self, input_dims, output_dims, num_inducing=500, linear_mean=True,
                 covar_type='RBF'):

        inducing_points = torch.rand(output_dims, num_inducing, input_dims) * (5 - 0.1) + 0.1
        # inducing_points = train_x[:num_inducing].unsqueeze(0).expand(output_dims, -1, -1).contiguous()

        batch_shape = torch.Size([output_dims])
        
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(
            num_inducing_points=num_inducing,
            batch_shape=batch_shape
        )
        
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True
        )
        
        super().__init__(variational_strategy, input_dims, output_dims)
        
        self.mean_module = gpytorch.means.ZeroMean() if linear_mean else gpytorch.means.LinearMean(input_dims)
        
        # 根据 covar_type 选择对应的核函数
        if covar_type == 'Matern5/2':
            base_kernel = gpytorch.kernels.MaternKernel(nu=2.5,
                                                        batch_shape=batch_shape,
                                                        ard_num_dims=input_dims)
        elif covar_type == 'RBF':
            base_kernel = gpytorch.kernels.RBFKernel(batch_shape=batch_shape,
                                                     ard_num_dims=input_dims)
        elif covar_type == 'Matern3/2':
            base_kernel = gpytorch.kernels.MaternKernel(nu=1.5,
                                                        batch_shape=batch_shape,
                                                        ard_num_dims=input_dims)
        elif covar_type == 'RQ':
            base_kernel = gpytorch.kernels.RQKernel(batch_shape=batch_shape,
                                                    ard_num_dims=input_dims)
        elif covar_type == 'PiecewisePolynomial':
            base_kernel = gpytorch.kernels.PiecewisePolynomialKernel(q=2,
                                                                     batch_shape=batch_shape,
                                                                     ard_num_dims=input_dims)
        else:
            raise ValueError("RBF, Matern5/2, Matern3/2, RQ, PiecewisePolynomial")
        
        self.covar_module = gpytorch.kernels.ScaleKernel(base_kernel,
                                                         batch_shape=batch_shape, 
                                                         ard_num_dims=None)
    
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
    


class DeepGP_2(gpytorch.models.deep_gps.DeepGP):
    def __init__(self, train_x_shape, train_y, num_hidden_dgp_dims = 4, inducing_num = 500, covar_types = ['RBF','Matern3/2']):
        num_tasks = train_y.size(-1)
 
        hidden_layer_1 = DGPHiddenLayer(
            input_dims=train_x_shape[-1],
            output_dims=num_hidden_dgp_dims,
            num_inducing=inducing_num, 
            linear_mean=True,
            covar_type=covar_types[0]
        )


        last_layer = DGPHiddenLayer(
            input_dims=hidden_layer_1.output_dims,
            output_dims = num_tasks,
            num_inducing=inducing_num, 
            linear_mean=False,
            covar_type=covar_types[1]
        )

        super().__init__()

        self.hidden_layer_1 = hidden_layer_1
        self.last_layer = last_layer

        # We're going to use a ultitask likelihood instead of the standard GaussianLikelihood
        self.likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=num_tasks)

    def forward(self, inputs):
        hidden_rep1 = self.hidden_layer_1(inputs)
        output = self.last_layer(hidden_rep1)
        return output
    
    def predict(self, test_x):
        # with torch.no_grad():
        preds = self.likelihood(self(test_x)).to_data_independent_dist()

        return preds.mean.mean(0).squeeze(), preds.variance.mean(0).squeeze()

In [17]:
def train_DGP_minibatch(
    full_train_x, 
    full_train_y, 
    DGP_model,
    num_hidden_dgp_dims=4, 
    inducing_num=500, 
    num_iterations=2000, 
    patience=50, 
    device='cuda',
    batch_size=32,
    eval_every=100,
    eval_batch_size=1024,
    lr_variational = 0.1,
    lr=0.05
):
    """
    训练Deep GP (2层) 的完整流程，支持小批量训练、早停、全数据集评估和学习率调度。
    
    参数说明：
    - full_train_x, full_train_y: 训练数据
    - num_hidden_dgp_dims: Deep GP中隐藏层维度
    - inducing_num: 每层诱导点数量
    - num_iterations: 总迭代次数上限
    - patience: 早停耐心值 (评估损失连续多少次不下降就停止)
    - device: 'cpu' 或 'cuda'
    - batch_size: 小批量训练时的批量大小
    - eval_every: 每隔多少次迭代进行一次全数据评估
    - eval_batch_size: 进行全数据评估时的批量大小
    - lr: 初始学习率
    """

    full_train_x = full_train_x.to(device)
    full_train_y = full_train_y.to(device)


    model = DGP_model(
        full_train_x.shape, 
        full_train_y, 
        num_hidden_dgp_dims, 
        inducing_num
    ).to(device)

    model.train()

    variational_ngd_optimizer = gpytorch.optim.NGD(
        model.variational_parameters(),
        num_data=full_train_y.size(0),
        lr=lr_variational
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    mll = gpytorch.mlls.DeepApproximateMLL(
        gpytorch.mlls.VariationalELBO(
            model.likelihood,
            model,
            num_data=full_train_y.size(0)
        )
    )

    best_loss = float('inf')
    best_state = model.state_dict()
    counter = 0


    data_loader = DataLoader(
        TensorDataset(full_train_x, full_train_y),
        batch_size=batch_size,
        shuffle=True
    )
    minibatch_iter = itertools.cycle(data_loader)


    with tqdm.tqdm(total=num_iterations, desc="Training DGP") as pbar:
        for step in range(num_iterations):
            x_batch, y_batch = next(minibatch_iter)
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            variational_ngd_optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            variational_ngd_optimizer.step()

            optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            optimizer.step()

            if (step + 1) % eval_every == 0 or (step == num_iterations - 1):
                current_loss = Training.evaluate_full_dataset_loss_dgp(
                    model=model,
                    x_data=full_train_x,
                    y_data=full_train_y,
                    mll=mll,
                    device=device,
                    batch_size=eval_batch_size
                )
                pbar.set_postfix(full_loss=current_loss)
                

                if current_loss < best_loss:
                    best_loss = current_loss
                    best_state = model.state_dict()
                    counter = 0
                else:
                    counter += 1
                    if counter >= patience:
                        model.load_state_dict(best_state)
                        pbar.update(num_iterations - step - 1)
                        break

            pbar.update(1)

    return model


In [None]:
dgp_model= train_DGP_minibatch(train_x, train_y_21, 
                               DeepGP_2, num_hidden_dgp_dims=10, inducing_num=100, 
                               num_iterations=10000, patience=50, 
                               device='cuda', batch_size=512, eval_every=100, eval_batch_size=1024, 
                               lr_variational = 0.1,lr=0.01)

Training DGP:   1%|          | 90/10000 [00:09<17:38,  9.37it/s]


NotPSDError: Matrix not positive definite after repeatedly adding jitter up to 1.0e-04.