# Packages

In [1]:
import math
import pandas as pd
import torch
import gpytorch
import warnings 
warnings.filterwarnings("ignore") #please be really sure about this one 

# Classes

In [2]:
class MultitaskGPModel(gpytorch.models.ExactGP):
    """Exact multitask GP whose outputs (tasks) are modelled as correlated.

    The mean is one ``ConstantMean`` per task; the covariance is a
    ``MultitaskKernel`` built from an ARD ``RBFKernel`` over the inputs and a
    low-rank task covariance.

    Args:
        train_x: Training inputs, forwarded to ``ExactGP``.
        train_y: Training targets, forwarded to ``ExactGP``.
        likelihood: Likelihood object, forwarded to ``ExactGP``.
        num_tasks: Number of outputs; defaults to 6 (the original fixed value).
        ard_num_dims: Number of input dimensions that get their own RBF
            lengthscale; defaults to 5 (the original fixed value).
        rank: ``rank`` argument of the ``MultitaskKernel``; defaults to 1.
    """

    def __init__(self, train_x, train_y, likelihood, num_tasks=6, ard_num_dims=5, rank=1):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.MultitaskMean(
            gpytorch.means.ConstantMean(), num_tasks=num_tasks
        )
        self.covar_module = gpytorch.kernels.MultitaskKernel(
            gpytorch.kernels.RBFKernel(ard_num_dims=ard_num_dims),
            num_tasks=num_tasks,
            rank=rank,
        )

    def forward(self, x):
        """Return the multitask normal distribution given by the mean and kernel at ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultitaskMultivariateNormal(mean_x, covar_x)

# Functions

## Data reformatting

In [3]:
#filter_data_by_GEN: read full simulation data & filter for sequenced time points
def filter_data_by_GEN(data_path, values):
    """Read a tab-separated table and keep only the rows of the requested generations.

    The ``GEN`` column is shifted from 1-based to 0-based before filtering, so
    ``values`` must contain 0-based generation numbers.

    Args:
        data_path: Path (or anything ``pandas.read_csv`` accepts) of a
            tab-separated file with a header row that includes a ``GEN`` column.
        values: Collection of 0-based generations to keep.

    Returns:
        A new, independent DataFrame with the rows whose shifted ``GEN`` is in
        ``values``.
    """
    df = pd.read_csv(data_path, sep='\t', header=0)  # read data
    df['GEN'] = df['GEN'] - 1  # change from 1-based to 0-based
    # Return an explicit copy: callers add columns to the result, which would
    # otherwise be a chained assignment on a slice of ``df``.
    return df[df['GEN'].isin(values)].copy()

#select_columns_by_names: subset training data based on column names 
def select_columns_by_names(df, column_names):
    """Return the requested columns of the generation-0 rows as a float32 tensor.

    Only rows with ``GEN == 0`` are used, so each entry appears once instead of
    once per stored generation.

    Args:
        df: DataFrame with a ``GEN`` column and the columns in ``column_names``.
        column_names: Column labels to extract.

    Returns:
        Contiguous ``torch.float32`` tensor of the selected values.
    """
    gen0_values = df.loc[df['GEN'] == 0, column_names].to_numpy(dtype="float")
    as_tensor = torch.from_numpy(gen0_values)
    return as_tensor.float().contiguous()


#get_CN_for_GEN: get CN (= response) for a specific generation GEN
def get_CN_for_GEN(df, GEN):
    """Return column ``m`` of the rows belonging to generation ``GEN``.

    Args:
        df: DataFrame with a ``GEN`` column and an ``m`` column.
        GEN: Generation to select.

    Returns:
        A flat, contiguous ``torch.float32`` tensor with the ``m`` values, or
        ``None`` when no row has ``GEN == GEN``.
    """
    rows = df.loc[df['GEN'] == GEN]
    if rows.empty:  # nothing recorded for this generation
        return None

    m_values = rows['m'].to_numpy(dtype="float")
    return torch.from_numpy(m_values).float().contiguous().flatten()

#prep_data: re-formatting of SLiMULATION output for GP 
def prep_data(data_path, input_params, time_points):
    """Turn a simulation output file into GP inputs ``x`` and responses ``y``.

    Args:
        data_path: Path of the tab-separated simulation output.
        input_params: Names of the columns used as GP inputs.
        time_points: 0-based generations to keep. The first entry is skipped
            when building ``y`` (no predictions are made for it); the inputs
            are always taken from the rows with ``GEN == 0``.

    Returns:
        Tuple ``(x, y)``: ``x`` holds the input columns of the generation-0
        rows, ``y`` stacks the ``m`` values of every generation in
        ``time_points[1:]`` along the last dimension.

    Raises:
        ValueError: If the data contains no rows for one of the generations in
            ``time_points[1:]``.

    NOTE(review): rows of ``x`` and ``y`` are matched purely by position, so
    this assumes every generation lists its rows in the same order as
    generation 0 - confirm against the simulation output.
    """
    df = filter_data_by_GEN(data_path=data_path, values=time_points)  # filter data
    df_x = select_columns_by_names(df=df, column_names=input_params)  # extract input parameters

    y_list = []
    for gen in time_points[1:]:  # exclude first time point; extract y-values (= CN | GEN)
        CN_values = get_CN_for_GEN(df, gen)
        if CN_values is None:
            # Fail here with a clear message instead of letting torch.stack
            # choke on a None entry further down.
            raise ValueError(f"No rows found for generation {gen} in {data_path}")
        y_list.append(CN_values)
    y = torch.stack(y_list, dim=-1)  # one column per generation

    return df_x, y

#calculate_rmse: calculate RMSE 
def calculate_rmse(predictions, observations):
    """Compute the root-mean-square error per column and over all entries.

    Args:
        predictions: Tensor of predicted values.
        observations: Tensor of observed values, compatible with ``predictions``.

    Returns:
        Tuple ``(rmse_per_timepoint, overall_rmse)``: the first averages the
        squared error over the first dimension only, the second over every
        element.
    """
    sq_err = (predictions - observations).pow(2)
    per_column = sq_err.mean(dim=0).sqrt()  # mean over first dimension (rows)
    pooled = sq_err.mean().sqrt()  # mean over all entries
    return per_column, pooled

## GP handling

In [4]:

#trainGP: train Gaussian process for <iterationcount> iterations, using learning rate <learning_rate> 
def trainGP(model, likelihood, train_x, train_y, learning_rate, iterationcount):
    """Train the GP with Adam for ``iterationcount`` iterations.

    Args:
        model: GP model to optimise (its parameters are handed to Adam).
        likelihood: Likelihood belonging to ``model``.
        train_x: Training inputs.
        train_y: Training targets.
        learning_rate: Learning rate for Adam.
        iterationcount: Number of optimisation iterations.

    Returns:
        Tuple ``(model, likelihood, lossCount)``. Model and likelihood are
        switched to evaluation mode. ``lossCount`` is the negative marginal log
        likelihood computed in the last iteration (i.e. before that iteration's
        optimizer step), or NaN when ``iterationcount`` is 0.
    """
    model.train()  # set to training mode
    likelihood.train()

    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # Includes GaussianLikelihood parameters

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    loss = None  # remains None if the loop body never runs
    for _ in range(iterationcount):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()

    # Loss at the end of training; NaN instead of an unbound variable when no
    # iteration was performed.
    lossCount = loss.item() if loss is not None else float("nan")

    model.eval()  # set back to evaluation mode
    likelihood.eval()
    return model, likelihood, lossCount

#GPpredict: predict y|x with GP
def GPpredict(df, model, likelihood):
    """Predict y | x with the GP, without tracking gradients.

    Args:
        df: Inputs to predict for (passed straight to ``model``).
        model: GP model; the caller is responsible for it being in eval mode.
        likelihood: Likelihood applied to the model output.

    Returns:
        Tuple ``(lower, mean, upper)``: the predictive mean and the bounds
        returned by the predictive distribution's ``confidence_region()``.
    """
    with torch.no_grad():
        with gpytorch.settings.fast_pred_var():
            predictive = likelihood(model(df))
            center = predictive.mean
            bounds = predictive.confidence_region()
    low, high = bounds
    return low, center, high

# Variables

In [5]:
# Input files
PATH_TRAIN = "../data/established/20240722/train-LHS-1000-allGen-betaDist-varReg-varP-rescaled.txt"  # training data
PATH_VAL = "../data/established/20240722/val-LHS-5000-allGen-betaDist-varReg-varP-rescaled.txt"  # validation data
PATH_TEST = "../data/established/20240722/test-LHS-5000-allGen-betaDist-varReg-varP-rescaled.txt"  # test data

# Sequenced time points per invasion type
ESTABLISHED_TP = [0, 10, 15, 20, 25, 30, 60]  # established P-element invasion
EARLY_TP = [0, 10, 20, 30, 40, 50, 60]  # early P-element invasion
INVASION_TYPE = "established"  # type of invasion (early, established)
INPUT_PARAM = ['trans_prob', 'sel_alpha', 'sel_beta', 'l_pi', 'p_te']  # columns used as GP inputs

# Time points used for the GP: reject unknown invasion types first, then pick
# the matching list.
if INVASION_TYPE not in ("early", "established"):
    raise ValueError("Specify the type of invasion \"early\", \"established\"")
TIME_POINTS = EARLY_TP if INVASION_TYPE == "early" else ESTABLISHED_TP

# GP training settings
LEARNING_RATE = 0.01  # learning rate
ITERATION_COUNT = 50  # number of iterations per training round
TRAINING_ROUNDS = 40  # number of training rounds


# GP

## Set up Data

In [6]:
# Build (inputs, responses) tensors for the training, validation and test files.
train_x, train_y = prep_data(PATH_TRAIN, INPUT_PARAM, TIME_POINTS)
val_x, val_y = prep_data(PATH_VAL, INPUT_PARAM, TIME_POINTS)
test_x, test_y = prep_data(PATH_TEST, INPUT_PARAM, TIME_POINTS)

# Sanity checks: print shape and dtype of every tensor.
for i in (train_x, train_y, val_x, val_y, test_x, test_y):
    print(i.shape)
    print(i.dtype)

torch.Size([1000, 5])
torch.float32
torch.Size([1000, 6])
torch.float32
torch.Size([5000, 5])
torch.float32
torch.Size([5000, 6])
torch.float32
torch.Size([5000, 5])
torch.float32
torch.Size([5000, 6])
torch.float32


## Set up GP

In [7]:
# Set up the GP: a multitask Gaussian likelihood with 6 tasks and the
# multitask model conditioned on the training data.
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
    num_tasks=6,
)
model = MultitaskGPModel(train_x=train_x, train_y=train_y, likelihood=likelihood)

## Training 

In [8]:
# New training attempt: (re)create the statistics file with only the header.
myheader = "invasion_type\tround\ttraining_loss\ttraining_rmse\tvalidation_rmse\tneg_count_val\n"
with open('../models/stats-GP.txt', 'w') as file:
    file.write(myheader)

for i in range(TRAINING_ROUNDS):
    print(f"Training Round {i+1} out of {TRAINING_ROUNDS}. Please be patient...")

    # Continue training the same model for another ITERATION_COUNT iterations.
    model, likelihood, loss = trainGP(
        model=model,
        likelihood=likelihood,
        train_x=train_x,
        train_y=train_y,
        iterationcount=ITERATION_COUNT,
        learning_rate=LEARNING_RATE,
    )
    torch.save(model.state_dict(), f"../models/P-GP-{i}.pth")  # save model snapshot

    _, pred_train, _ = GPpredict(df=train_x, model=model, likelihood=likelihood)  # predicted output | training data
    _, rmse_train = calculate_rmse(predictions=pred_train, observations=train_y)  # RMSE training data
    _, pred_val, _ = GPpredict(df=val_x, model=model, likelihood=likelihood)  # predicted output | validation data
    _, rmse_val = calculate_rmse(predictions=pred_val, observations=val_y)  # RMSE validation data
    neg_count = (pred_val < 0).sum().item()  # number of negative pred. values

    # One row per round, terminated by exactly one newline (the row used to be
    # followed by an extra blank line).
    myoutput = f"{INVASION_TYPE}\t{i}\t{loss}\t{rmse_train.item()}\t{rmse_val.item()}\t{neg_count}\n"
    with open('../models/stats-GP.txt', 'a') as file:
        file.write(myoutput)

Training Round 1 out of 40. Please be patient...
Training Round 2 out of 40. Please be patient...
Training Round 3 out of 40. Please be patient...
Training Round 4 out of 40. Please be patient...
Training Round 5 out of 40. Please be patient...
Training Round 6 out of 40. Please be patient...
Training Round 7 out of 40. Please be patient...
Training Round 8 out of 40. Please be patient...
Training Round 9 out of 40. Please be patient...
Training Round 10 out of 40. Please be patient...
Training Round 11 out of 40. Please be patient...
Training Round 12 out of 40. Please be patient...
Training Round 13 out of 40. Please be patient...
Training Round 14 out of 40. Please be patient...
Training Round 15 out of 40. Please be patient...
Training Round 16 out of 40. Please be patient...
Training Round 17 out of 40. Please be patient...
Training Round 18 out of 40. Please be patient...
Training Round 19 out of 40. Please be patient...
Training Round 20 out of 40. Please be patient...
Training 

## Prediction

In [9]:
# Load the snapshot of the training round with the lowest validation RMSE.
stats = pd.read_csv('../models/stats-GP.txt', sep='\t', header=0)
min_id = stats['validation_rmse'].idxmin()
toload = f"../models/P-GP-{stats.loc[min_id, 'round']}.pth"
print(toload)
model.load_state_dict(torch.load(toload))

# Sanity check: the recomputed validation RMSE should agree with the value
# stored in the statistics file for that round.
lower_val, pred_val, upper_val = GPpredict(df=val_x, model=model, likelihood=likelihood)
_, rmse_val = calculate_rmse(predictions=pred_val, observations=val_y)
print(rmse_val)
print(stats.loc[min_id, 'validation_rmse'])

# Predictions and RMSE (per time point and overall) on the test data.
lower_test, pred_test, upper_test = GPpredict(df=test_x, model=model, likelihood=likelihood)
rmse_test_TP, rmse_test = calculate_rmse(predictions=pred_test, observations=test_y)
print(rmse_test_TP)
print(rmse_test)

../models/P-GP-19.pth
tensor(0.6588)
0.658774733543396
tensor([0.3142, 0.4457, 0.5365, 0.6130, 0.6923, 1.0114])
tensor(0.6407)


In [10]:
# Work on an explicit copy so that adding columns below is not a chained
# assignment on a filtered slice.
test_df = filter_data_by_GEN(data_path=PATH_TEST, values=TIME_POINTS).copy()
for column in ('pred', 'lower', 'upper'):
    test_df[column] = float('nan')  # stays NaN for the first time point (no prediction)

# Column k of the prediction tensors belongs to TIME_POINTS[k + 1].
# NOTE(review): values are assigned by position, which assumes the rows of each
# generation are in the same order as the rows of test_x - confirm.
for task_idx, gen in enumerate(TIME_POINTS[1:]):
    gen_rows = test_df['GEN'] == gen  # computed once per generation
    test_df.loc[gen_rows, 'pred'] = pred_test[:, task_idx].numpy()
    test_df.loc[gen_rows, 'lower'] = lower_test[:, task_idx].numpy()
    test_df.loc[gen_rows, 'upper'] = upper_test[:, task_idx].numpy()

test_df.to_csv('P-GP-predict.txt', sep='\t', index=False)  # save predictions