# Packages

In [1]:
import math
import pandas as pd
import numpy as np
import torch
import gpytorch
import gc

from torch.utils.data import TensorDataset, DataLoader #batch loading 
from torch.cuda import is_available as cuda_available, empty_cache #GPU usage 

from emukit.core import ParameterSpace, ContinuousParameter
from emukit.core.initial_designs.latin_design import LatinDesign

import warnings 
#warnings.filterwarnings("ignore") #please be really sure about this one 


# Classes

In [2]:
class MultitaskGPModel(gpytorch.models.ExactGP): #multitask GP with correlated outputs (defaults: 6 tasks, 5 inputs)
    """Exact GP with a multitask constant mean and a multitask RBF kernel.

    Parameters
    ----------
    train_x, train_y, likelihood:
        Passed unchanged to ``gpytorch.models.ExactGP``.
    num_tasks: int, default 6
        Number of outputs (tasks) of the mean and covariance modules.
    ard_num_dims: int, default 5
        Value forwarded to ``RBFKernel(ard_num_dims=...)``; should equal the
        number of input parameters.
    rank: int, default 1
        Value forwarded to ``MultitaskKernel(rank=...)``.

    The defaults reproduce the previously hard-coded configuration, so
    existing calls ``MultitaskGPModel(train_x, train_y, likelihood)`` build
    the same modules as before.
    """
    def __init__(self, train_x, train_y, likelihood, num_tasks=6, ard_num_dims=5, rank=1):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.MultitaskMean(
            gpytorch.means.ConstantMean(), num_tasks=num_tasks
        )
        self.covar_module = gpytorch.kernels.MultitaskKernel(
            gpytorch.kernels.RBFKernel(ard_num_dims=ard_num_dims), num_tasks=num_tasks, rank=rank
        )

    def forward(self, x):
        """Return the multitask multivariate normal built from the mean and covariance at ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultitaskMultivariateNormal(mean_x, covar_x)

# Functions

## Data reformatting

In [3]:
#filter_data_by_GEN: load the full simulation table & keep only the sequenced time points
def filter_data_by_GEN(data_path, values):
    """Read a tab-separated table and keep the rows whose generation is in ``values``.

    The 'GEN' column is shifted by -1 (1-based -> 0-based) before filtering,
    so ``values`` must be given as 0-based generations.
    """
    table = pd.read_csv(data_path, sep='\t', header=0)
    table['GEN'] = table['GEN'].sub(1) #1-based -> 0-based generations
    wanted = table['GEN'].isin(values) #boolean mask: time point available
    return table.loc[wanted]

#select_columns_by_names: extract the named columns of the GEN == 0 rows as a float32 tensor
def select_columns_by_names(df, column_names):
    """Return the ``column_names`` columns of the rows with GEN == 0 as a tensor.

    Only the GEN == 0 rows are used, which (per the original note) removes
    the entries duplicated across generations. The result is a contiguous
    torch.float32 tensor.
    """
    is_first_gen = df['GEN'] == 0
    subset = df[is_first_gen][column_names]
    values = subset.to_numpy(dtype="float")
    tensor = torch.from_numpy(values)
    return tensor.float().contiguous()


#get_CN_for_GEN: get CN ( = response, column 'm') for one specific generation GEN
def get_CN_for_GEN(df, GEN):
    """Return column 'm' of the rows with ``df['GEN'] == GEN`` as a 1-D float32 tensor.

    Returns None when no row matches the requested generation.
    """
    matching = df[df['GEN'] == GEN]

    if len(matching) == 0: #generation not present
        return None

    copy_numbers = matching['m'].to_numpy(dtype="float")
    return torch.from_numpy(copy_numbers).float().contiguous().reshape(-1)

#prep_data: re-formatting of SLiMULATION output for GP
def prep_data(data_path, input_params, time_points):
    """Build the GP input and output tensors from a simulation table.

    Parameters
    ----------
    data_path:
        Location of the tab-separated simulation output (forwarded to
        ``filter_data_by_GEN``).
    input_params:
        Column names of the input parameters.
    time_points:
        0-based generations to keep. The first entry is skipped when building
        the outputs (the original note: gen = 0, no predictions).

    Returns
    -------
    (x, y):
        ``x`` comes from ``select_columns_by_names``; ``y`` stacks the CN
        values of every remaining time point along the last dimension.

    Raises
    ------
    ValueError
        If fewer than two time points are given, or if a requested time point
        has no rows in the data (previously this surfaced as an opaque error
        inside ``torch.stack``).
    """
    prediction_gens = list(time_points[1:]) #exclude gen = 0 (no predictions)
    if not prediction_gens:
        raise ValueError("time_points must contain at least one generation after the first entry")

    df = filter_data_by_GEN(data_path=data_path, values=time_points) #filter data
    df_x = select_columns_by_names(df=df, column_names=input_params) #extract input parameters

    y_list = []
    for gen in prediction_gens: #extract y-values (= CN | GEN)
        CN_values = get_CN_for_GEN(df, gen)
        if CN_values is None: #get_CN_for_GEN signals "no rows" with None
            raise ValueError(f"No rows found for generation {gen} in {data_path!r}")
        y_list.append(CN_values)
    y = torch.stack(y_list, dim=-1) #one column per time point

    return df_x, y #return reformatted input & output data

#calculate_rmse: RMSE per column (time point) and over all entries
def calculate_rmse(predictions, observations):
    """Return ``(rmse_per_timepoint, overall_rmse)`` for two equally shaped tensors.

    The per-time-point RMSE averages the squared error over dimension 0; the
    overall RMSE averages over every element.
    """
    residual_sq = torch.square(predictions - observations)
    per_timepoint = residual_sq.mean(dim=0).sqrt() #reduce over first dimension (= input parameter sets)
    overall = residual_sq.mean().sqrt() #reduce over everything
    return per_timepoint, overall

## GP handling

In [4]:
#GPpredict_batch: prediction with GP with batch loading onto GPU
def GPpredict_batch(df, model, likelihood, batch_size=1024):
    """Predict with a GP in mini-batches and collect the results on the CPU.

    Parameters
    ----------
    df:
        Tensor of inputs; rows are split into batches along dimension 0.
        Moved to the GPU when CUDA is available, so ``model`` and
        ``likelihood`` are expected to live there as well in that case.
    model, likelihood:
        Callables evaluated as ``likelihood(model(batch))``; the result must
        provide ``.mean`` and ``.confidence_region()``.
    batch_size: int, default 1024
        Number of rows per batch (previously fixed at 1024).

    Returns
    -------
    (lower, mean, upper):
        CPU tensors concatenated along dimension 0, in input order. ``lower``
        and ``upper`` are the bounds returned by ``confidence_region()``.
    """
    if cuda_available(): # Shift data to GPU if available
        df = df.cuda()

    loader = DataLoader(TensorDataset(df), batch_size=batch_size, shuffle=False) # keep input order
    lower, mean, upper = [], [], []

    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        for (batch_x,) in loader:
            batch_predictions = likelihood(model(batch_x))
            batch_lowers, batch_uppers = batch_predictions.confidence_region()

            # .cpu() returns the tensor unchanged when it already lives on the CPU,
            # so no separate CPU/GPU branch is needed.
            lower.append(batch_lowers.cpu())
            mean.append(batch_predictions.mean.cpu())
            upper.append(batch_uppers.cpu())

    lower = torch.cat(lower, dim=0)
    mean = torch.cat(mean, dim=0)
    upper = torch.cat(upper, dim=0)

    return lower, mean, upper

#trainGP: train Gaussian process for <iterationcount> iterations, using learning rate <learning_rate>
def trainGP(model, likelihood, train_x, train_y, learning_rate, iterationcount):
    """Optimise the GP hyperparameters with Adam on the exact marginal log likelihood.

    Parameters
    ----------
    model, likelihood:
        GP model and likelihood; switched to training mode for the
        optimisation and back to evaluation mode before returning.
    train_x, train_y:
        Training inputs and targets passed to the model and the loss.
    learning_rate:
        Adam learning rate.
    iterationcount:
        Number of optimisation steps.

    Returns
    -------
    (model, likelihood, lossCount):
        The same model and likelihood objects (in evaluation mode) and the
        loss of the final iteration as a float. ``lossCount`` is None when
        ``iterationcount`` is 0 or negative (this used to raise
        UnboundLocalError).
    """
    model.train() #set to training mode
    likelihood.train()

    # Only model.parameters() is optimised; the original note states that this
    # includes the GaussianLikelihood parameters.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    lossCount = None #stays None if no iteration is executed
    for i in range(iterationcount): #train for <iterationcount> iterations
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        if i == (iterationcount - 1):
            lossCount = loss.item() #save loss at end of training
        optimizer.step()
        if cuda_available():
            empty_cache()

    model.eval() #set back to evaluation mode
    likelihood.eval()
    return model, likelihood, lossCount #return trained model & loss at the end of training

#singlePrediction: GP prediction (mean) for one input parameter combination
    #trans_prob: transposition probability
    #sel_alpha: alpha parameter, DFE beta distribution
    #sel_beta: beta parameter, DFE beta distribution
    #l_pi: proportion of chromosome with regulatory function
    #p_te: fifth input parameter (column 'p_te')
    #model: GP model
    #likelihood: GP likelihood
def singlePrediction(trans_prob, sel_alpha, sel_beta, l_pi, p_te, model, likelihood):
    """Return the predicted mean for a single parameter combination.

    The five parameters are packed, in the order of the signature, into a
    float32 tensor of shape (1, 5) and passed to ``GPpredict_batch``; only
    the mean of its (lower, mean, upper) result is returned.
    """
    inputs = np.array((trans_prob, sel_alpha, sel_beta, l_pi, p_te)) #one row of input parameters
    inputs = torch.from_numpy(inputs).float().contiguous().reshape(1, -1)
    prediction = GPpredict_batch(df=inputs, model=model, likelihood=likelihood)
    return prediction[1] #index 1 = mean


## Empirical Data Comparison

In [5]:
#calculate_sum_nrmse: calculate NRMSE sum between empirical data and GP prediction
def calculate_sum_nrmse(obs_df, pred_torch, time_points):
    """Sum, over time points, the RMSE between observations and prediction divided by the observed mean.

    Parameters
    ----------
    obs_df:
        DataFrame with columns 'gen' and 'CN'; all rows sharing a 'gen' value
        are treated as observations of that time point.
    pred_torch:
        Tensor whose last dimension is indexed in the order of
        ``time_points``. A 1-D tensor is a single candidate; a 2-D tensor of
        shape (n_candidates, n_time_points) scores all candidates in one
        call.
    time_points:
        Generations to compare; entry ``i`` is matched with
        ``pred_torch[..., i]``.

    Returns
    -------
    A NumPy float for 1-D input, or an array with one score per candidate
    for batched input.

    Raises
    ------
    ValueError
        If a time point has no observation in ``obs_df`` (previously this
        silently produced NaN).
    """
    pred = pred_torch.detach().cpu().numpy() #also accepts GPU tensors / tensors requiring grad
    total = np.float64(0.0)

    for idx, gen in enumerate(time_points):
        CN_gen = obs_df.loc[obs_df['gen'] == gen, 'CN'].to_numpy(dtype=float)
        if CN_gen.size == 0:
            raise ValueError(f"No empirical observations for generation {gen}")
        # trailing axis of length 1 broadcasts every prediction against all replicates
        residual = CN_gen - pred[..., idx, None]
        rmse_gen = np.sqrt(np.mean(residual ** 2, axis=-1))
        total = total + rmse_gen / np.mean(CN_gen) #normalise by the observed mean
    return total


#top_x_candidates: the top_x rows of x with the lowest score, score appended as last column
def top_x_candidates(x, score, top_x):
    """Return the ``top_x`` rows of ``x`` with the smallest ``score``.

    ``score`` is appended to ``x`` as the last column and the rows are
    ordered by it ascending before truncation.
    """
    table = np.column_stack((x, score))
    order = np.argsort(table[:, -1]) #ascending by score column
    ranked = table[order]
    return ranked[:top_x]


# Variables

In [6]:
#--- established P-element invasion ---
PATH_TRAIN_EST = "../data/established/20240722/train-LHS-1000-allGen-betaDist-varReg-varP-rescaled.txt" #path training data, established
PATH_VAL_EST = "../data/established/20240722/val-LHS-5000-allGen-betaDist-varReg-varP-rescaled.txt" #path validation data, established
PATH_STATS_EST = "../models/established-betaDist-varReg-varP-domainChange/" #path to model snapshots & RMSE file, established (keep the trailing slash: file names are appended directly)
PATH_EMP_EST = "../data/established/P-established-emp.txt" #empirical data (generation \t replicate \t CN), established
ESTABLISHED_TP = [0, 10, 15, 20, 25, 30, 60] #sequenced time points: established P-element invasion

#--- early P-element invasion ---
PATH_TRAIN_EARLY = "../data/early/20240720/train-LHS-1000-allGen-betaDist-varReg-varP-scaled.txt" #path training data, early
PATH_VAL_EARLY = "../data/early/20240720/val-LHS-5000-allGen-betaDist-varReg-varP-scaled.txt" #path validation data, early
PATH_STATS_EARLY = "../models/early-betaDist-varReg-varP-domainChange/" #path to model snapshots & RMSE file, early (keep the trailing slash: file names are appended directly)
PATH_EMP_EARLY = "../data/early/P-early-emp.txt" #empirical data (generation \t replicate \t CN), early
EARLY_TP = [0, 10, 20, 30, 40, 50, 60] #sequenced time points: early P-element invasion

INPUT_PARAM = ['trans_prob', 'sel_alpha', 'sel_beta', 'l_pi', 'p_te'] #names of the varied input parameters (column names in the simulation tables)

PARAM_RANGES = ParameterSpace([ #parameter ranges (min, max) sampled by the LHS; same order as INPUT_PARAM
    ContinuousParameter("trans_prob", 0.15, 0.5),
    ContinuousParameter("sel_alpha", 0.001, 0.5),
    ContinuousParameter("sel_beta",10 , 20),
    ContinuousParameter("l_pi", 0.01, 0.10),
    ContinuousParameter("p_te", 0.15, 0.5)
])

N_SAMPLE_LHS = 1000000 #sample size from LHS for prediction & comparison to empirical data

#seed NumPy's global RNG; presumably this makes the LHS draw below reproducible - TODO confirm LatinDesign uses the global NumPy RNG
np.random.seed(42)

# Candidates

In [7]:
#draw N_SAMPLE_LHS candidate parameter combinations from PARAM_RANGES with a Latin hypercube design
candidates = LatinDesign(PARAM_RANGES).get_samples(N_SAMPLE_LHS) #LHS sample
candidates = torch.from_numpy(candidates).float().contiguous() #reformat: float32 tensor used as GP input
candidates_numpy = candidates.numpy() #NumPy counterpart, used later for ranking and for the result table

print(candidates.shape) #sanity check: (N_SAMPLE_LHS, number of input parameters)

torch.Size([1000000, 5])


# GP: early 

## Set up Data

In [8]:
#early invasion: build (x, y) tensors for training and validation from the simulation tables
train_x, train_y = prep_data(data_path=PATH_TRAIN_EARLY, input_params=INPUT_PARAM, time_points=EARLY_TP) #generate training data
val_x, val_y = prep_data(data_path=PATH_VAL_EARLY, input_params=INPUT_PARAM, time_points=EARLY_TP) #generate validation data 

#print shape and dtype of every tensor
for i in train_x, train_y, val_x, val_y: #sanity checks 
    print(i.shape)
    print(i.dtype)

torch.Size([1000, 5])
torch.float32
torch.Size([1000, 6])
torch.float32
torch.Size([5000, 5])
torch.float32
torch.Size([5000, 6])
torch.float32


## Set up GP

In [9]:
#early invasion: instantiate likelihood and model on the training data
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=6) #set-up GP 
model = MultitaskGPModel(train_x, train_y, likelihood)
#a single training iteration; per the author's note this is required before stored hyperparameters can be loaded
model, likelihood, _ = trainGP(model=model, likelihood=likelihood,  learning_rate=0.01, iterationcount=1, train_x=train_x, train_y=train_y) #pre-requisite for loading hyper parameters

#move model, likelihood and training tensors to the GPU when one is available
if cuda_available():
    model = model.cuda()
    likelihood = likelihood.cuda()
    train_x = train_x.cuda()
    train_y = train_y.cuda()

## Load GP

In [10]:
#early invasion: load the model snapshot with the lowest RMSE on the validation data
stats = pd.read_csv(f"{PATH_STATS_EARLY}/stats-GP.txt", sep='\t', header=0) #per-round statistics (columns used: 'round', 'validation_rmse')
min_id = stats['validation_rmse'].idxmin() #row of the best round
toload = f"{PATH_STATS_EARLY}P-GP-{stats['round'][min_id]}.pth"
print(toload)
#map_location="cpu": lets a snapshot saved from a GPU be deserialised on a CPU-only machine;
#load_state_dict then copies the values into the model's parameters on whatever device they live
model.load_state_dict(torch.load(toload, map_location="cpu"))


../models/early-betaDist-varReg-varP-domainChange/P-GP-18.pth


<All keys matched successfully>

In [11]:
#sanity check: the RMSE of the loaded model on the validation data should match the stored value
lower_val, pred_val, upper_val = GPpredict_batch(df=val_x, model=model, likelihood=likelihood) #sanity check 
_, rmse_val = calculate_rmse(predictions=pred_val, observations=val_y) #keep only the overall RMSE
print(rmse_val) #recomputed
print(stats['validation_rmse'][min_id]) #stored in stats-GP.txt

tensor(0.4519)
0.4522340297698974


## Read empirical data

In [12]:
#empirical copy numbers of the early invasion (columns: gen, replicate, CN)
emp = pd.read_csv(PATH_EMP_EARLY, sep="\t", header=0)
print(emp)

    gen  replicate     CN
0     0          1   0.85
1    10          1   3.89
2    20          1  15.73
3    30          1  15.94
4    40          1  15.69
5    50          1  15.74
6    60          1  17.35
7     0          3   0.80
8    10          3   5.32
9    20          3  13.68
10   30          3  13.08
11   40          3  13.09
12   50          3  13.32
13   60          3  14.04
14    0          5   0.92
15   10          5   4.38
16   20          5  15.59
17   30          5  16.20
18   40          5  16.24
19   50          5  15.04
20   60          5  15.64


## Make Predictions

In [13]:
#predict all LHS candidates with the early-invasion GP; only the predicted mean is kept
_, candidates_y_early, _ = GPpredict_batch(df = candidates, model=model, likelihood=likelihood) #prediction

In [14]:
#sanity check: one prediction row per candidate
print(candidates_numpy.shape)
print(candidates_y_early.shape)

(1000000, 5)
torch.Size([1000000, 6])


In [15]:

#score every candidate: NRMSE sum between its predicted trajectory and the empirical data
#(EARLY_TP[1:]: the first time point is skipped, matching the columns built in prep_data)
#NOTE(review): this is a Python-level loop over all candidates and may be slow for large N_SAMPLE_LHS
nrmse_stat_early = np.array([calculate_sum_nrmse(emp, candidate, time_points=EARLY_TP[1:]) for candidate in candidates_y_early])
print(f"Min: {np.min(nrmse_stat_early)}, Mean: {np.mean(nrmse_stat_early)}, Max: {np.max(nrmse_stat_early)}")


Min: 0.5959364685651489, Mean: 3.1005742176073583, Max: 39.27607949372286


In [16]:
#five best candidates for the early invasion (columns: input parameters, then NRMSE sum)
top_x_candidates(x = candidates_numpy, score = nrmse_stat_early, top_x=5)


array([[ 0.27704808,  0.48397985, 10.32680511,  0.01369418,  0.18897092,
         0.59593647],
       [ 0.27390122,  0.47886661, 10.62766457,  0.01536737,  0.23766187,
         0.60544114],
       [ 0.27544683,  0.49470687, 10.45795536,  0.01469939,  0.28756487,
         0.60935597],
       [ 0.27978298,  0.46004182, 10.64761543,  0.01646402,  0.23240453,
         0.61238121],
       [ 0.28147733,  0.48667195, 10.54162502,  0.01345208,  0.19820848,
         0.61487642]])

In [17]:
#clean-up: drop the early-invasion objects before the established-invasion GP is set up
del train_x, train_y, val_x, val_y, likelihood, model, lower_val, upper_val, pred_val, rmse_val #clean-up
gc.collect() #collect objects that are only kept alive by reference cycles, so their memory can actually be released
torch.cuda.empty_cache()



# GP: established 

## Set up Data 

In [18]:
#established invasion: build (x, y) tensors for training and validation from the simulation tables
train_x, train_y = prep_data(data_path=PATH_TRAIN_EST, input_params=INPUT_PARAM, time_points=ESTABLISHED_TP) #generate training data
val_x, val_y = prep_data(data_path=PATH_VAL_EST, input_params=INPUT_PARAM, time_points=ESTABLISHED_TP) #generate validation data 

#print shape and dtype of every tensor
for i in train_x, train_y, val_x, val_y: #sanity checks 
    print(i.shape)
    print(i.dtype)

torch.Size([1000, 5])
torch.float32
torch.Size([1000, 6])
torch.float32
torch.Size([5000, 5])
torch.float32
torch.Size([5000, 6])
torch.float32


## Set up GP

In [19]:
#established invasion: instantiate likelihood and model on the training data
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=6) #set-up GP 
model = MultitaskGPModel(train_x, train_y, likelihood)
#a single training iteration; per the author's note this is required before stored hyperparameters can be loaded
model, likelihood, _ = trainGP(model=model, likelihood=likelihood,  learning_rate=0.01, iterationcount=1, train_x=train_x, train_y=train_y) #pre-requisite for loading hyper parameters

#move model, likelihood and training tensors to the GPU when one is available
if cuda_available():
    model = model.cuda()
    likelihood = likelihood.cuda()
    train_x = train_x.cuda()
    train_y = train_y.cuda()

## Load GP

In [20]:
#established invasion: load the model snapshot with the lowest RMSE on the validation data
stats = pd.read_csv(f"{PATH_STATS_EST}/stats-GP.txt", sep='\t', header=0) #per-round statistics (columns used: 'round', 'validation_rmse')
min_id = stats['validation_rmse'].idxmin() #row of the best round
toload = f"{PATH_STATS_EST}P-GP-{stats['round'][min_id]}.pth"
print(toload)
#map_location="cpu": lets a snapshot saved from a GPU be deserialised on a CPU-only machine;
#load_state_dict then copies the values into the model's parameters on whatever device they live
model.load_state_dict(torch.load(toload, map_location="cpu"))


../models/established-betaDist-varReg-varP-domainChange/P-GP-19.pth


<All keys matched successfully>

In [21]:
#sanity check: the RMSE of the loaded model on the validation data should match the stored value
lower_val, pred_val, upper_val = GPpredict_batch(df=val_x, model=model, likelihood=likelihood) #sanity check 
_, rmse_val = calculate_rmse(predictions=pred_val, observations=val_y) #keep only the overall RMSE
print(rmse_val) #recomputed
print(stats['validation_rmse'][min_id]) #stored in stats-GP.txt

tensor(0.6583)
0.658774733543396


## Read empirical data

In [22]:
#empirical copy numbers of the established invasion (columns: gen, replicate, CN); replaces the early-invasion table in `emp`
emp = pd.read_csv(PATH_EMP_EST, sep="\t", header=0)
print(emp)

    gen  replicate     CN
0    30          3  14.05
1    30          4  16.29
2    30          5  14.76
3    60          3  15.35
4    60          4  19.31
5    60          5  14.09
6     0          3   6.89
7     0          4   7.10
8     0          5   6.78
9    10          3   6.27
10   10          4   7.03
11   10          5   4.42
12   15          3  15.25
13   15          4  12.43
14   15          5  11.97
15   20          3  13.82
16   20          4  13.13
17   20          5  14.31
18   25          3  13.71
19   25          4  14.55
20   25          5  15.54


## Make Predictions

In [23]:

#predict all LHS candidates with the established-invasion GP; only the predicted mean is kept
_, candidates_y_established, _ = GPpredict_batch(df = candidates, model=model, likelihood=likelihood) #prediction

In [24]:
#sanity check: one prediction row per candidate
print(candidates_numpy.shape)
print(candidates_y_established.shape)

(1000000, 5)
torch.Size([1000000, 6])


In [25]:

#score every candidate: NRMSE sum between its predicted trajectory and the empirical data
#(ESTABLISHED_TP[1:]: the first time point is skipped, matching the columns built in prep_data)
#NOTE(review): this is a Python-level loop over all candidates and may be slow for large N_SAMPLE_LHS
nrmse_stat_established = np.array([calculate_sum_nrmse(emp, candidate, time_points=ESTABLISHED_TP[1:]) for candidate in candidates_y_established])
print(f"Min: {np.min(nrmse_stat_established)}, Mean: {np.mean(nrmse_stat_established)}, Max: {np.max(nrmse_stat_established)}")


Min: 0.6854921428622563, Mean: 3.561230164878388, Max: 41.336325922172826


In [26]:
#five best candidates for the established invasion (columns: input parameters, then NRMSE sum)
top_x_candidates(x = candidates_numpy, score = nrmse_stat_established, top_x=5)


array([[ 0.40375087,  0.45134974, 10.10243511,  0.01761737,  0.16050087,
         0.68549214],
       [ 0.40831941,  0.48807514, 10.6625948 ,  0.0160988 ,  0.15134943,
         0.68715219],
       [ 0.42350698,  0.45715711, 11.16285515,  0.01920705,  0.15243793,
         0.69026574],
       [ 0.36371821,  0.43606937, 11.92991543,  0.02373045,  0.15849608,
         0.69083283],
       [ 0.40364379,  0.4983246 , 11.27873516,  0.01612527,  0.15437238,
         0.69204572]])

In [27]:
#clean-up: drop the established-invasion objects that are no longer needed
del train_x, train_y, val_x, val_y, likelihood, model, lower_val, upper_val, pred_val, rmse_val #clean-up
gc.collect() #collect objects that are only kept alive by reference cycles, so their memory can actually be released
torch.cuda.empty_cache()

# Storage 

In [28]:
#turn both score vectors into single-column 2-D arrays so they can be concatenated column-wise with the other result blocks
nrmse_stat_early = nrmse_stat_early.reshape(-1, 1) #reshape 1D array
nrmse_stat_established = nrmse_stat_established.reshape(-1, 1)

In [29]:
#result table layout: input parameters | NRMSE early | NRMSE established | early predictions per time point | established predictions per time point
column_names = INPUT_PARAM + ["NRMSE_EARLY", "NRMSE_EST"] + [f"EARLY_TP{gen}" for gen in EARLY_TP[1:]] + [f"EST_TP{gen}" for gen in ESTABLISHED_TP[1:]] #generate column names 
#the blocks are concatenated in the same order as column_names; the two prediction blocks are torch tensors
res = np.concatenate((candidates_numpy, nrmse_stat_early, nrmse_stat_established, candidates_y_early, candidates_y_established), axis = 1 ) #generate result df 

#sanity check: number of column names must equal the number of columns of res
print(len(column_names))
print(res.shape)


19
(1000000, 19)


In [30]:
#write the joint result table as tab-separated text: header = column_names without comment prefix, values with 6 decimals
#NOTE(review): assumes the directory ../pred/ already exists - confirm before running on a fresh checkout
np.savetxt("../pred/P-GP-joint-pred.txt", res, delimiter='\t', header='\t'.join(column_names), comments='', fmt = '%.6f') #save result df 