In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import pymongo
import os
import sys
import re
import time
import yaml

import botorch
from botorch.models.transforms.input import AffineInputTransform
from botorch.models import MultiTaskGP
from botorch.fit import fit_gpytorch_mll

import gpytorch
from gpytorch.mlls import ExactMarginalLogLikelihood

In [None]:
# Select the experimental setup for which we are training a model.
# This name is used below both as the MongoDB collection to read from
# and as the top-level key into ../../config/variables.yml.
setup = "ip2"

In [None]:
# Open credential file for database, located in the user's home directory.
# expanduser('~') is used rather than os.getenv('HOME') so that an unset HOME
# variable does not surface as an opaque TypeError inside os.path.join.
with open(os.path.join(os.path.expanduser('~'), 'db.profile')) as f:
    db_profile = f.read()

# Extract the read-only password (first match, rest of the line after '=').
# Fail with an explicit message instead of an IndexError if the entry is missing.
password_match = re.search('SF_DB_READONLY_PASSWORD=(.+)', db_profile)
if password_match is None:
    raise RuntimeError(
        "No SF_DB_READONLY_PASSWORD=... entry found in the db.profile credential file")

# Connect to the MongoDB database with read-only access
db = pymongo.MongoClient(
    host="mongodb05.nersc.gov",
    username="bella_sf_ro",
    password=password_match.group(1),
    authSource="bella_sf")["bella_sf"]

# Extract all documents of the selected setup's collection as a pandas dataframe
collection = db[setup]
df = pd.DataFrame( list(collection.find()) )

In [None]:
# Load the variable definitions and keep the entries that belong to the selected setup
with open("../../config/variables.yml") as f:
    yaml_dict = yaml.safe_load(f)
setup_config = yaml_dict[setup]

# Each entry is a mapping with (at least) a 'name' field; collect those names in file order
input_variables = setup_config["input_variables"]
output_variables = setup_config["output_variables"]
input_names = [ spec['name'] for spec in input_variables.values() ]
output_names = [ spec['name'] for spec in output_variables.values() ]

In [None]:
# Visualize the dimensional (un-normalized) data in 3D:
# first input vs. last input vs. first output, colored by experiment_flag
ax = plt.figure().add_subplot(projection='3d')

ax.scatter(
    df[input_names[0]],
    df[input_names[-1]],
    df[output_names[0]],
    c=df.experiment_flag,
    alpha=0.3)

ax.view_init(elev=40., azim=40, roll=0)

# Label all three axes on the axes object itself; the z axis (the plotted
# output) was previously left unlabeled.
ax.set_xlabel(input_names[0])
ax.set_ylabel(input_names[-1])
ax.set_zlabel(output_names[0])

## Normalize with AffineInputTransform

In [None]:
# Define the input and output normalizations.
# AffineInputTransform applies (x - offset) / coefficient, so using the
# per-column mean as offset and the per-column standard deviation as
# coefficient standardizes every column.

X = torch.tensor( df[ input_names ].values, dtype=torch.float )
X_mean, X_std = X.mean(axis=0), X.std(axis=0)
input_transform = AffineInputTransform(
    len(input_names),
    coefficient=X_std,
    offset=X_mean
)

y = torch.tensor( df[ output_names ].values, dtype=torch.float )
y_mean, y_std = y.mean(axis=0), y.std(axis=0)
output_transform = AffineInputTransform(
    len(output_names),
    coefficient=y_std,
    offset=y_mean
)

# Only the scale can break the normalization: the data are divided by it, so a
# zero (constant column) or non-finite (e.g. a single row, or NaNs in the data)
# standard deviation produces inf/NaN. A zero mean is harmless because the
# offset is only subtracted, so it is not reported. Inputs and outputs are
# both checked.
for label, scale in (("input", X_std), ("output", y_std)):
    if (scale == 0).any() or not torch.isfinite(scale).all():
        print(f"RMS value used for {label} normalization is 0 or not finite. This will lead to NaNs ", scale)

In [None]:
# Build a normalized copy of the data set; `df` itself is left untouched
norm_df = df.copy()
for column_names, transform in (
    (input_names, input_transform),
    (output_names, output_transform),
):
    # Transform the raw columns and write the result into the same columns of the copy
    raw_values = torch.tensor( df[column_names].values )
    norm_df[column_names] = transform( raw_values )

In [None]:
# Visualize the normalized data in 3D:
# first input vs. last input vs. first output, colored by experiment_flag
ax = plt.figure().add_subplot(projection='3d')

ax.scatter(
    norm_df[input_names[0]],
    norm_df[input_names[-1]],
    norm_df[output_names[0]],
    c=norm_df.experiment_flag,
    alpha=0.3)

ax.view_init(elev=40., azim=40, roll=0)

# Label all three axes on the axes object itself; the z axis (the plotted
# output) was previously left unlabeled.
ax.set_xlabel(input_names[0])
ax.set_ylabel(input_names[-1])
ax.set_zlabel(output_names[0])

# Define a multi-input multi-task GP model

In [None]:
%%time
# Assemble the training tensors: experiment_flag goes into feature column 0,
# followed by the normalized inputs; the targets are the normalized outputs
feature_columns = ['experiment_flag'] + input_names
train_X = torch.tensor( norm_df[feature_columns].values )
train_Y = torch.tensor( norm_df[output_names].values )

# task_feature=0 points the multi-task GP at the experiment_flag column
model = MultiTaskGP(train_X, train_Y, task_feature=0)

# Fit the model
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_mll(mll)

In [None]:
# NOTE(review): `norm_sim_data` and `norm_exp_data` are not defined in any cell
# visible above this one — confirm where they are created; on a fresh kernel
# this cell would stop with a NameError.

# One single-column (n, 1) tensor per variable of the simulation set
z_sim, TOD_sim, protons_sim, GVD_sim = (
    torch.tensor(norm_sim_data[[column]].values)
    for column in ('z_target_um', 'TOD_fs3', 'n_protons', 'GVD')
)

# Same variables for the experimental set
z_exp, TOD_exp, protons_exp, GVD_exp = (
    torch.tensor(norm_exp_data[[column]].values)
    for column in ('z_target_um', 'TOD_fs3', 'n_protons', 'GVD')
)

# Stacked training inputs, one row per sample, columns in the order listed here
training_columns = ['z_target_um', 'TOD_fs3', 'GVD']
sim_tr_list = torch.tensor(norm_sim_data[training_columns].values)
exp_tr_list = torch.tensor(norm_exp_data[training_columns].values)

In [None]:
# Train the GP on the simulation and experimental sets and measure how long it takes.
# NOTE(review): `multi_input_multi_task_gp` is not defined in any cell visible
# here — confirm where it comes from.

# Start the timer. perf_counter() is a monotonic high-resolution clock, so the
# measured interval cannot be distorted by system clock adjustments the way a
# time.time() difference can.
start_time = time.perf_counter()
gp_model = multi_input_multi_task_gp(sim_tr_list, protons_sim, exp_tr_list, protons_exp)
# Stop the timer
end_time = time.perf_counter()

# Compute the time taken
training_time = end_time - start_time
print(f"Training Time: {training_time:.2f} seconds")

def _joint_range(sim_values, exp_values):
    """Return the (min, max) of the values found in either of the two tensors."""
    lowest = min( np.min(sim_values.numpy()), np.min(exp_values.numpy()))
    highest = max( np.max(sim_values.numpy()), np.max(exp_values.numpy()))
    return lowest, highest

# Value range of each input. Note that every range spans the simulation AND
# the experimental set, even though the printed label says "Simulation data".
zmin, zmax = _joint_range(z_sim, z_exp)
# Label fixed: it previously read "(zmin,zmin)" while printing zmin and zmax
print('Simulation data: (zmin,zmax) = (', zmin, ',', zmax, ')')

tod_min, tod_max = _joint_range(TOD_sim, TOD_exp)
print('Simulation data: (tod_min,tod_max) = (', tod_min, ',', tod_max, ')')

gvd_min, gvd_max = _joint_range(GVD_sim, GVD_exp)
print('Simulation data: (gvd_min, gvd_max) = (', gvd_min, ',', gvd_max, ')')

# 100 evenly spaced test values per input, each as a (100, 1) float32 column tensor
z_test_array = torch.tensor (np.linspace(zmin,zmax,100).reshape(-1, 1), dtype=torch.float32)
TOD_test_array  = torch.tensor (np.linspace(tod_min,tod_max,100).reshape(-1, 1), dtype=torch.float32)
GVD_test_array  = torch.tensor (np.linspace(gvd_min,gvd_max,100).reshape(-1, 1), dtype=torch.float32)

# Posterior evaluated at the experimental training inputs
predictions_on_train = gp_model.posterior(torch.cat([z_exp, TOD_exp, GVD_exp], dim=1) )

# Posterior at the test points. The three columns are concatenated row by row,
# so the test points lie on the straight line from (zmin, tod_min, gvd_min)
# to (zmax, tod_max, gvd_max), not on a full 3D grid.
predictions = gp_model.posterior( torch.cat([z_test_array, TOD_test_array, GVD_test_array], dim=1) )

# Confidence bounds of the posterior, split per task.
# confidence_region() yields a (lower, upper) pair; in both arrays column 0
# belongs to the first task (simulated data) and column 1 to the second task
# (experimental data).
region_lower, region_upper = predictions.confidence_region()
region_lower = region_lower.detach().numpy()
region_upper = region_upper.detach().numpy()

lower_bound_sim, upper_bound_sim = region_lower[:, 0], region_upper[:, 0]
lower_bound_exp, upper_bound_exp = region_lower[:, 1], region_upper[:, 1]

In [None]:
# Experimental-task prediction along z for fixed TOD and GVD values,
# drawn on top of the experimental data points
fig, ax = plt.subplots(figsize=(12,9))
ax.scatter(z_exp, protons_exp, label='Experimental data test', alpha=0.6, color = 'blue')
plt.legend()

# A single (TOD, GVD) pair is evaluated; extend the two lists to sweep more values
fixed_value_pairs = [(tod, gvd) for tod in [tod_max] for gvd in [0.1]]
for fixed_tod_val, fixed_gvd_val in fixed_value_pairs:
    # Constant (100, 1) columns holding the fixed values, matching z_test_array row for row.
    # Note: this rebinds TOD_test_array, GVD_test_array and predictions for later cells.
    TOD_test_array = torch.full((100, 1), float(fixed_tod_val), dtype=torch.float32)
    GVD_test_array = torch.full((100, 1), float(fixed_gvd_val), dtype=torch.float32)

    test_points = torch.cat([z_test_array, TOD_test_array, GVD_test_array], dim=1)
    predictions = gp_model.posterior( test_points )

    # Column 1 of the posterior mean is the second (experimental) task
    curve_label = f"Predictions: TOD={np.round(fixed_tod_val, 2)!s}, GVD={np.round(fixed_gvd_val, 2)!s}"
    plt.plot(z_test_array.numpy(), predictions.mean[:,1].detach().numpy(), c='r', label=curve_label)
    plt.xlabel('Normalized z_target')
    plt.ylabel('Normalized number of protons')
    plt.legend()

# Visualize predictions along with confidence interval for the first task (simulation data)

In [None]:
# Prediction and confidence band for the first task (simulation data, column 0)
fig, ax = plt.subplots(figsize=(12,9))

ax.scatter(z_exp, protons_exp, label='Experimental data set', alpha=0.6, color = 'blue')
ax.scatter(z_sim, protons_sim, label='Simulation data set', alpha=0.6, color = 'orange')

# Take the band from the same posterior whose mean is plotted. `predictions`
# is reassigned by the fixed-TOD/GVD cell, so bounds computed before that
# reassignment would describe a different set of test points than the mean.
band_lower, band_upper = predictions.confidence_region()
band_lower = band_lower.detach().numpy()[:, 0]
band_upper = band_upper.detach().numpy()[:, 0]

ax.plot(z_test_array.numpy(), predictions.mean[:,0].detach().numpy(), label='Predictions', c='r' )
# Label fixed: column 0 is the first task, not the second
ax.fill_between(z_test_array.numpy().flatten(), band_lower, band_upper, color='orange', alpha=0.1, label='Confidence interval for the first task (simulation data)')

ax.set_title("Normalized predictions of number of protons")
ax.set_xlabel('z_target_um')
ax.set_ylabel('n_protons')

# Draw the legend before saving, otherwise it is missing from the PNG
ax.legend()
# Task-specific file name, so that the experimental-task figure (saved under
# the un-suffixed name) does not overwrite this one
fig.savefig('./' + 'n_protons_predictions_split_zval__' + 'sim' + '.png')

# Visualize predictions along with confidence interval for the second task (experimental data)

In [None]:
# Prediction and confidence band for the second task (experimental data, column 1)
fig, ax = plt.subplots(figsize=(12,9))

ax.scatter(z_exp, protons_exp, label='Experimental data set', alpha=0.6, color = 'blue')
ax.scatter(z_sim, protons_sim, label='Simulation data set', alpha=0.6, color = 'orange')

# Take the band from the same posterior whose mean is plotted. `predictions`
# is reassigned by the fixed-TOD/GVD cell, so bounds computed before that
# reassignment would describe a different set of test points than the mean.
band_lower, band_upper = predictions.confidence_region()
band_lower = band_lower.detach().numpy()[:, 1]
band_upper = band_upper.detach().numpy()[:, 1]

ax.plot(z_test_array.numpy(), predictions.mean[:,1].detach().numpy(), label='Predictions', c='r' )
ax.fill_between(z_test_array.numpy().flatten(), band_lower, band_upper, color='lightblue', alpha=0.25, label='Confidence interval for the second task (experimental data)')

ax.set_title("Normalized predictions of number of protons")
ax.set_xlabel('z_target_um')
ax.set_ylabel('n_protons')

# Draw the legend before saving, otherwise it is missing from the PNG
ax.legend()
fig.savefig('./' + 'n_protons_predictions_split_zval__' + '.png')