In [1]:
# This file costructs surrogate models for the input datasets
import numpy as np   
import pandas as pd
import os
import shutil
import json
import math
import time
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed, dump

# Torch specific module imports
import torch
import gpytorch 

# botorch specific modules
from botorch.fit import fit_gpytorch_model
from botorch.models.gpytorch import GPyTorchModel
from botorch.optim import optimize_acqf, optimize_acqf_discrete
from botorch import fit_gpytorch_mll
from botorch.acquisition.monte_carlo import (
    qExpectedImprovement,
    qNoisyExpectedImprovement,
)
from botorch.sampling.normal import SobolQMCNormalSampler
from botorch.exceptions import BadInitialCandidatesWarning
from botorch.acquisition import UpperConfidenceBound, ExpectedImprovement

# Plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Tick parameters
plt.rcParams['xtick.labelsize'] = 15
plt.rcParams['ytick.labelsize'] = 15
plt.rcParams['xtick.major.size'] = 5
plt.rcParams['xtick.major.width'] = 1
plt.rcParams['xtick.minor.size'] = 5
plt.rcParams['xtick.minor.width'] = 1
plt.rcParams['ytick.major.size'] = 5
plt.rcParams['ytick.major.width'] = 1
plt.rcParams['ytick.minor.size'] = 5
plt.rcParams['ytick.minor.width'] = 1

plt.rcParams['axes.labelsize'] = 15
plt.rcParams['axes.titlesize'] = 15
plt.rcParams['legend.fontsize'] = 15

# User defined python classes and files
import input_class 
import code_inputs as model_input
import utils_dataset as utilsd
import surrogate_models
import kmeans as km

# Set the random seeds
np.random.seed(0)
torch.manual_seed(0)

Using cpu device


<torch._C.Generator at 0x14747c8b0>

#### K means clustering

In [4]:
Input = input_class.inputs(input_path='../datasets/')
XX_prop, YY, descriptors = Input.read_inputs()
XX_comp_df, YY_df = Input.get_comp()
XX_comp_df

Unnamed: 0,num carbon,num fluorine,num hydrogen,num nitrogen,num oxygen,num sulfur,num silicon
0,360,0,216,144,72,0,0
1,360,0,216,144,144,0,0
2,432,0,360,144,72,0,0
3,360,0,144,216,216,0,0
4,360,0,144,216,216,0,0
...,...,...,...,...,...,...,...
69835,996,0,576,96,0,0,0
69836,1020,0,576,48,0,0,0
69837,1360,0,768,64,0,0,0
69838,1888,0,1152,128,128,0,0


In [5]:
clustered_dfs = km.k_means(XX_comp_df, YY_df, model_input.NUM_CLUSTER)
sample_dfs = km.draw_samples(clustered_dfs, sample_fraction = 1.00)
samples = km.concat(sample_dfs)
samples

Unnamed: 0,num carbon,num fluorine,num hydrogen,num nitrogen,num oxygen,num sulfur,num silicon,deliverable capacity [v STP/v]
0,832,0,448,384,0,0,0,165.565439
1,1152,0,832,128,64,0,0,152.524690
2,1376,0,896,256,64,0,0,115.996501
3,864,0,720,192,0,0,0,143.024802
4,1088,0,768,128,0,0,0,153.528996
...,...,...,...,...,...,...,...,...
69835,1536,0,960,160,0,0,0,110.196985
69836,1440,0,1368,216,0,0,36,137.095297
69837,2560,0,1536,384,384,0,0,169.809763
69838,2784,0,1824,576,96,0,0,110.963253


#### Acquisition function 

In [6]:
## TODO: TO BE Check
bounds = torch.tensor([[-10.0], [12.0]])

batch_size = 1
num_restarts= 10 
raw_samples = 512

def optimize_acqf_and_get_observation(acq_func, X_test, Y_test):
    """Optimizes the acquisition function, and returns a new candidate"""
    # print(X_test)
    # print(Y_test)
    # optimize
    candidates, _ = optimize_acqf_discrete(
        acq_function=acq_func,
        choices=X_test,
        q=batch_size,
        max_batch_size=2048,
        num_restarts=num_restarts,
        raw_samples=raw_samples,  # used for intialization heuristic
        options={"batch_limit": 5, "maxiter": 200},
        unique=True
    )
    
    print(candidates)
    # observe new values
    new_x = candidates.detach()
    b = [1 if torch.all(X_test[i].eq(new_x)) else 0 for i in range(0,X_test.shape[0]) ]
    b = torch.tensor(b).to(torch.int)
    index = b.nonzero()[0][0]
    new_y = torch.reshape(Y_test[0,index],(1,1))
    
    X_test_new = X_test[torch.arange(0, X_test.shape[0]) != index, ...]
    Y_test_new = Y_test[..., torch.arange(0, Y_test.shape[1]) != index]
    
    return new_x, new_y, index, X_test_new, Y_test_new

#### GP Train Function

In [28]:
def create_train_test_data(cluster_dataXX, cluster_dataYY, random_seed):
    if model_input.STANDARDIZE:
        cluster_dataXX, scalerX_transform = utilsd.standardize_data(cluster_dataXX)
        cluster_dataYY, scalerY_transform = utilsd.standardize_data(cluster_dataYY.reshape(-1,1))
    else:
        scalerX_transform = None
        scalerY_transform = None
    
    ## TODO : Incase for feature selection
        # ....
        # ....
        # ....

    # Create train and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(cluster_dataXX, cluster_dataYY, test_size=model_input.TEST_SIZE, random_state=random_seed)

    # Convert to tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    Y_train = np.transpose(Y_train) # IMP : Has to have only one row for GP training
    Y_train = torch.tensor(Y_train, dtype=torch.float32)
    Y_test = np.transpose(Y_test)
    Y_test = torch.tensor(Y_test, dtype=torch.float32)

    return X_train, X_test, Y_train, Y_test, scalerX_transform, scalerY_transform

def train_gp(X_train, X_test, Y_train, Y_test, model=None):
    best_observed = []
    # Finding best value in initial data
    if model_input.MAXIMIZATION:
        best_observed_value = Y_train.max()
        optimal_solution = torch.cat([Y_train[0],Y_test[0]]).max()
    else:
        best_observed_value = Y_train.min()
        optimal_solution = torch.cat([Y_train[0],Y_test[0]]).min()
    
    # If optimal value is present in the initial dataset sample remove it  
    if (best_observed_value.eq(optimal_solution)) and model_input.MAXIMIZATION:
        print('Max in training set, removing it before training models.')
        optimal_position = torch.argmax(Y_train)
        
        # Add max value to test/exploration set
        X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))
        X_test = torch.cat([X_test,X_add_toTest])
        Y_add_toTest = torch.reshape(optimal_solution,(1,1))      
        Y_test = torch.cat((Y_test,Y_add_toTest),1)
        
        # Remove max value from training set
        X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]
        Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]
        
        # Update best observed value
        best_observed_value = Y_train.max()
        
    elif (best_observed_value.eq(optimal_solution)) and not model_input.MAXIMIZATION:
        print('Min in training set, removing it before training models.')
        optimal_position = torch.argmin(Y_train)
        
        # Add min value to test/exploration set
        X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))
        X_test = torch.cat([X_test,X_add_toTest])
        Y_add_toTest = torch.reshape(optimal_solution,(1,1))      
        Y_test = torch.cat((Y_test,Y_add_toTest),1)
        
        # Remove min value from training set
        X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]
        Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]
        
        # Update best observed value
        best_observed_value = Y_train.min()
    
    # Initialize data for training gp-0 and gp-l models
    X_train0, Y_train0, X_test0, Y_test0 = X_train, Y_train, X_test, Y_test
            
    n_batch = int(model_input.N_BATCH_PER_TRIAL/model_input.N_SEARCH)
    
    # Initialize likelihood, GP model and acquisition function for the models
    #--------------------------- GP-0 ---------------------------#
    likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()
    if model is None:
        model_gp0 = surrogate_models.ExactGPModel(X_train0, Y_train0, likelihood_gp0) 
    else:
        model_gp0 = model
    AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_observed_value, maximize=model_input.MAXIMIZATION)
    best_observed.append(best_observed_value)  # Appending to best_observed list for the given trial
    
    # run N_BATCH rounds of BayesOpt after the initial random batch
    for iteration in range(1, n_batch + 1):
        # Time start of iteration and end
        t0 = time.monotonic()
        if ((iteration-1)%model_input.N_UPDATE==0):
            # fit the models every 10 iterations
            model_gp0, likelihood_gp0 = surrogate_models.train_surrogate_gp0(X_train0, Y_train0)
    
        # optimize and get new observation using acquisition function
        new_x0, new_y0, index, X_test_new0, Y_test_new0 = optimize_acqf_and_get_observation(AcqFunc_0, X_test0, Y_test0)
        
        # Update remaining choices tensor
        X_test0 = X_test_new0
        Y_test0 = Y_test_new0

        # Update training points
        X_train0 = torch.cat([X_train0, new_x0])
        Y_train0 = torch.cat([Y_train0[0], new_y0[0]])
        Y_train0 = torch.reshape(Y_train0,(1,Y_train0.shape[0]))

        # update progress
        if model_input.MAXIMIZATION:
            best_value_ei0 = Y_train0.max()
        elif not model_input.MAXIMIZATION:
            best_value_ei0 = Y_train0.min()
        best_observed.append(best_value_ei0)

        # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) 
        AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_value_ei0, maximize=model_input.MAXIMIZATION)

        # Time end of iteration
        t1 = time.monotonic()
    
        if model_input.VERBOSE:
            print(
                f"\nBatch {iteration:>2}: best_value (GP-0) = ",
                f"({best_value_ei0:>4.2f}",
                end="",)
            print(f'Iteration time = {t1-t0:>4.2f}.')

    # t1 = time.monotonic()
    # print(f"time = {t1-t0:>4.2f}.")

    return [best_observed, X_train0, X_test0, Y_train0, Y_test0, model_gp0] 

#### Main Function

In [47]:
warnings.filterwarnings("ignore", category=BadInitialCandidatesWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Create a new directory if it does not exist
isExist = os.path.exists(model_input.OUT_FOLDER)
if not isExist:
    os.makedirs(model_input.OUT_FOLDER)
    print("The new directory is created!", model_input.OUT_FOLDER)

shutil.copy2('surrogate_models.py',model_input.OUT_FOLDER)

# Train each GP model sequentially first then apply the epsilon greed algorithm
for trial in range(1, model_input.N_TRIALS + 1):
    t0 = time.monotonic()
    if model_input.RANDOM_SEED == 'time':
        random_seed = int(t0)
    elif model_input.RANDOM_SEED == 'iteration':
        random_seed = trial

    print(f"\n -------------------- Trial {trial:>2} of {model_input.N_TRIALS} --------------------\n", end="")

    best_observed_all_clusters = np.zeros((model_input.N_TRIALS, model_input.NUM_CLUSTER, model_input.N_BATCH_PER_TRIAL))
    print(best_observed_all_clusters.shape)
    X_train_all_clusters = []
    X_test_all_clusters = []
    Y_train_all_clusters = []
    Y_test_all_clusters = []
    model_gps_all_clusters = []

    # Creating the initial training and test sets for each cluster
    for cluster_idx in range(model_input.NUM_CLUSTER):
        print(f"\n -------------------- Cluster {cluster_idx:>2} of {model_input.NUM_CLUSTER} --------------------\n", end="")
        XX_desc = list(sample_dfs[cluster_idx].columns[:-1])
        YY_desc = sample_dfs[cluster_idx].columns[-1]
        (
            X_train_idx,
            X_test_idx,
            Y_train_idx,
            Y_test_idx,
            scalerX, 
            scalerY
        ) = create_train_test_data(sample_dfs[cluster_idx][XX_desc].to_numpy(), sample_dfs[cluster_idx][YY_desc].to_numpy(), random_seed)
        dump(scalerX, os.path.join(model_input.OUT_FOLDER, f'scalerX_{cluster_idx}.joblib'))
        dump(scalerY, os.path.join(model_input.OUT_FOLDER, f'scalerY_{cluster_idx}.joblib'))
        X_train_all_clusters.append(X_train_idx)
        X_test_all_clusters.append(X_test_idx)
        Y_train_all_clusters.append(Y_train_idx)
        Y_test_all_clusters.append(Y_test_idx)
    
    # Initially training of GPs done in parallel by using joblib
    results_cluster_0, results_cluster_1, results_cluster_2 = Parallel(n_jobs=-1)(delayed(train_gp)(X_train_all_clusters[i], X_test_all_clusters[i], Y_train_all_clusters[i], Y_test_all_clusters[i]) for i in range(model_input.NUM_CLUSTER))

    for i, results_i_cluster in [results_cluster_0, results_cluster_1, results_cluster_2]:
        best_observed_all_clusters[trial-1][i][0:int(model_input.N_BATCH_PER_TRIAL/model_input.N_SEARCH)] = results_i_cluster[0] 
        X_train_all_clusters.append(results_i_cluster[1])
        X_test_all_clusters.append(results_i_cluster[2])
        Y_train_all_clusters.append(results_i_cluster[3])
        Y_test_all_clusters.append(results_i_cluster[4])
        model_gps_all_clusters.append(results_i_cluster[5])

    # print(f'\n')
    # print(f'Starting the epsilon greedy search')
    # print(f'\n')    

    # # Now apply the epsilon greedy algorithm and choose which GP to train next
    # for i in range(model_input.N_SEARCH):
    #     random_number = np.random.rand()
    #     epsilon = model_input.EPSILON
    #     # Explore using the Epsilon Greedy Exploration Strategy
    #     if random_number <= epsilon:
    #         # Selecting a number between 1,2 and 3
    #         cluster_idx = np.random.choice(model_input.NUM_CLUSTER)
    #     else:
    #         # Exploit best known action
    #         cluster_idx = np.argmax(best_observed_all_clusters[i][-1] for i in range(model_input.NUM_CLUSTER))
    #     print(f'Iteration {i} : Cluster {cluster_idx} is selected for training')
    #     results = train_gp(X_train_all_clusters[cluster_idx], \
    #                        X_test_all_clusters[cluster_idx], \
    #                        Y_train_all_clusters[cluster_idx], \
    #                        Y_test_all_clusters[cluster_idx], \
    #                        model=model_gps_all_clusters[cluster_idx])
        
    #     # Add the best values
    #     best_observed_all_clusters[cluster_idx].extend(best_observed_idx)
    #     X_train_all_clusters[cluster_idx] = results[1]
    #     X_test_all_clusters[cluster_idx] = results[2]
    #     Y_train_all_clusters[cluster_idx] = results[3]
    #     Y_test_all_clusters[cluster_idx] = results[4]
    #     model_gps_all_clusters[cluster_idx] = results[5]



 -------------------- Trial  1 of 1 --------------------
(1, 3, 1)

 -------------------- Cluster  0 of 3 --------------------

 -------------------- Cluster  1 of 3 --------------------

 -------------------- Cluster  2 of 3 --------------------
Using cpu deviceUsing cpu device

Using cpu device
tensor([[-0.6042, -0.0295, -0.7947,  1.3747,  1.5369, -0.0630, -0.1566]])

Batch  1: best_value (GP-0) =  (1.92Iteration time = 1.61.
tensor([[ 1.0603, -0.0367, -1.6548,  0.8714, -0.7460, -0.0717, -0.1818]])

Batch  1: best_value (GP-0) =  (2.55Iteration time = 17.92.
tensor([[-0.6552, -0.0414, -1.2317,  3.5658, -0.7353, -0.0663, -0.1789]])

Batch  1: best_value (GP-0) =  (2.71Iteration time = 84.47.


ValueError: too many values to unpack (expected 2)

In [48]:
for i, results_i_cluster in enumerate([results_cluster_0, results_cluster_1, results_cluster_2]):
    best_observed_all_clusters[trial-1][i][0:int(model_input.N_BATCH_PER_TRIAL/model_input.N_SEARCH)] = results_i_cluster[0] 
    X_train_all_clusters.append(results_i_cluster[1])
    X_test_all_clusters.append(results_i_cluster[2])
    Y_train_all_clusters.append(results_i_cluster[3])
    Y_test_all_clusters.append(results_i_cluster[4])
    model_gps_all_clusters.append(results_i_cluster[5])

ValueError: could not broadcast input array from shape (2,) into shape (1,)

In [23]:
l = np.zeros((3,3,4))

l[0][1][0:2] = [1,2]

print(l)


[[[0. 0. 0. 0.]
  [1. 2. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]]


In [None]:
# Plot the best value obtained vs number of iterations
plt.figure(figsize=(10, 6))
for i in range(model_input.NUM_CLUSTER):
    plt.plot(best_observed_all_clusters[i], label=f'Cluster {i}')
plt.xlabel('Number of Iterations')
plt.ylabel('Best Value')
plt.legend()
plt.grid()
plt.show()