# 2D surrogate training code

This notebook provides an example of the code used to train and evaluate the accuracy of surrogate models for the 2D toy problem discussed in Chapter 3 Section 3.2. 

Original experiments were run in Google Colab using TPUs. 

This code is replicated with varying training sizes to produce the full result set. Full code and models are available [here](https://drive.google.com/drive/folders/1J7srZbZPS6UhE43GFXP3Gkd3TmEvT-6f).

In [None]:
import autograd.numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dense, Lambda, dot, concatenate, PReLU, Dropout, advanced_activations
from keras.models import load_model
from keras.optimizers import adam_v2  # Note use Adam if adam_v2.Adam throws version error 
from keras.callbacks import LearningRateScheduler
import concurrent.futures
from time import time
import gc
from scipy import stats
from datetime import *
from time import time as time1
import os
import subprocess
#from google.colab import files

In [None]:
# Suppress TensorFlow retracing warnings and AutoGraph messages
import logging
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)  # Only let ERROR-level records through TensorFlow's logger
tf.autograph.set_verbosity(0)  # Lowest AutoGraph verbosity level

In [None]:
# Parameters for 2D toy problem
nInputDim = 2  # Size of the latent-parameter input (theta) of every surrogate
nOutputDim = 1  # Size of each surrogate's output (and of the previous-output input)
T = 10 # Time steps for dynamic simulator
yearIndSet = np.arange(T)  # Time-step indices 0..T-1; one surrogate is built per index
base_hidden_size = 4 # Base number of nodes for each layer in surrogate model
nBatchSize = 10 # Batch size for training Keras neural network models

## Model architecture

The initial (t0) surrogate takes only the latent parameters $\theta$ as input, while subsequent surrogates take the latent parameters and the previous output, as described in Chapter 2 Section 2.1.1.

The surrogate models use an architecture with 4 hidden layers, each with $4\eta$ nodes, where $\eta$ refers to the model complexity as described in Chapter 3 Section 3.1.

In [None]:
def make_model_t0(complexity=1, lr=0.001, g_w=0.5):
    """Make the initial (t0) surrogate, which does not take prior outputs as inputs.

    The network maps the latent parameters ``theta`` through four tanh hidden
    layers to a linear output. A second model output carries the gradient of
    the network output with respect to ``theta`` so that it can be fitted
    together with the output itself.

    Args:
        complexity: Integer multiplier; every hidden layer has
            ``complexity * base_hidden_size`` nodes.
        lr: Learning rate passed to the Adam optimizer.
        g_w: Weight of the loss that fits the gradients. The loss on the
            output itself is weighted ``1 - g_w``.

    Returns:
        A compiled Keras ``Model`` with inputs ``[theta]`` and outputs
        ``[out, grad]``.
    """
    hidden_width = complexity * base_hidden_size

    # Shapes are passed as tuples, which is the form documented by Keras;
    # a bare int is only tolerated by some versions.
    theta = Input(shape=(nInputDim,))

    # Four identical tanh layers, created in the same order as before.
    hidden = theta
    for _ in range(4):
        hidden = Dense(hidden_width, activation="tanh")(hidden)
    out = Dense(nOutputDim, activation='linear')(hidden)

    # Gradient of the output with respect to theta, exposed as a second output.
    grad = Lambda(lambda x: K.gradients(x[0], [x[1]])[0], output_shape=(nInputDim,))([out, theta])
    model = Model(inputs=[theta], outputs=[out, grad])
    opt = adam_v2.Adam(learning_rate=lr)
    # NOTE(review): 'accuracy' is a classification metric and is unlikely to be
    # informative for these regression outputs; kept so compile() is unchanged.
    model.compile(loss=['mse', 'mse'], optimizer=opt, metrics=['accuracy'], loss_weights=[1-g_w, g_w])

    return model

def make_model_sub(complexity=1, lr=0.001, g_w=0.5):
    """Make a subsequent surrogate, which takes the previous output as an extra input.

    The latent parameters ``theta`` and the previous time step's output are
    concatenated and passed through four tanh hidden layers to a linear
    output. A second model output carries the gradient of the network output
    with respect to ``theta`` only (not the previous output).

    Args:
        complexity: Integer multiplier; every hidden layer has
            ``complexity * base_hidden_size`` nodes.
        lr: Learning rate passed to the Adam optimizer.
        g_w: Weight of the loss that fits the gradients. The loss on the
            output itself is weighted ``1 - g_w``.

    Returns:
        A compiled Keras ``Model`` with inputs ``[theta, prev_output]`` and
        outputs ``[out, grad]``.
    """
    hidden_width = complexity * base_hidden_size

    # Shapes are passed as tuples, which is the form documented by Keras;
    # a bare int is only tolerated by some versions.
    theta = Input(shape=(nInputDim,))
    prev_output = Input(shape=(nOutputDim,))

    # Four identical tanh layers on top of the concatenated inputs,
    # created in the same order as before.
    hidden = concatenate([theta, prev_output])
    for _ in range(4):
        hidden = Dense(hidden_width, activation="tanh")(hidden)
    out = Dense(nOutputDim, activation='linear')(hidden)

    # Gradient of the output with respect to theta, exposed as a second output.
    grad = Lambda(lambda x: K.gradients(x[0], [x[1]])[0], output_shape=(nInputDim,))([out, theta])
    model = Model(inputs=[theta, prev_output], outputs=[out, grad])
    opt = adam_v2.Adam(learning_rate=lr)
    # NOTE(review): 'accuracy' is a classification metric and is unlikely to be
    # informative for these regression outputs; kept so compile() is unchanged.
    model.compile(loss=['mse', 'mse'], optimizer=opt, metrics=['accuracy'], loss_weights=[1-g_w, g_w])

    return model

In [None]:
def build_models():
    """Create untrained surrogates for every time step and complexity.

    Returns:
        A pair ``(models_std, models_grad)`` of nested dictionaries indexed as
        ``[str(time_step)][str(complexity)]``. The first holds models compiled
        with ``g_w=0``; the second holds models compiled with the default
        gradient weight.
    """
    std_by_step = defaultdict(dict)
    grad_by_step = defaultdict(dict)

    for step in yearIndSet:
        # Only the first time step has no previous output to feed in.
        factory = make_model_t0 if step == 0 else make_model_sub
        step_key = str(step)

        for level in complexity_range:
            level_key = str(level)
            std_by_step[step_key][level_key] = factory(complexity=level, g_w=0)
            grad_by_step[step_key][level_key] = factory(complexity=level)

    return std_by_step, grad_by_step

def fit_model(model, x, prev_output, y, y_grad, epochs, yearIdx, complexity, N, foldername):
    """Fit one surrogate model and save it to ``foldername`` as an HDF5 file.

    Args:
        model: Compiled Keras model to train.
        x: Latent-parameter inputs.
        prev_output: Output of the previous time step. Ignored when
            ``yearIdx == 0`` because the t0 model has no such input (the
            caller then passes the column at index ``-1``, which is not a
            meaningful "previous" output).
        y: Target outputs for this time step.
        y_grad: Target gradients for this time step.
        epochs: Number of training epochs.
        yearIdx: Time-step index of the model being trained.
        complexity: Complexity label, used only in the saved file name.
        N: Number of training samples, used only in the saved file name.
        foldername: Folder the trained model is written to.
    """
    # The t0 surrogate has a single input; later ones also take the previous output.
    inputs = [x] if yearIdx == 0 else [x, prev_output]

    # NOTE(review): per the Keras docs use_multiprocessing only applies to
    # generator / Sequence inputs, so it presumably has no effect with the
    # arrays passed here; kept unchanged.
    model.fit(inputs, [y, y_grad], batch_size=nBatchSize, epochs=epochs, verbose=0,
              use_multiprocessing=True, callbacks=[LearningRateScheduler(lr_time_based_decay)])

    # Make sure the target folder exists so saving does not depend on the
    # backend creating it; exist_ok keeps this safe when several threads
    # save into the same folder.
    if foldername:
        os.makedirs(foldername, exist_ok=True)

    filename = f'{foldername}/2D_example_N{N}_E{epochs}_t{yearIdx}_c{complexity}.h5'
    model.save(filepath=filename)

def train_models(curr_data, nEpoch, models_std, models_grad, foldername):
    """Train and save all surrogates, one time step at a time.

    For every time step, the models of all complexities (both the standard
    and the gradient-fitted variants) are submitted to a thread pool and
    fitted with ``fit_model``. Standard models are saved under ``foldername``
    and gradient-fitted models under ``foldername + '_grad'``.

    Args:
        curr_data: Mapping with the DataFrames ``'simInData'``,
            ``'simOutData'`` and ``'simOutData_grad'``.
        nEpoch: Number of training epochs for every model.
        models_std: Nested dict ``[str(time_step)][complexity]`` of models.
        models_grad: Same structure, holding the gradient-fitted models.
        foldername: Base folder for the saved models.

    Raises:
        Exception: Any exception raised inside ``fit_model`` is re-raised here
            once the time step's workers have finished.
    """
    simInData = curr_data['simInData']
    simOutData = curr_data['simOutData']
    simOutData_grad = curr_data['simOutData_grad']

    no_models = len(complexity_range)
    N = len(simInData)
    foldername_grad = foldername + '_grad'

    # The inputs are the same for every time step and model.
    x = simInData.values

    t_curr = time1()
    for yearIdx in yearIndSet:
        step_key = str(yearIdx)

        # Training arrays shared by all models of this time step. For
        # yearIdx == 0 the "previous output" is column -1; fit_model ignores it.
        prev_output = simOutData.iloc[:, yearIdx-1].values.flatten()
        y = simOutData.iloc[:, yearIdx].values
        y_grad = simOutData_grad.loc[:, step_key].values

        futures = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=no_models) as executor:
            # Standard models first, then the gradient-fitted ones.
            for models, folder in ((models_std, foldername), (models_grad, foldername_grad)):
                for complexity, m in models[step_key].items():
                    futures.append(executor.submit(fit_model, m, x, prev_output, y, y_grad,
                                                   nEpoch, yearIdx, complexity, N, folder))

        # Leaving the with-block waits for all workers. Ask every future for
        # its result so a failed fit is reported here instead of being dropped
        # and only surfacing later as a missing model file.
        for future in futures:
            future.result()

        print(f'Completed timestep {yearIdx+1} in {time1() - t_curr:.02f}s...')
        t_curr = time1()


def load_models(epochs, N, foldername):
    """Load the saved standard and gradient-fitted models from disk.

    Args:
        epochs: Epoch count used in the saved file names.
        N: Training-set size used in the saved file names.
        foldername: Folder of the standard models; the gradient-fitted models
            are read from ``foldername + '_grad'``.

    Returns:
        A pair ``(std_models, grad_models)`` of nested dictionaries indexed as
        ``[yearIdx][complexity]``.
    """
    grad_foldername = foldername + '_grad'

    loaded_std = defaultdict(dict)
    loaded_grad = defaultdict(dict)

    for step in yearIndSet:
        for level in complexity_range:
            # Both variants share the same file name and differ only in folder.
            basename = f'2D_example_N{N}_E{epochs}_t{step}_c{level}.h5'
            loaded_std[step][level] = load_model(f'{foldername}/{basename}')
            loaded_grad[step][level] = load_model(f'{grad_foldername}/{basename}')

    return loaded_std, loaded_grad

def _rollout_predictions(models, inputs, template):
    """Roll the chained surrogates forward in time for every complexity level.

    Returns a dict mapping each complexity to a float DataFrame shaped like
    ``template`` whose column ``yearIdx`` holds the prediction for that step.
    """
    x = inputs.values
    preds = {}

    for c in complexity_range:
        # A float frame (rather than the default object dtype of an empty
        # DataFrame) keeps predictions numeric for the model inputs and metrics.
        frame = pd.DataFrame(index=template.index, columns=template.columns, dtype=float)

        for yearIdx in yearIndSet:
            model = models[yearIdx][c]
            if yearIdx == 0:
                model_inputs = x
            else:
                # Feed the model's own previous prediction as a plain float array.
                prev = frame.iloc[:, yearIdx-1].to_numpy(dtype=float)
                model_inputs = [x, prev]

            # Element 0 is the model's value output; element 1 (gradients) is unused.
            frame.iloc[:, yearIdx] = model.predict_on_batch(model_inputs)[0].flatten()

        preds[c] = frame

    return preds


def gen_preds(train_data, test_data, std_models, grad_models):
    """Generate train and test predictions for all models.

    Predictions are produced step by step: the t0 model sees only the inputs,
    and every later model also receives the previous step's *predicted*
    output from the same model family and complexity.

    Args:
        train_data: Mapping with DataFrames ``'simInData'`` and ``'simOutData'``.
        test_data: Mapping with DataFrames ``'simInData'`` and ``'simOutData'``.
        std_models: Nested dict ``[yearIdx][complexity]`` of standard models.
        grad_models: Nested dict ``[yearIdx][complexity]`` of gradient-fitted models.

    Returns:
        ``(std_preds_train, std_preds_test, grad_preds_train, grad_preds_test)``,
        each a dict mapping complexity to a DataFrame with the index and
        columns of the matching ``'simOutData'``.
    """
    simInData_train = train_data['simInData']
    simInData_test = test_data['simInData']
    # The simulator outputs are only used as templates for index and columns.
    simOutData_train = train_data['simOutData']
    simOutData_test = test_data['simOutData']

    std_preds_train = _rollout_predictions(std_models, simInData_train, simOutData_train)
    std_preds_test = _rollout_predictions(std_models, simInData_test, simOutData_test)
    grad_preds_train = _rollout_predictions(grad_models, simInData_train, simOutData_train)
    grad_preds_test = _rollout_predictions(grad_models, simInData_test, simOutData_test)

    gc.collect()

    return std_preds_train, std_preds_test, grad_preds_train, grad_preds_test
                

# Evaluation metrics
def rmse(pred, true):
    """Return the root-mean-square error between ``pred`` and ``true``.

    The squared errors are averaged in two passes (for DataFrames: per
    column, then across the column means) before the square root is taken.
    """
    squared_error = (pred - true) ** 2
    first_pass = squared_error.mean()
    return np.sqrt(first_pass.mean())

def corr(pred, true):
    """Return the Pearson correlation between the flattened values of ``pred`` and ``true``."""
    flat_pred = pred.to_numpy().ravel()
    flat_true = true.to_numpy().ravel()
    # pearsonr returns (correlation, p-value); only the correlation is needed.
    coefficient, _ = stats.pearsonr(flat_pred, flat_true)
    return coefficient

def evaluate(train_data, test_data, std_preds_train, std_preds_test, grad_preds_train, grad_preds_test):
    """Score every set of predictions against the simulator outputs.

    Returns:
        ``(std_train_res, std_test_res, grad_train_res, grad_test_res)``, each
        a nested dict indexed as ``[complexity]['rmse']`` and
        ``[complexity]['corr']``.
    """
    truth_train = train_data['simOutData']
    truth_test = test_data['simOutData']

    std_train_res, std_test_res, grad_train_res, grad_test_res = (
        defaultdict(dict) for _ in range(4)
    )

    # (result container, predictions per complexity, ground truth)
    jobs = (
        (std_train_res, std_preds_train, truth_train),
        (std_test_res, std_preds_test, truth_test),
        (grad_train_res, grad_preds_train, truth_train),
        (grad_test_res, grad_preds_test, truth_test),
    )

    for level in complexity_range:
        for results, preds, truth in jobs:
            results[level]['rmse'] = rmse(preds[level], truth)
            results[level]['corr'] = corr(preds[level], truth)

    return std_train_res, std_test_res, grad_train_res, grad_test_res

def pipe(curr_data, test_data, nEpoch, N_train, foldername, idx):
    """Run the full build / train / reload / predict / evaluate cycle for one dataset.

    Args:
        curr_data: Training data for this dataset.
        test_data: Test data shared by all datasets.
        nEpoch: Number of training epochs.
        N_train: Training-set size, used in folder and file names.
        foldername: Base folder for saved models; a per-dataset suffix is added.
        idx: Index of this dataset; returned unchanged as the last element.

    Returns:
        ``(std_train_res, std_test_res, grad_train_res, grad_test_res, idx)``.
    """
    ds_no = idx + 1
    print(f'Beginning iteration {ds_no}')

    # Each split gets its own folder so saved models are never mixed up.
    run_folder = f'{foldername}_ntrain{N_train}_{idx}'

    fresh_std, fresh_grad = build_models()

    print(f'Training models for dataset {ds_no}')
    started = time1()
    train_models(curr_data, nEpoch, fresh_std, fresh_grad, run_folder)
    print(f'Trained models for dataset {ds_no} in {time1() - started:.02f}s')

    # The trained models are read back from the files written during training.
    print(f'Loading models for dataset {ds_no}')
    std_models, grad_models = load_models(nEpoch, N_train, run_folder)

    print(f'Generating predictions for dataset {ds_no}')
    predictions = gen_preds(curr_data, test_data, std_models, grad_models)

    scores = evaluate(curr_data, test_data, *predictions)

    return (*scores, idx)
    
    

def _append_run_summary(frame):
    """Append 'mean' and 'std' columns computed across the existing run columns."""
    # Snapshot the run columns first so that 'std' is computed over the runs
    # only and not over the freshly added 'mean' column as well.
    runs = frame.astype(float)
    frame['mean'] = runs.mean(axis=1)
    frame['std'] = runs.std(axis=1)
    return frame


def run(N_train, nEpoch, no_ds, train_data, test_data, foldername='saved_models_cmplx_tests'):
    """Run the experiment on every training dataset and save the summary tables.

    Each dataset in ``train_data`` is processed by ``pipe`` in a thread pool.
    RMSE and correlation are collected per complexity and dataset, a mean and
    standard deviation across datasets are appended, and one CSV per
    (model type, split, metric) is written to a time-stamped results folder.

    Args:
        N_train: Number of training samples (used for naming and reporting).
        nEpoch: Number of training epochs.
        no_ds: Number of datasets; defines the per-run result columns.
        train_data: Mapping from dataset index to that dataset's training data.
        test_data: Test data; must contain ``'simInData'``.
        foldername: Base folder for saved models.
    """
    N_test = len(test_data['simInData'])

    print(f'Beginning process with:')
    print(f'-> Epochs: {nEpoch}')
    print(f'-> Complexity range: {complexity_range}')
    print(f'-> {N_train} training samples')
    print(f"-> {N_test} test samples")
    print(f'-> Averging {no_ds} datasets')

    model_kinds = ('std', 'grad')
    splits = ('train', 'test')
    metrics = ('rmse', 'corr')

    # One result table per (model kind, split, metric): rows are complexities,
    # columns are dataset indices.
    results = {
        (kind, split, metric): pd.DataFrame(index=complexity_range, columns=range(no_ds))
        for kind in model_kinds for split in splits for metric in metrics
    }

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(pipe, curr_data, test_data, nEpoch, N_train, foldername, i)
                   for i, curr_data in train_data.items()]

        for future in concurrent.futures.as_completed(futures):
            std_train_res, std_test_res, grad_train_res, grad_test_res, idx = future.result()

            per_run = {
                ('std', 'train'): std_train_res,
                ('std', 'test'): std_test_res,
                ('grad', 'train'): grad_train_res,
                ('grad', 'test'): grad_test_res,
            }
            for (kind, split), res in per_run.items():
                for c in complexity_range:
                    for metric in metrics:
                        results[(kind, split, metric)].loc[c, idx] = res[c][metric]

            # Report progress once per finished dataset.
            print(f'Completed iteration {int(idx)+1} of {no_ds}')

    c_time = datetime.now().strftime("%Y-%m-%d %H-%M")
    base_string = f'_ntrain{N_train}_ntest{N_test}_nEpoch{nEpoch}_Nruns{no_ds}_{c_time}.csv'

    res_folder = f'complexity_test_results_ntrain{N_train}_{c_time}'
    # exist_ok: the time stamp has minute resolution, so a quick re-run may
    # target a folder that already exists.
    os.makedirs(res_folder, exist_ok=True)

    for (kind, split, metric), frame in results.items():
        _append_run_summary(frame)
        frame.to_csv(f'{res_folder}/{kind}_{split}_{metric}{base_string}')

    # Zip up results so they can be downloaded (commented out for example NB).
#     subprocess.call(["zip", "-r", f"/content/{res_folder}.zip", f"/content/{res_folder}"])
#     files.download(f"/content/{res_folder}.zip")

    print(f'Completed. Saved results to folder {res_folder}')

In [None]:
def sub_sample_data(n_train, no_runs, all_train_data):
    """Draw ``no_runs`` random training subsets of size ``n_train`` from the pool.

    Each run ``ds`` seeds NumPy's global random generator with ``ds``, permutes
    the row positions of the whole pool and keeps the first ``n_train`` of
    them. The same positions are used for the inputs, the outputs and the
    output gradients so that rows stay aligned. When the pool has exactly
    ``n_train`` rows the result is a pure reordering of the pool.

    Args:
        n_train: Number of samples per subset.
        no_runs: Number of subsets to draw.
        all_train_data: Mapping with the DataFrames ``'simInData'``,
            ``'simOutData'`` and ``'simOutData_grad'``.

    Returns:
        A dict mapping the run index ``0..no_runs-1`` to a dict with the same
        three keys, each holding the sub-sampled DataFrame with a fresh
        ``0..n_train-1`` index.

    Raises:
        ValueError: If ``n_train`` exceeds the number of rows in the pool.
    """
    pool_size = len(all_train_data['simInData'])
    if n_train > pool_size:
        raise ValueError(
            f'Cannot draw {n_train} samples from a pool of {pool_size} rows.'
        )

    keys = ('simInData', 'simOutData', 'simOutData_grad')

    train_data = defaultdict(dict)
    for ds in range(no_runs):
        np.random.seed(ds)
        # Permute the whole pool (not just its first n_train rows) so that
        # different runs actually contain different samples.
        perm = np.random.permutation(pool_size)[:n_train]
        for key in keys:
            train_data[ds][key] = all_train_data[key].iloc[perm].reset_index(drop=True)

    return train_data

In [None]:
def lr_time_based_decay(epoch, lr):
    """Time-based learning-rate schedule for ``LearningRateScheduler``.

    Args:
        epoch: Index of the current epoch, as supplied by the callback.
        lr: Current learning rate, as supplied by the callback.

    Returns:
        The learning rate for this epoch, ``lr / (1 + decay * epoch)``, where
        ``decay`` is the module-level decay constant.
    """
    # Use the epoch passed in by the callback; the previous version read the
    # global total epoch count instead, which ignored the argument and shrank
    # the rate by the same fixed factor every epoch.
    return lr * 1 / (1 + decay * epoch)

In [None]:
# Location of 2D data, generated by function presented in Chapter 3, Section 3.2.1
data_folder = '/content/drive/MyDrive/Colab Notebooks/rp/2D process/2D_data'

# Training pool: inputs, outputs and output gradients
# (the gradient file is read with a two-row column header).
all_data_train = {
    'simInData': pd.read_csv(f'{data_folder}/x_train.csv', index_col=0),
    'simOutData': pd.read_csv(f'{data_folder}/y_train.csv', index_col=0),
    'simOutData_grad': pd.read_csv(f'{data_folder}/y_train_grad.csv', index_col=0, header=[0,1]),
}

# Test set: no need to read in test gradients.
all_data_test = {
    'simInData': pd.read_csv(f'{data_folder}/x_test.csv', index_col=0),
    'simOutData': pd.read_csv(f'{data_folder}/y_test.csv', index_col=0),
}

print(f'Successfully read in dataset.')

In [None]:
# Number of random draws from the training data pool to use
n_runs = 10

# Build n_runs training sets of 50 samples each from the training pool
train_data_ss = sub_sample_data(50, n_runs, all_data_train)

In [None]:
# Folder to save models to
foldername = '/content/drive/MyDrive/2D_models/saved_models_nTrain50_lrd/saved_models_cmplx_tests'
n_epoch = 300  # Epoch count handed to run() in the experiment cell
# NOTE(review): the model factories default to lr=0.001 and build_models does not
# override it, so this value only enters training through `decay` below.
initial_learning_rate = 0.01
decay = initial_learning_rate / n_epoch  # Decay constant read by lr_time_based_decay

In [None]:
# Range of model complexities to experiment with
# (each value multiplies base_hidden_size to give the number of nodes per hidden layer)
complexity_range = [2, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]

In [None]:
# Running this cell runs experiments
#run(50, n_epoch, n_runs, train_data_ss, all_data_test, foldername=foldername)