## This notebook shows how to do the following tasks:

### C. Train a predictor (surrogate)
### D. Compute MSE on test set

## C. Train a predictor (surrogate)

In [20]:
from functions import *

df = pd.read_csv('ABM_eval_Sobol_Islands_ss200_MC50').iloc[:, 1:]

Imported successfully


In [27]:
test_set = build_Xy_table(islands, 20, 10, ot.MonteCarloExperiment, 'MC_test_set', to_file = False)


In [31]:
X, Y = df.iloc[:, :7], df.iloc[:, 7:].mean(axis = 1)

# Kriging Surrogate
surrogate_model = GaussianProcessRegressor(random_state=0).fit(X, Y)

#  XGBoost Surrogate
# surrogate_model_XGB = fit_surrogate_model(X, Y)

test_set_X, test_set_Y = test_set.iloc[:, :7], test_set.iloc[:, 7:].mean(axis = 1)
predicted_Y = surrogate_model.predict(test_set_X.as_matrix())


In [5]:
df = pd.read_csv('ABM_eval_MC_Islands_ss20_MC10')

In [40]:
predicted_Y - test_set_Y.values
# test_set_Y.values

array([  1.86315520e+07,   3.40910250e+05,   4.26360160e+07,
         4.76392800e+06,  -4.42544000e+05,  -8.44841600e+06,
         1.89342750e+05,  -2.41632000e+05,   1.14364077e+10,
        -1.31816000e+06,  -3.54310400e+06,   5.78746400e+06,
        -1.47785300e+06,   1.88317440e+07,   1.20091812e+06,
        -7.63596000e+05,  -6.21594600e+06,  -1.38568750e+04,
         5.55430759e+16,   2.19952480e+07])

In [21]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import openturns as ot
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np

# load ABM file
import islands

def build_Xy_table(ABM, size_sample, size_MC, SequenceFunction, SequenceFunctionString, to_file = True, **kwargs):
    
    problem_ABM = ABM.problem()

    SequenceFunction = ot.SobolSequence(problem_ABM['num_vars'], **kwargs)

    set_X = rescale_sample(np.array(SequenceFunction.generate(size_sample)), problem_ABM['bounds'])
    
    # Evaluate
    set_y = [np.array(Parallel(n_jobs=-1)(delayed(ABM.model)(p) for p in set_X)) for i in range(size_MC)]
    
    # as DataFrame
    df = pd.DataFrame(set_X, columns = problem_ABM['names']).join(pd.DataFrame(set_y).T.add_prefix('evaluation_'))
    
    if to_file:
        # save file
        filename = 'ABM_eval_'+SequenceFunctionString+'_'+problem_ABM['abm_name']+'_ss'+str(size_sample)+'_MC'+str(size_MC)
        df.to_csv(filename, index = False)
        print 'Saved file '+filename
    else:
        return df
    
def rescale_sample(sample, bounds):
    d = [b[1] - b[0] for b in bounds]
    m = [min(b) for b in bounds]
    rescaled_sample = m + (d * sample)
    return rescaled_sample