# Race to low rms

Import latin-hypercube test set

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import healpy as hp
import utils_intensity_map as uim
import utils_deck_generation as idg
import netcdf_read_write as nrw
import training_data_generation as tdg
import tf_neural_network as tfnn
import neural_network_generation as nng
import time
import os
import shutil
%matplotlib inline
plt.ion()
# Run directory name; passed to tdg.define_system_params in the cells below.
run_dir = "Data_221028_symmetric_hemispheres"
# Passed to nng.define_nn_params (presumably the number of neural networks - TODO confirm).
num_nn = 1

# File name stored in sys_params["trainingdata_filename"] before the data import.
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"
# NOTE(review): imap_nside is not referenced anywhere else in this notebook;
# presumably a HEALPix nside for intensity maps - confirm or remove.
imap_nside = 256
# Used as the x-axis length of the mode plot and passed to run_ifriit_input.
LMAX = 30

In [None]:
# Build the system parameters and point them at the stored training-data file.
sys_params = tdg.define_system_params(run_dir)
sys_params["trainingdata_filename"] = filename_flipped_trainingdata

# Load inputs (X_all), outputs (Y_all) and average powers for every stored example.
nn_params = nng.define_nn_params(num_nn)
X_all, Y_all, avg_powers_all, nn_params = nng.import_training_data(nn_params, sys_params)
#nn_dataset = nng.seperate_test_set(X_all, Y_all, avg_powers_all, nn_params)

# X_all is indexed as [input, example] throughout this notebook.
num_examples = np.shape(X_all)[1]
print(num_examples)
num_inputs = np.shape(X_all)[0]

In [None]:
# Show the first example, then report the best example under two metrics:
# the per-example mean of Y_all and the per-example root-sum-square of Y_all.
print(X_all[:,0], Y_all[:,0])
for per_example_metric in (np.mean(Y_all, axis=0), np.sqrt(np.sum(Y_all**2, axis=0))):
    mindex = np.argmin(per_example_metric)
    best_column = Y_all[:,mindex]
    print(mindex)
    print(np.sum(best_column))
    print(np.sqrt(np.sum(best_column**2)))
# mindex is left pointing at the root-sum-square minimum (the last metric above).

In [None]:
# Plot the mode amplitudes (in percent) of the example selected by mindex.
fig = plt.figure()
ax = plt.axes()
mode_numbers = np.arange(LMAX)
amplitude_percent = Y_all[:,mindex] * 100.0
ax.plot(mode_numbers, amplitude_percent)
tick_step = int(LMAX/5)
ax.set_xticks(range(0, LMAX+1, tick_step))
ax.set_xlim([0, LMAX])
ax.set_title("Unweighted Modes")
ax.set_xlabel("l mode")
ax.set_ylabel(r"amplitude ($\%$)");
#plt.savefig(sys_params["figure_location"]+"/unweighted_modes_"+str(mindex)+"original" + sys_params["plot_file_type"], dpi=300, bbox_inches='tight')


In [None]:
def run_ifriit_input(num_examples, X_all, run_dir, LMAX, num_nn):
    """Create run files for the given inputs, generate the data and read it back.

    Parameters
    ----------
    num_examples : passed to ``tdg.define_dataset_params``.
    X_all : stored as ``dataset_params["Y_train"]`` before the run files are created.
    run_dir : passed to ``tdg.define_system_params``.
    LMAX : passed to ``nng.import_training_data_reversed``.
    num_nn : passed to ``nng.define_nn_params``.

    Returns
    -------
    tuple
        The second and third items returned by
        ``nng.import_training_data_reversed`` (outputs and average powers).
    """
    dataset_cfg, facility = tdg.define_dataset_params(num_examples)
    dataset_cfg["Y_train"] = X_all

    system_cfg = tdg.define_system_params(run_dir)
    # Flag is switched off before the run files are created.
    system_cfg["run_clean"] = False

    dataset_cfg = idg.create_run_files(dataset_cfg, system_cfg, facility)
    tdg.generate_training_data(dataset_cfg, system_cfg, facility)

    network_cfg = nng.define_nn_params(num_nn)
    imported = nng.import_training_data_reversed(network_cfg, system_cfg, LMAX)
    _inputs, outputs, avg_powers, _network_cfg = imported
    return outputs, avg_powers

## Method 1, Brute force

In [None]:
##

## Method 2, Gradient descent

The partial derivative is determined using a grid of 2*16=32 points (2 points in every dimension) around the current minimum. These points can be evaluated with either a NN or Ifriit, depending on speed.

In [None]:
def determine_gradient(point_dict, learning_rate, pbounds, num_inputs, func=None):
    """Estimate the gradient of the objective at ``point_dict`` by finite differences.

    For each input ``"x<ii>"`` the objective is evaluated one step below and one
    step above the current value. A centred difference is used when both
    neighbours lie inside the bounds; a one-sided (forward or backward)
    difference against the centre value is used when exactly one neighbour is
    out of bounds. If both neighbours are out of bounds the component is set
    to 0.0 and a warning is printed.

    Parameters
    ----------
    point_dict : dict
        Current point, keyed ``"x0"`` .. ``"x<num_inputs-1>"``. Not modified.
    learning_rate : float
        Used as the finite-difference step size.
    pbounds : dict
        Maps each key to a ``(lower, upper)`` pair.
    num_inputs : int
        Number of inputs to differentiate.
    func : callable, optional
        Objective called as ``func(**point)``. Defaults to the notebook-level
        ``black_box_function``.

    Returns
    -------
    dict
        Gradient component for every ``"x<ii>"`` key.
    """
    if func is None:
        # Looked up at call time so the notebook can define it in a later cell.
        func = black_box_function

    f_centre = None  # evaluated at most once, and only if a one-sided difference needs it
    grad = {}
    for ii in range(num_inputs):
        key = "x"+str(ii)
        lower, upper = pbounds[key][0], pbounds[key][1]

        # Work on a copy so the caller's point and the other coordinates stay untouched.
        neighbour = dict(point_dict)

        neighbour[key] = point_dict[key] - learning_rate
        minus_out = (neighbour[key] < lower) or (neighbour[key] > upper)
        if not minus_out:
            f_minus = func(**neighbour)

        neighbour[key] = point_dict[key] + learning_rate
        plus_out = (neighbour[key] < lower) or (neighbour[key] > upper)
        if not plus_out:
            f_plus = func(**neighbour)

        if minus_out and plus_out:
            grad[key] = 0.0
            print("Broken gradients!")
            continue

        if not minus_out and not plus_out:
            grad[key] = (f_plus - f_minus) / (2.0 * learning_rate)
            continue

        if f_centre is None:
            f_centre = func(**point_dict)
        if minus_out:
            # Lower neighbour unavailable: forward difference.
            grad[key] = (f_plus - f_centre) / learning_rate
        else:
            # Upper neighbour unavailable: backward difference.
            grad[key] = (f_centre - f_minus) / learning_rate

    return grad



def grad_descent(start_point, grad, learning_rate, pbounds, num_inputs):
    """Take one gradient-descent step and clip the result to the bounds.

    Parameters
    ----------
    start_point : dict
        Current point, keyed ``"x0"`` .. ``"x<num_inputs-1>"``. Not modified.
    grad : dict
        Gradient component for each key.
    learning_rate : float
        Step multiplier applied to the gradient.
    pbounds : dict
        Maps each key to a ``(lower, upper)`` pair.
    num_inputs : int
        Number of inputs to update.

    Returns
    -------
    dict
        A new dict holding the updated, clipped point.
    """
    # Copy so the caller's dict (and any earlier saved reference to it) is not overwritten.
    next_point = dict(start_point)
    for ii in range(num_inputs):
        key = "x"+str(ii)
        lower, upper = pbounds[key][0], pbounds[key][1]
        stepped = start_point[key] - learning_rate * grad[key]
        if stepped < lower:
            stepped = lower
        elif stepped > upper:
            stepped = upper
        next_point[key] = stepped

    return next_point

In [None]:
# Passed to determine_gradient and grad_descent as their learning_rate argument.
learning_rate = 0.01
# Number of gradient-descent iterations run by the loop in the next cell.
n_iter = 2

In [None]:
# Start the descent from the example with the lowest root-sum-square of Y_all.
mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
next_point = {}
for ii in range(num_inputs):
    next_point["x"+str(ii)] = X_all[ii, mindex]

save_points = [None] * n_iter
stencil_size = num_inputs * 2 + 1
for ieval in range(n_iter):
    X_stencil = np.zeros((num_inputs, stencil_size))

    # NOTE(review): gradient_stencil is not defined anywhere in this notebook, and
    # X_stencil is still all zeros when it is passed to run_ifriit_input below.
    gradient_stencil()
    Y_stencil, avg_powers_stencil = run_ifriit_input(stencil_size, X_stencil, run_dir, LMAX, num_nn)
    # NOTE(review): pbounds is first assigned in the Method 6 cell, and
    # determine_gradient relies on black_box_function, which this notebook never
    # defines. Y_stencil is computed above but not used for the gradient.
    grad = determine_gradient(next_point, learning_rate, pbounds, num_inputs)

    next_point = grad_descent(next_point, grad, learning_rate, pbounds, num_inputs)

    # Pack the new point into a column so it can be evaluated and appended to X_all.
    X_new = np.zeros((num_inputs, 1))
    for ii in range(num_inputs):
        X_new[ii] = next_point["x"+str(ii)]

    Y_new, avg_powers_new = run_ifriit_input(1, X_new, run_dir, LMAX, num_nn)
    X_all = np.hstack((X_all, X_new))
    Y_all = np.hstack((Y_all, Y_new))
    avg_powers_all = np.hstack((avg_powers_all, avg_powers_new))
    # Store a copy so later iterations cannot overwrite this entry.
    save_points[ieval] = dict(next_point)
    print("Iteration {} value:{}".format(ieval, black_box_function(**next_point)))

## Method 3, Use surrogate NN to pick low RMS from random inputs

## Method 4, Use inverse NN to identify low rms by inputting other low rms cases

## Method 5, Genetic algorithm

Iterative procedure taking best features of first generation. Mutate and mix inputs between the best and produce subsequent generation.

## Method 6, Bayesian optimization (Ifriit is the high-quality source and the NN is the low-quality source)

Gaussian process surrogate and Bayesian optimization used with multiple sources of information. First we create a Bayesian model with the true data points and select new simulations based on that. The model (Kriging method?) could use a "Gaussian process approximation" to reduce computational expense.

In [None]:
import wrapper_bayesian_optimizer as wbo

# Every example already loaded is handed to the optimizer as an initial point.
init_points = num_examples
# Number of optimizer suggestions to evaluate (rebinds the value used for Method 2).
n_iter = 1000
# NOTE: rebinds run_dir, so later run_ifriit_input calls use this directory.
run_dir = "Data_221102a_symhem_1kex_bo"
# Prefix of the directory that each finished "run_0" is renamed to.
iter_dir = "iter_"
# File name used when the accumulated data is saved inside run_dir.
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"

In [None]:
# Every input "x<ii>" is bounded to the interval (0, 1).
pbounds = {}
for ii in range(num_inputs):
    pbounds["x{}".format(ii)] = (0., 1.)

# The optimizer reports a maximum, so the root-sum-square is negated to minimise it.
rms_per_example = np.sqrt(np.sum(Y_all**2, axis=0))
target = -rms_per_example
print(num_inputs, init_points, np.shape(target))

# Initialise the optimizer with the existing examples and their targets.
optimizer, utility = wbo.initialize_unknown_func(X_all, target, pbounds, init_points, num_inputs)
print(optimizer.max)

In [None]:
# Main suggest -> evaluate -> register loop of the Bayesian optimisation.
# Raise `start` to resume numbering the iteration directories after an interrupted run.
start = 0

tic = time.perf_counter()
for ieval in range(start, n_iter):
    next_point = optimizer.suggest(utility)

    # Pack the suggested point into a single column for run_ifriit_input.
    X_new = np.zeros((num_inputs, 1))
    for ii in range(num_inputs):
        X_new[ii] = next_point["x"+str(ii)]

    Y_new, avg_powers_new = run_ifriit_input(1, X_new, run_dir, LMAX, num_nn)

    X_all = np.hstack((X_all, X_new))
    Y_all = np.hstack((Y_all, Y_new))
    avg_powers_all = np.hstack((avg_powers_all, avg_powers_new))

    # Move the finished run aside so the next evaluation can reuse "run_0".
    os.rename(run_dir + "/run_0", run_dir + "/" + iter_dir + str(ieval))

    #target = black_box_function(**next_point)
    # Negated because the optimizer maximises its target.
    target = -np.sqrt(np.sum(Y_new**2))
    try:
        optimizer.register(params=next_point, target=target)
    except Exception as err:
        # Best effort: keep iterating, but report why the point was rejected.
        # KeyboardInterrupt / SystemExit are deliberately not caught here.
        print("Broken input!", next_point, target, err)
    # Every 10 evaluations: report elapsed time, save the data and print the current best.
    if (ieval+1) % 10 == 0:
        toc = time.perf_counter()
        print("{:0.4f} seconds".format(toc - tic))
        print(str(ieval+1) + " Bayesian data points added, saving to .nc")
        filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
        nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
        print(optimizer.max)
        mindex = np.argmin(np.mean(Y_all, axis=0))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
        mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
print(next_point)

In [None]:
# Ask the optimizer for one more candidate point and display it (no evaluation here).
next_point = optimizer.suggest(utility)
print(next_point)

In [None]:
# Manual recovery cell: register an already-finished "run_0" with the optimizer.
# Set add_extra_point to True and ieval to the index used for the renamed directory.
add_extra_point = False
ieval = 0

if add_extra_point:
    sys_params = tdg.define_system_params(run_dir)

    # No run files are created here; the existing results are only read back.
    sys_params["run_clean"] = False
    nn_params = nng.define_nn_params(num_nn)
    # NOTE(review): this rebinds avg_powers_all to the imported values instead of
    # appending to it, and X_all / Y_all are not extended - confirm this is intended.
    X_new, Y_new, avg_powers_all, nn_params = nng.import_training_data_reversed(nn_params, sys_params, LMAX)

    os.rename(run_dir + "/run_0", run_dir + "/" + iter_dir + str(ieval))

    # Negated because the optimizer maximises its target.
    target = -np.sqrt(np.sum(Y_new**2))
    try:
        optimizer.register(params=next_point, target=target)
    except Exception as err:
        # Report the reason; KeyboardInterrupt / SystemExit are not swallowed.
        print("Broken input!", next_point, target, err)

In [None]:
# Print how many points the optimizer holds, then list the first seven of them.
registered = optimizer.res
print(len(registered))
for i, entry in enumerate(registered):
    print("Iteration {}: \n\t{}".format(i, entry))
    if i>5:
        break


In [None]:
# Report the optimizer's best point, the first example, and the best example under
# two metrics: the per-example mean and the per-example root-sum-square of Y_all.
print(optimizer.max)
print(X_all[:,0], Y_all[:,0])
for per_example_metric in (np.mean(Y_all, axis=0), np.sqrt(np.sum(Y_all**2, axis=0))):
    mindex = np.argmin(per_example_metric)
    best_column = Y_all[:,mindex]
    print(mindex)
    print(np.sum(best_column))
    print(np.sqrt(np.sum(best_column**2)))

## Method 7, Grid search algorithm

Split the entire search space into a grid (start coarse, with 2 or 3 cells per dimension; 3^16 ≈ 43M cells). Evaluate each cell based on the data points within it or on its 8 nearest neighbours.

## Method 8, Network search algorithm

Find a gradient between all data points, use this information to initialize gradient descent

## Method 9, Principal Component Analysis (PCA)

Combine with gradient descent for faster convergence? Enables plotting of dataset in 2D

## Method 10, Transfer Learning 

Generate low quality large dataset (1-50M examples?) using surrogate NN and use this for transfer learning. This might help to evaluate at what stage transfer learning becomes effective (can we use it with a dataset of 1000 or 10000?)