# Race to low rms

Import latin-hypercube test set

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import healpy as hp
import utils_intensity_map as uim
import utils_deck_generation as idg
import netcdf_read_write as nrw
import training_data_generation as tdg
import tf_neural_network as tfnn
import neural_network_generation as nng
import time
import os
import shutil
%matplotlib inline
# Turn on matplotlib interactive mode.
plt.ion()
# Run directory handed to tdg.define_system_params when loading the initial data set.
run_dir = "Data_input"
# Forwarded to nng.define_nn_params.
num_nn = 1

# File name stored in sys_params["trainingdata_filename"] and reused when results are saved.
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"
# Not referenced elsewhere in the visible notebook; presumably the healpy map resolution -- TODO confirm.
imap_nside = 256
# Forwarded to nng.import_training_data_reversed and used as the x-range of the mode plot.
LMAX = 30
# Copied into dataset_params["hemisphere_symmetric"] by run_ifriit_input.
hemisphere_symmetric = True

In [None]:
# Build the system and neural-network parameter sets for the input run directory.
sys_params = tdg.define_system_params(run_dir)
nn_params = nng.define_nn_params(num_nn)
# Point the loader at the "flipped" training-data file.
sys_params["trainingdata_filename"] = filename_flipped_trainingdata
# Later cells index X_all and Y_all with one column per example.
X_all, Y_all, avg_powers_all, nn_params = nng.import_training_data(nn_params, sys_params)

In [None]:
# Shapes of the first example's input vector and mode vector.
print(np.shape(X_all[:,0]), np.shape(Y_all[:,0]))
# Example whose modes have the lowest mean value.
mindex = np.argmin(np.mean(Y_all, axis=0))
print(mindex)
print(np.sum(Y_all[:,mindex]))
print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
# Example whose modes have the lowest root-sum-of-squares; this is the mindex left for later cells.
mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
print(mindex)
print(np.sum(Y_all[:,mindex]))
print(np.sqrt(np.sum(Y_all[:,mindex]**2)))

# Data-set dimensions: columns are examples, rows are inputs (X_all) or modes (Y_all).
num_examples = np.shape(X_all)[1]
print(num_examples)
num_inputs = np.shape(X_all)[0]
num_modes = np.shape(Y_all)[0]
print(num_inputs, num_modes)

In [None]:
# Plot the mode amplitudes, scaled by 100, of the example selected by mindex.
fig = plt.figure()
ax = plt.axes()
# NOTE(review): the x-axis has LMAX points, so this assumes Y_all has exactly LMAX rows -- confirm.
plt.plot(np.arange(LMAX), Y_all[:,mindex] * 100.0)
ax.set_xticks(range(0, LMAX+1, int(LMAX/5)))
plt.xlim([0, LMAX])
plt.title("Unweighted Modes")
plt.xlabel("l mode")
plt.ylabel(r"amplitude ($\%$)");

In [None]:
def run_ifriit_input(num_examples, X_all, run_dir, LMAX, num_nn, num_parallel, hemisphere_symmetric):
    """Create fresh run files for a batch of input points, run them and read back the results.

    Parameters
    ----------
    num_examples : passed to ``tdg.define_dataset_params``.
    X_all : input points; stored under ``dataset_params["Y_train"]``.
    run_dir : directory passed to ``tdg.define_system_params``.
    LMAX : forwarded to ``nng.import_training_data_reversed``.
    num_nn : forwarded to ``nng.define_nn_params``.
    num_parallel : stored as ``sys_params["num_processes"]``.
    hemisphere_symmetric : stored as ``dataset_params["hemisphere_symmetric"]``.

    Returns
    -------
    The second and third items returned by ``nng.import_training_data_reversed``
    (named ``Y_all`` and ``avg_powers_all`` by the callers).
    """
    batch_params, facility_spec = tdg.define_dataset_params(num_examples)
    batch_params["hemisphere_symmetric"] = hemisphere_symmetric
    batch_params["Y_train"] = X_all

    system = tdg.define_system_params(run_dir)
    system["num_processes"] = num_parallel
    # Create new run files
    system["run_clean"] = True

    batch_params = idg.create_run_files(batch_params, system, facility_spec)
    tdg.generate_training_data(batch_params, system, facility_spec)

    network = nng.define_nn_params(num_nn)
    _, modes, avg_powers, _ = nng.import_training_data_reversed(network, system, LMAX)
    return modes, avg_powers

## Method 1, Brute force

In [None]:
##

## Method 2, Gradient descent

The partial derivatives are estimated from a stencil of 2*16 = 32 points (two per input dimension, one on either side) around the current minimum, plus the centre point itself. These points can be evaluated with either a NN or Ifriit, depending on speed.

In [None]:
def gradient_stencil(X_new, learning_rate, pbounds, num_inputs, stencil_size):
    """Build the finite-difference stencil around the point in ``X_new[:, 0]``.

    Column 0 is the centre point. For input ``dim``, column ``2*dim + 1`` is
    the centre shifted by ``-learning_rate`` along that input and column
    ``2*dim + 2`` is the centre shifted by ``+learning_rate``. Shifted
    coordinates are clamped to ``pbounds[dim, 0]`` / ``pbounds[dim, 1]`` so the
    stencil never leaves the domain.

    Returns an array of shape ``(num_inputs, stencil_size)``.
    """
    centre = X_new[:, 0]
    X_stencil = np.zeros((num_inputs, stencil_size))
    X_stencil[:, 0] = centre

    for dim in range(num_inputs):
        col_minus = 2 * dim + 1
        col_plus = col_minus + 1

        # Step down along this input, but no further than the lower bound.
        X_stencil[:, col_minus] = centre
        X_stencil[dim, col_minus] = max(X_new[dim, 0] - learning_rate, pbounds[dim, 0])

        # Step up along this input, but no further than the upper bound.
        X_stencil[:, col_plus] = centre
        X_stencil[dim, col_plus] = min(X_new[dim, 0] + learning_rate, pbounds[dim, 1])

    return X_stencil



def determine_gradient(X_stencil, target, learning_rate, pbounds, num_inputs):
    """Estimate the gradient of ``target`` at the stencil centre by finite differences.

    The stencil layout is: column 0 is the centre point; for input ``ii``,
    column ``2*ii + 1`` is the point displaced downwards along that input and
    column ``2*ii + 2`` the point displaced upwards.

    Parameters
    ----------
    X_stencil : array (num_inputs, >= 2*num_inputs + 1) of stencil coordinates.
    target : objective value for each stencil column.
    learning_rate : nominal stencil step. Kept for interface compatibility;
        the divisor is taken from the actual stencil coordinates so that
        points clamped onto a bound are handled correctly.
    pbounds : array (num_inputs, 2) of lower/upper bounds.
    num_inputs : number of input dimensions.

    Returns
    -------
    1-D array of length ``num_inputs`` with the estimated partial derivatives.
    """
    grad = np.zeros(num_inputs)
    f_centre = target[0]

    for ii in range(num_inputs):
        col_minus = 2 * ii + 1
        col_plus = col_minus + 1

        x_centre = X_stencil[ii, 0]
        x_minus = X_stencil[ii, col_minus]
        x_plus = X_stencil[ii, col_plus]

        # A neighbour is usable only if it lies inside the domain.
        minus_usable = not (x_minus < pbounds[ii, 0])
        plus_usable = not (x_plus > pbounds[ii, 1])

        if minus_usable and plus_usable:
            # Centred difference. If a neighbour was clamped onto a bound the
            # spacing is smaller than 2*learning_rate, hence the explicit run.
            rise = target[col_plus] - target[col_minus]
            run = x_plus - x_minus
        elif plus_usable:
            # Forward difference: the lower neighbour is outside the domain.
            rise = target[col_plus] - f_centre
            run = x_plus - x_centre
        elif minus_usable:
            # Backward difference: the upper neighbour is outside the domain.
            rise = f_centre - target[col_minus]
            run = x_centre - x_minus
        else:
            # Neither neighbour is usable.
            rise = 0.0
            run = 0.0

        if run > 0.0:
            grad[ii] = rise / run
        else:
            # Degenerate stencil (no usable pair, or zero spacing).
            grad[ii] = 0.0
            print("Broken gradients!")

    return grad



def grad_descent(X_old, grad, step_size, pbounds, num_inputs, num_steps_per_iter):
    """Generate candidate points along the negative gradient from ``X_old[:, 0]``.

    ``num_steps_per_iter`` step lengths are spaced logarithmically between
    ``10**step_size[0]`` and ``10**step_size[1]``. Candidate ``k`` is
    ``X_old[:, 0] - step_k * grad`` with every coordinate limited to the range
    given by ``pbounds[:, 0]`` (lower) and ``pbounds[:, 1]`` (upper).

    Returns an array of shape ``(num_inputs, num_steps_per_iter)``, one
    candidate per column.
    """
    learning_rates = np.logspace(step_size[0], step_size[1], num_steps_per_iter)

    # Column vectors so that everything broadcasts against the row of step lengths.
    origin = X_old[:num_inputs, [0]]
    slope = np.asarray(grad)[:num_inputs, np.newaxis]
    lower = pbounds[:num_inputs, 0][:, np.newaxis]
    upper = pbounds[:num_inputs, 1][:, np.newaxis]

    trial = origin - learning_rates[np.newaxis, :] * slope

    X_new = np.zeros((num_inputs, num_steps_per_iter))
    # Pull coordinates that overshoot a bound back onto that bound.
    X_new[:, :] = np.where(trial < lower, lower, np.where(trial > upper, upper, trial))
    return X_new

In [None]:
# Base-10 exponent of the finite-difference step (learning_rate = 10**learn_exp).
learn_exp = -1.0
# Maximum number of gradient-descent iterations.
n_iter = 10
run_dir = "Data_output"
# Prefix of the per-iteration directories created inside run_dir.
iter_dir = "iter_"
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"
num_parallel = 17
# Centre point plus one minus/plus pair per input.
stencil_size = num_inputs * 2 + 1
# Number of candidate step lengths evaluated along the descent direction.
num_steps_per_iter = num_parallel - 1

In [None]:
# Finite-difference step, and the log10 range of step lengths tried along the descent direction.
learning_rate = 10.0**learn_exp
step_size = np.array([learn_exp - 1.0, learn_exp + 1.0])

# Single-column holders for the current best point, its modes and its average power.
X_old = np.zeros((num_inputs, 1))
Y_old = np.zeros((num_modes, 1))
avg_powers_old = np.array([0.0])

# Start from the stored example with the lowest root-sum-of-squares of its modes.
mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
X_old[:,0] = X_all[:, mindex]

# Every input is bounded to [0, 1].
pbounds = np.zeros((num_inputs, 2))
pbounds[:,1] = 1.0
tic = time.perf_counter()
for ieval in range(n_iter):
    
    # The last two stored points are identical: shrink the step by half a decade,
    # and save and stop once it falls below 1e-4.
    # NOTE(review): on the first pass this compares the last two columns of the loaded data set.
    if (sum(abs(X_all[:,-1] - X_all[:,-2])) <= 0.0):
        learn_exp = learn_exp-0.5
        learning_rate = 10.0**(learn_exp)
        step_size = step_size - 0.5
        print("Reducing step size to: " + str(learning_rate))
        if learning_rate < 1.0e-4:
            # NOTE(review): the message says "Bayesian" although this is the gradient-descent loop.
            print(str(ieval+1) + " Bayesian data points added, saving to .nc")
            print("Early stopping due to repeated results")
            filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
            nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
            break
    
    # Evaluate the stencil (centre plus a minus/plus pair per input) around the current point.
    X_stencil = gradient_stencil(X_old, learning_rate, pbounds, num_inputs, stencil_size)
    Y_stencil, avg_powers_stencil = run_ifriit_input(stencil_size, X_stencil, run_dir, LMAX, num_nn, num_parallel, hemisphere_symmetric)
    target_stencil = np.sqrt(np.sum(Y_stencil**2, axis=0))
    mindex_stencil = np.argmin(target_stencil)
    print("The minimum in the stencil", np.min(target_stencil), mindex_stencil)
    print("The previous value was: ", target_stencil[0], 0)
    print(X_stencil[:,0])
    # Keep the run directory of the best stencil point as this iteration's directory.
    os.rename(run_dir + "/run_" + str(mindex_stencil), run_dir + "/" + iter_dir + str(ieval))
    
    # Estimate the gradient from the stencil and generate candidates along the descent direction.
    grad = determine_gradient(X_stencil, target_stencil, learning_rate, pbounds, num_inputs)
    X_new = grad_descent(X_old, grad, step_size, pbounds, num_inputs, num_steps_per_iter)
    
    Y_new, avg_powers_new = run_ifriit_input(num_steps_per_iter, X_new, run_dir, LMAX, num_nn, num_parallel, hemisphere_symmetric)
    target_downhill = np.sqrt(np.sum(Y_new**2, axis=0))
    mindex_downhill = np.argmin(target_downhill)
    print("The minimum downhill", np.min(target_downhill), mindex_downhill)
    
    # Adopt the best downhill candidate if it beats the best stencil point (replacing the kept
    # directory); otherwise fall back to the best stencil point.
    if target_downhill[mindex_downhill] < target_stencil[mindex_stencil]:
        shutil.rmtree(run_dir + "/" + iter_dir + str(ieval))
        os.rename(run_dir + "/run_" + str(mindex_downhill), run_dir + "/" + iter_dir + str(ieval))
        X_old[:,0] = X_new[:,mindex_downhill]
        Y_old[:,0] = Y_new[:,mindex_downhill]
        avg_powers_old = avg_powers_new[mindex_downhill]
    else:
        X_old[:,0] = X_stencil[:,mindex_stencil]
        Y_old[:,0] = Y_stencil[:,mindex_stencil]
        avg_powers_old = avg_powers_stencil[mindex_stencil]
    
    # Append the accepted point to the data set.
    X_all = np.hstack((X_all, X_old))
    Y_all = np.hstack((Y_all, Y_old))
    avg_powers_all = np.hstack((avg_powers_all, avg_powers_old))
    
    print("Iteration {} with learn rate {} value:{}".format(ieval, learning_rate, np.sqrt(np.sum(Y_old**2))))
    print(X_old[:,0])
    
    # Stop if the newly appended point is worse than the previously stored one.
    # NOTE(review): on the first iteration the previous column is the last entry of the loaded
    # data set, not the starting point -- confirm that is intended.
    if (np.sqrt(np.sum(Y_all[:,-1]**2)) > np.sqrt(np.sum(Y_all[:,-2]**2))):
        print("Bug! Ascending slope!")
        print(np.sqrt(np.sum(Y_all[:,-1]**2)), np.sqrt(np.sum(Y_all[:,-2]**2)))
        break
    
    # Every 10th iteration: report the elapsed time, save the data set and print the current best examples.
    if (ieval+1)%10 <= 0.0:
        toc = time.perf_counter()
        print("{:0.4f} seconds".format(toc - tic))
        # NOTE(review): the message says "Bayesian" although this is the gradient-descent loop.
        print(str(ieval+1) + " Bayesian data points added, saving to .nc")
        filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
        nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
        mindex = np.argmin(np.mean(Y_all, axis=0))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
        mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
# Best-effort removal of the leftover per-point run directories.
for isten in range(stencil_size):
    stale_run = run_dir + "/run_" + str(isten)
    try:
        shutil.rmtree(stale_run)
    except FileNotFoundError:
        # Directory was renamed to an iteration directory or never created.
        print("File: " + stale_run + ", already deleted.")
    except OSError as err:
        # Keep going, but report the real reason instead of claiming it was already deleted.
        print("File: " + stale_run + ", could not be deleted: " + str(err))

In [None]:
# Overwrite the saved file, keeping only the first nex examples.
# (Disabled: the statements below are wrapped in a string literal.)
""""
nex = 10
print(np.shape(X_all[:,:nex]))
print(np.shape(Y_all[:,:nex]))
print(np.shape(avg_powers_all[:nex]))

X_all = X_all[:,:nex]
Y_all = Y_all[:,:nex]
avg_powers_all = avg_powers_all[:nex]

filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
"""

## Method 3, Use surrogate NN to pick low RMS from random inputs

## Method 4, Use inverse NN to identify low rms by inputting other low rms cases

## Method 5, Genetic algorithm

Iterative procedure taking best features of first generation. Mutate and mix inputs between the best and produce subsequent generation.

## Method 6, Bayesian optimization (ifriit is high quality source and NN is low quality)

A Gaussian process surrogate and Bayesian optimization are used with multiple sources of information. First we create a Bayesian model from the true data points and select new simulations based on that. The model (Kriging method?) could use a "Gaussian process approximation" to reduce computational expense.

In [None]:
import wrapper_bayesian_optimizer as wbo

# Settings for the Bayesian-optimization run; several of these rebind names
# that the gradient-descent cells also use.
# Number of already-available examples, passed to wbo.initialize_unknown_func.
init_points = num_examples
# Upper limit of the optimization loop.
n_iter = 10
run_dir = "Data_output"
# Prefix of the per-iteration directories created inside run_dir.
iter_dir = "iter_"
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"
num_parallel = 10 # Not currently run parallel

In [None]:
# Search bounds keyed "x0", "x1", ...: every input ranges over [0, 1].
pbounds = {}
for ii in range(num_inputs):
    pbounds["x"+str(ii)] = (0., 1.)

# Objective for every stored example: the negated root-sum-of-squares of its modes.
target = -np.sqrt(np.sum(Y_all**2, axis=0)) # Critical to make negative (min not max)
print(num_inputs, init_points, np.shape(target))

# Build the optimizer and utility from the existing examples, then print optimizer.max.
optimizer, utility = wbo.initialize_unknown_func(X_all, target, pbounds, init_points, num_inputs)
print(optimizer.max)

In [None]:
# First iteration index; raise it to resume numbering of the iteration directories.
start = 0

tic = time.perf_counter()
for ieval in range(start, n_iter):
    # Ask the optimizer which point to simulate next.
    next_point = optimizer.suggest(utility)

    # Repack the suggested parameters (keyed "x0", "x1", ...) into a column vector.
    X_new = np.zeros((num_inputs, 1))
    for ii in range(num_inputs):
        X_new[ii] = next_point["x"+str(ii)]

    # Simulate the single suggested point.
    Y_new, avg_powers_new = run_ifriit_input(1, X_new, run_dir, LMAX, num_nn, num_parallel, hemisphere_symmetric)

    # Append the new point to the data set.
    X_all = np.hstack((X_all, X_new))
    Y_all = np.hstack((Y_all, Y_new))
    avg_powers_all = np.hstack((avg_powers_all, avg_powers_new))

    # Keep the run directory under this iteration's name.
    os.rename(run_dir + "/run_0", run_dir + "/" + iter_dir + str(ieval))

    #target = black_box_function(**next_point)
    # Negated so that maximising the target minimises the root-sum-of-squares.
    target = -np.sqrt(np.sum(Y_new**2))
    try:
        optimizer.register(params=next_point, target=target)
    except Exception as err:
        # Best effort: carry on with the next suggestion, but report why registration
        # failed. Exception (not a bare except) lets KeyboardInterrupt through.
        print("Broken input!", next_point, target, repr(err))

    # Every 10th point: report the elapsed time, save the data set and print the current best examples.
    if (ieval+1) % 10 == 0:
        toc = time.perf_counter()
        print("{:0.4f} seconds".format(toc - tic))
        print(str(ieval+1) + " Bayesian data points added, saving to .nc")
        filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
        nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
        print(optimizer.max)
        # Example with the lowest mean mode value.
        mindex = np.argmin(np.mean(Y_all, axis=0))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
        # Example with the lowest root-sum-of-squares of its modes.
        mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
print(next_point)

## Method 7, Grid search algorithm

Split the entire search space into a grid (starting coarse, with 2 or 3 cells per dimension; 3^16 ≈ 43M cells). Evaluate each cell based on the data points within it or on the 8 nearest neighbours.

## Method 8, Network search algorithm

Find a gradient between all data points, use this information to initialize gradient descent

## Method 9, Principal Component Analysis (PCA)

Combine with gradient descent for faster convergence? Enables plotting of dataset in 2D

## Method 10, Transfer Learning 

Generate low quality large dataset (1-50M examples?) using surrogate NN and use this for transfer learning. This might help to evaluate at what stage transfer learning becomes effective (can we use it with a dataset of 1000 or 10000?)