# Race to low rms

Import latin-hypercube test set

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import healpy as hp
import utils_intensity_map as uim
import utils_deck_generation as idg
import netcdf_read_write as nrw
import training_data_generation as tdg
import tf_neural_network as tfnn
import time
import os
import shutil
import utils_optimizers as uopt
# Render figures inline in the notebook and switch matplotlib to interactive mode.
%matplotlib inline
plt.ion()
# Directory handed to tdg.define_system_params() when the existing data set is loaded.
run_dir = "Data_input"

# File name stored in sys_params["trainingdata_filename"] before the data import.
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"
imap_nside = 256  # presumably the healpy NSIDE of the intensity maps -- TODO confirm; unused in the cells visible here
LMAX = 30  # used as the number of modes when plotting and forwarded to tdg.run_ifriit_input
hemisphere_symmetric = True  # forwarded unchanged to tdg.run_ifriit_input
run_clean = True  # forwarded unchanged to tdg.run_ifriit_input
# Seeded generator, so the random population and mutations of the genetic algorithm are reproducible.
rng = np.random.default_rng(12345)

In [None]:
# Build the system parameters for run_dir and point them at the training-data file.
sys_params = tdg.define_system_params(run_dir)
sys_params["trainingdata_filename"] = filename_flipped_trainingdata
# Load inputs, outputs and average powers. Later cells index X_all and Y_all as
# (feature, example), i.e. one column per example.
X_all, Y_all, avg_powers_all = nrw.import_training_data(sys_params)

In [None]:
# Root-sum-square of the mode amplitudes of every example (one value per column of Y_all).
target_all = np.sqrt(np.sum(np.square(Y_all), axis=0))

# Mean and spread of that target over the data set, printed in percent.
# NOTE: despite its name, target_variance holds the standard deviation.
target_mean = np.mean(target_all)
target_variance = np.std(target_all)
print(100.0 * target_mean, 100.0 * target_variance)

# Per-input mean and standard deviation across all examples, printed in percent.
input_means = np.mean(X_all, axis=1)
input_standard_deviation = np.std(X_all, axis=1)
print(100.0 * input_means, 100.0 * input_standard_deviation)

In [None]:
# Shape of a single example (one column of the input and output arrays).
print(X_all[:, 0].shape, Y_all[:, 0].shape)

# Report the example with the smallest mean mode amplitude, then the one with the
# smallest rms target; mindex is left pointing at the latter.
for mindex in (
    np.argmin(np.mean(Y_all, axis=0)),
    np.argmin(target_all),
):
    print(mindex)
    print(np.sum(Y_all[:, mindex]))
    print(target_all[mindex])

# Problem sizes: rows are inputs/modes, columns are examples.
num_inputs, num_examples = np.shape(X_all)[:2]
num_modes = np.shape(Y_all)[0]
print(num_examples)
print(num_inputs, num_modes)

In [None]:
# Plot the mode amplitudes (in percent) of the example selected by mindex.
fig = plt.figure()
ax = plt.axes()
ax.plot(np.arange(LMAX), 100.0 * Y_all[:, mindex])
ax.set_xticks(range(0, LMAX + 1, int(LMAX / 5)))
ax.set_xlim([0, LMAX])
ax.set_title("Unweighted Modes")
ax.set_xlabel("l mode")
# The trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_ylabel(r"amplitude ($\%$)");

## Method 1, Brute force

In [None]:
##

## Method 2, Gradient descent

The partial derivatives are estimated from a stencil of 2*16 = 32 points (two points in every dimension) around the current minimum. These points can be evaluated with either a NN or Ifriit, depending on speed.

In [None]:
# Settings for the gradient-descent search.
learn_exp = -1.0  # base-10 exponent of the learning rate (learning_rate = 10.0**learn_exp)
n_iter = 10  # maximum number of descent iterations
run_dir = "Data_output"  # directory holding the run_*/iter_* folders and the saved .nc file
iter_dir = "iter_"  # prefix of the folder kept for each iteration
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"
num_parallel = 17  # forwarded to tdg.run_ifriit_input; presumably the number of concurrent runs -- confirm
stencil_size = num_inputs * 2 + 1  # two points per input plus one more; index 0 is reported as "the previous value"
num_steps_per_iter = num_parallel - 1  # number of candidate points requested from uopt.grad_descent per iteration

In [None]:
# Gradient descent starting from the lowest-rms example currently in the data set.
# Each iteration evaluates a stencil of points around the current position, derives a
# gradient from it, evaluates the candidate points returned by uopt.grad_descent and
# keeps whichever evaluated point (stencil or candidate) has the lowest rms.
learning_rate = 10.0**learn_exp
# Pair of exponents bracketing learn_exp, handed to uopt.grad_descent and shifted
# whenever the learning rate is reduced (presumably the range of step exponents to
# try -- confirm against utils_optimizers).
step_size = np.array([learn_exp - 1.0, learn_exp + 1.0])

X_old = np.zeros((num_inputs, 1))
Y_old = np.zeros((num_modes, 1))
avg_powers_old = np.array([0.0])

mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
X_old[:,0] = X_all[:, mindex]

# Lower/upper bound of every input: [0, 1].
pbounds = np.zeros((num_inputs, 2))
pbounds[:,1] = 1.0
filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
tic = time.perf_counter()
for ieval in range(n_iter):

    # Identical last two stored points mean the previous iteration made no progress:
    # shrink the step, and stop once it becomes negligible.
    if (sum(abs(X_all[:,-1] - X_all[:,-2])) <= 0.0):
        learn_exp = learn_exp-0.5
        learning_rate = 10.0**(learn_exp)
        step_size = step_size - 0.5
        print("Reducing step size to: " + str(learning_rate))
        if learning_rate < 1.0e-4:
            # Only `ieval` points have been appended so far; this iteration never ran.
            print(str(ieval) + " gradient descent data points added, saving to .nc")
            print("Early stopping due to repeated results")
            nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
            break

    # Evaluate the stencil around the current point.
    X_stencil = uopt.gradient_stencil(X_old, learning_rate, pbounds, num_inputs, stencil_size)
    Y_stencil, avg_powers_stencil = tdg.run_ifriit_input(stencil_size, X_stencil, run_dir, LMAX, num_parallel, hemisphere_symmetric, run_clean)
    target_stencil = np.sqrt(np.sum(Y_stencil**2, axis=0))
    mindex_stencil = np.argmin(target_stencil)
    print("The minimum in the stencil", np.min(target_stencil), mindex_stencil)
    print("The previous value was: ", target_stencil[0], 0)
    print(X_stencil[:,0])
    # Provisionally keep the run folder of the best stencil point for this iteration.
    iter_path = run_dir + "/" + iter_dir + str(ieval+num_examples)
    os.rename(run_dir + "/run_" + str(mindex_stencil), iter_path)

    # Step downhill and evaluate the candidate points.
    grad = uopt.determine_gradient(X_stencil, target_stencil, learning_rate, pbounds, num_inputs)
    X_new = uopt.grad_descent(X_old, grad, step_size, pbounds, num_inputs, num_steps_per_iter)

    Y_new, avg_powers_new = tdg.run_ifriit_input(num_steps_per_iter, X_new, run_dir, LMAX, num_parallel, hemisphere_symmetric, run_clean)
    target_downhill = np.sqrt(np.sum(Y_new**2, axis=0))
    mindex_downhill = np.argmin(target_downhill)
    print("The minimum downhill", np.min(target_downhill), mindex_downhill)

    if target_downhill[mindex_downhill] < target_stencil[mindex_stencil]:
        # A downhill candidate beats the stencil: replace the provisional folder.
        shutil.rmtree(iter_path)
        os.rename(run_dir + "/run_" + str(mindex_downhill), iter_path)
        X_old[:,0] = X_new[:,mindex_downhill]
        Y_old[:,0] = Y_new[:,mindex_downhill]
        avg_powers_old = avg_powers_new[mindex_downhill]
    else:
        X_old[:,0] = X_stencil[:,mindex_stencil]
        Y_old[:,0] = Y_stencil[:,mindex_stencil]
        avg_powers_old = avg_powers_stencil[mindex_stencil]

    # Append the accepted point to the data set.
    X_all = np.hstack((X_all, X_old))
    Y_all = np.hstack((Y_all, Y_old))
    avg_powers_all = np.hstack((avg_powers_all, avg_powers_old))

    print("Iteration {} with learn rate {} value:{}".format(ieval, learning_rate, np.sqrt(np.sum(Y_old**2))))
    print(X_old[:,0])

    # Sanity check: the newly stored point must not be worse than the one stored before it.
    new_rms = np.sqrt(np.sum(Y_all[:,-1]**2))
    prev_rms = np.sqrt(np.sum(Y_all[:,-2]**2))
    if new_rms > prev_rms:
        print("Bug! Ascending slope!")
        print(new_rms, prev_rms)
        break

    # Checkpoint the data set every tenth iteration.
    if (ieval+1) % 10 == 0:
        toc = time.perf_counter()
        print("{:0.4f} seconds".format(toc - tic))
        print(str(ieval+1) + " gradient descent data points added, saving to .nc")
        nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
        mindex = np.argmin(np.mean(Y_all, axis=0))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
        mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
# Remove the scratch run_<i> folders left behind by the stencil evaluations.
for isten in range(stencil_size):
    stale_run = run_dir + "/run_" + str(isten)
    try:
        shutil.rmtree(stale_run)
    except FileNotFoundError:
        # Expected for folders that were renamed to iter_* or already cleaned up.
        print("File: " + stale_run + ", already deleted.")
    except OSError as err:
        # Stay best-effort, but do not report a real failure as "already deleted".
        print("File: " + stale_run + ", could not be deleted: " + str(err))

In [None]:
# Overwrite the saved file, keeping only the first `nex` examples.
# Disabled on purpose: the code is wrapped in a string literal. Remove the
# surrounding triple quotes to run it.
"""
nex = 10
print(np.shape(X_all[:,:nex]))
print(np.shape(Y_all[:,:nex]))
print(np.shape(avg_powers_all[:nex]))

X_all = X_all[:,:nex]
Y_all = Y_all[:,:nex]
avg_powers_all = avg_powers_all[:nex]

filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
"""

## Method 3, Use surrogate NN to pick low RMS from random inputs

## Method 4, Use inverse NN to identify low rms by inputting other low rms cases

## Method 5, Genetic algorithm

Iterative procedure that takes the best features of the first generation: mutate and mix inputs between the best candidates to produce each subsequent generation.

In [None]:
# Settings for the genetic algorithm.
init_points = 10  # population size: number of candidate inputs evaluated per generation
n_iter = 10  # generations evaluated in total (n_iter-1 inside the loop plus the final one after it)
num_parallel = 10  # forwarded to tdg.run_ifriit_input
run_dir = "Data_empty"  # working directory for the run_*/iter_* folders and the saved .nc file
iter_dir = "iter_"  # prefix of the folder each evaluated run is renamed to
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"

In [None]:
# Initial population: one column per candidate, each with num_inputs genes drawn
# uniformly from [0, 1).
pop_size = (num_inputs, init_points)
X_pop = rng.random(pop_size)

# Keep roughly the best 10% of each generation as parents, rounded down to an even
# count. Never go below two (population permitting): with init_points = 10 the
# rounding alone gave zero parents, leaving the mating pool empty.
num_parents_mating = int(init_points / 10.0)
if (num_parents_mating % 2) != 0:
    num_parents_mating -= 1
num_parents_mating = min(max(num_parents_mating, 2), init_points)

# Lower/upper bound of every input: [0, 1].
pbounds = np.zeros((num_inputs, 2))
pbounds[:,1] = 1.0

# Start a fresh data set for this method. The accumulators are created as floats
# so their dtype matches the data that gets stacked onto them.
best_outputs = []
X_all = np.empty((num_inputs, 0))
Y_all = np.empty((LMAX, 0))
avg_powers_all = np.empty(0)
tic = time.perf_counter()
for generation in range(n_iter-1):
    print("Generation : ", generation)
    # Evaluate the whole population and append it to the accumulated data set.
    Y_pop, avg_powers_pop = tdg.run_ifriit_input(init_points, X_pop, run_dir, LMAX, num_parallel, hemisphere_symmetric, run_clean)
    X_all, Y_all = np.hstack((X_all, X_pop)), np.hstack((Y_all, Y_pop))
    avg_powers_all = np.hstack((avg_powers_all, avg_powers_pop))
    # Archive each run folder under its index in the accumulated data set.
    first_index = init_points * generation
    for irun in range(init_points):
        os.rename(f"{run_dir}/run_{irun}", f"{run_dir}/{iter_dir}{irun + first_index}")

    # Fitness is the negated rms, so a larger value is a better candidate.
    fitness_pop = -np.sqrt(np.sum(np.square(Y_pop), axis=0))
    mindex_pop = np.argmax(fitness_pop)

    # Checkpoint the data set every tenth generation.
    if (generation + 1) % 10 == 0:
        toc = time.perf_counter()
        print("{:0.4f} seconds".format(toc - tic))
        print(str(generation+1) + " genetic algorithm data points added, saving to .nc")
        filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
        nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
        # Report the example with the smallest mean amplitude, then the smallest rms.
        for mindex in (
            np.argmin(np.mean(Y_all, axis=0)),
            np.argmin(np.sqrt(np.sum(Y_all**2, axis=0))),
        ):
            print(mindex)
            print(np.sum(Y_all[:, mindex]))
            print(np.sqrt(np.sum(Y_all[:, mindex]**2)))

    # Record and report the best fitness of this generation.
    best_outputs.append(np.max(fitness_pop))
    print("Best result : ", best_outputs[generation])

    # Breed the next generation: select parents, cross them over, then mutate the offspring.
    parents = uopt.select_mating_pool(X_pop.T, fitness_pop, num_parents_mating)
    offspring_crossover = uopt.crossover(
        parents,
        offspring_size=(init_points - num_parents_mating, num_inputs),
    )
    offspring_mutation = uopt.mutation(offspring_crossover, rng, pbounds, num_mutations=2)

    # Parents fill the first columns of the new population, offspring the remainder.
    X_pop[:, :num_parents_mating] = parents.T
    X_pop[:, num_parents_mating:] = offspring_mutation.T
# Evaluate the final population produced by the last round of breeding and append it
# to the data set.
# The generation index is derived from n_iter rather than from the loop variable, so
# this code also works when the generation loop ran zero times (n_iter < 2), where
# `generation` would be undefined or stale. For n_iter >= 2 the value is the same as
# generation + 1.
final_generation = max(n_iter - 1, 0)
Y_pop, avg_powers_pop = tdg.run_ifriit_input(init_points, X_pop, run_dir, LMAX, num_parallel, hemisphere_symmetric, run_clean)
X_all = np.hstack((X_all, X_pop))
Y_all = np.hstack((Y_all, Y_pop))
avg_powers_all = np.hstack((avg_powers_all, avg_powers_pop))
for irun in range(init_points):
    os.rename(run_dir + "/run_" + str(irun), run_dir + "/" + iter_dir + str(irun+init_points*final_generation))

# Fitness is the negated rms, so the best candidate of the final population is the arg-max.
fitness_pop = -np.sqrt(np.sum(Y_pop**2, axis=0))
mindex_pop = np.argmax(fitness_pop)

print("Best solution : ", X_pop[:, mindex_pop])
print("Best solution fitness : ", fitness_pop[mindex_pop])

# Save the complete data set and report its best examples.
toc = time.perf_counter()
print("{:0.4f} seconds".format(toc - tic))
print(str(final_generation) + " genetic algorithm data points added, saving to .nc")
filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
mindex = np.argmin(np.mean(Y_all, axis=0))
print(mindex)
print(np.sum(Y_all[:,mindex]))
print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
print(mindex)
print(np.sum(Y_all[:,mindex]))
print(np.sqrt(np.sum(Y_all[:,mindex]**2)))

#plt.plot(best_outputs)
#plt.xlabel("Iteration")
#plt.ylabel("Fitness")
#plt.show()

## Method 6, Bayesian optimization (ifriit is high quality source and NN is low quality)

Gaussian process surrogate and Bayesian optimization used with multiple sources of information. First we create a Bayesian model with the true data points and select new simulations based on that. The model (Kriging method?) could use a "Gaussian process approximation" to reduce computational expense.

In [None]:
# Settings for the Bayesian optimisation.
# NOTE(review): num_examples was measured on the data set loaded at the top of the
# notebook. If X_all has since been extended or replaced by another method's cell it
# no longer equals the number of columns of X_all -- confirm which data set is intended.
init_points = num_examples  # number of known examples handed to uopt.initialize_unknown_func
n_iter = 10  # number of new points to suggest and evaluate
run_dir = "Data_output"  # directory holding run_0, the iter_* folders and the saved .nc file
iter_dir = "iter_"  # prefix of the folder each evaluated point is renamed to
filename_flipped_trainingdata = "flipped_training_data_and_labels.nc"
num_parallel = 10 # Not currently run parallel

In [None]:
# Search bounds: every input, keyed "x0", "x1", ..., ranges over [0, 1].
pbounds = {"x" + str(ii): (0., 1.) for ii in range(num_inputs)}

# The rms is negated so that the lowest rms becomes the largest target
# (critical: the quantity of interest is a minimum, not a maximum).
target = -np.sqrt(np.sum(np.square(Y_all), axis=0))
print(num_inputs, init_points, np.shape(target))

# Create the optimizer and its utility function from the known examples, then show
# the best point it currently holds.
optimizer, utility = uopt.initialize_unknown_func(
    X_all, target, pbounds, init_points, num_inputs
)
print(optimizer.max)

In [None]:
# Bayesian optimisation loop: ask the optimizer for a point, evaluate it, append the
# result to the data set and register it with the optimizer.
start = 0

tic = time.perf_counter()
for ieval in range(start, n_iter):
    next_point = optimizer.suggest(utility)

    # Convert the suggested {"x0": ..., "x1": ...} mapping into a single input column.
    X_new = np.zeros((num_inputs, 1))
    for ii in range(num_inputs):
        X_new[ii] = next_point["x"+str(ii)]

    Y_new, avg_powers_new = tdg.run_ifriit_input(1, X_new, run_dir, LMAX, num_parallel, hemisphere_symmetric, run_clean)

    X_all = np.hstack((X_all, X_new))
    Y_all = np.hstack((Y_all, Y_new))
    avg_powers_all = np.hstack((avg_powers_all, avg_powers_new))

    # Keep the run folder under its index in the data set.
    os.rename(run_dir + "/run_0", run_dir + "/" + iter_dir + str(ieval+num_examples))

    #target = black_box_function(**next_point)
    # Negated rms: the optimizer looks for a maximum.
    target = -np.sqrt(np.sum(Y_new**2))
    try:
        optimizer.register(params=next_point, target=target)
    except Exception as err:
        # Best-effort: a point the optimizer rejects must not abort the run. Report
        # the cause, and let KeyboardInterrupt/SystemExit propagate.
        print("Broken input!", next_point, target, repr(err))
    # Checkpoint the data set every tenth evaluation.
    if (ieval+1) % 10 == 0:
        toc = time.perf_counter()
        print("{:0.4f} seconds".format(toc - tic))
        print(str(ieval+1) + " Bayesian data points added, saving to .nc")
        filename_trainingdata = run_dir + '/' + filename_flipped_trainingdata
        nrw.save_training_data(X_all, Y_all, avg_powers_all, filename_trainingdata)
        print(optimizer.max)
        mindex = np.argmin(np.mean(Y_all, axis=0))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
        mindex = np.argmin(np.sqrt(np.sum(Y_all**2, axis=0)))
        print(mindex)
        print(np.sum(Y_all[:,mindex]))
        print(np.sqrt(np.sum(Y_all[:,mindex]**2)))
# Show the last suggested point, but only if the loop produced one.
if n_iter > start:
    print(next_point)

## Method 7, Grid search algorithm

Split the entire search space into a grid (start coarse, with 2 or 3 cells per dimension; 3^16 ≈ 43M cells). Evaluate each cell based on the data points within it or on its 8 nearest neighbours.

## Method 8, Network search algorithm

Find a gradient between all data points, use this information to initialize gradient descent

## Method 9, Principal Component Analysis (PCA)

Combine with gradient descent for faster convergence? Enables plotting of dataset in 2D

## Method 10, Transfer Learning 

Generate low quality large dataset (1-50M examples?) using surrogate NN and use this for transfer learning. This might help to evaluate at what stage transfer learning becomes effective (can we use it with a dataset of 1000 or 10000?)