<a href="https://colab.research.google.com/github/Alsr96/LMMAES/blob/master/LMMAES_evo_paths.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import keras
from keras import backend as K

class structtype:
    """Empty class with no attributes or methods of its own.

    Presumably meant as a bare container that callers attach attributes to;
    nothing in the visible code instantiates it.
    """

def nw_to_vec(model,layer_idx=None):
    """Flatten the weights and biases of a model's layers into one vector.

    Only layers whose ``get_weights()`` returns exactly two arrays (unpacked
    as ``weights, biases``) contribute; every other layer is skipped.

    Args:
        model: Object exposing ``layers``; each layer provides
            ``get_weights()``.
        layer_idx: Iterable of layer indices to flatten (list, range or
            NumPy array), or ``None`` to walk every layer in order.

    Returns:
        A ``(vector, ind)`` tuple. ``vector`` is a 1-D float array holding,
        per contributing layer, the flattened weights followed by the
        flattened biases. ``ind`` is a 1-D float array of cumulative offsets
        starting at 0, with one entry appended after each weights block and
        each biases block.
    """
    # ``is None`` rather than ``== None``: the latter is evaluated
    # element-wise on a NumPy index array and cannot be used as a condition.
    if layer_idx is None:
        layer_idx = range(len(model.layers))

    flat_params = []
    offsets = [0]
    for i in layer_idx:
        params = model.layers[i].get_weights()
        if len(params) != 2:
            continue
        for arr in params:
            flat = np.reshape(arr, (np.size(arr),))
            flat_params.append(flat)
            offsets.append(offsets[-1] + np.size(flat))

    # Fill a pre-sized buffer once instead of re-copying the whole vector on
    # every append.
    vector = np.empty((offsets[-1],))
    for start, stop, flat in zip(offsets[:-1], offsets[1:], flat_params):
        vector[start:stop] = flat

    ind = np.zeros((len(offsets),))
    ind[:] = offsets
    return vector, ind

def vec_to_nw(vector,ind,model,layer_idx=None):
    """Write a flat parameter vector back into a model's layers.

    Layers are visited in the given order; each layer whose ``get_weights()``
    returns exactly two arrays consumes the next two slices of ``vector``
    (weights, then biases), reshaped to the layer's current shapes.

    Args:
        vector: 1-D sequence of parameter values.
        ind: Sequence of slice boundaries into ``vector``; entries ``k``,
            ``k+1`` and ``k+2`` delimit the weights and biases of the
            ``k/2``-th contributing layer.
        model: Object exposing ``layers``; each layer provides
            ``get_weights()`` and ``set_weights()``.
        layer_idx: Iterable of layer indices to update (list, range or NumPy
            array), or ``None`` to walk every layer in order.

    Returns:
        The same ``model`` object, updated in place.
    """
    # ``is None`` rather than ``== None``: the latter is evaluated
    # element-wise on a NumPy index array and cannot be used as a condition.
    if layer_idx is None:
        layer_idx = range(len(model.layers))

    k = 0
    for i in layer_idx:
        layer = model.layers[i]
        current = layer.get_weights()
        if len(current) != 2:
            continue
        weights, biases = current
        w_start = int(ind[k])
        w_stop = int(ind[k + 1])
        b_stop = int(ind[k + 2])
        new_weights = np.reshape(vector[w_start:w_stop], np.shape(weights))
        new_biases = np.reshape(vector[w_stop:b_stop], np.shape(biases))
        layer.set_weights((new_weights, new_biases))
        # Each contributing layer consumes two boundaries.
        k += 2
    return model

#split=[15,15]
def split_data(input_data,target2,split):
    """Cut inputs and targets into two consecutive parts at a ratio.

    Args:
        input_data: Sliceable sequence of inputs.
        target2: Sliceable sequence of targets; its length sets the sample
            count used for both sequences.
        split: Indexable pair ``[a, b]``; the first part receives
            ``int(a * n / (a + b))`` samples and the second part the rest,
            up to ``n = len(target2)``.

    Returns:
        ``(s1_input, s1_target, s2_input, s2_target)``.
    """
    total = len(target2)
    first_share = split[0]
    second_share = split[1]
    cut = int(first_share * total / (first_share + second_share))

    head = slice(0, cut)
    tail = slice(cut, total)
    return input_data[head], target2[head], input_data[tail], target2[tail]

# obtain gradients
def get_gradients(model,inputv,output):
    """Evaluate the gradients of the model's total loss on one batch.

    Relies on private attributes of the compiled model (``_feed_inputs``,
    ``_feed_targets``, ``_feed_sample_weights``, ``_standardize_user_data``),
    so it is tied to the Keras version that provides them.

    Args:
        model: Compiled model exposing ``optimizer``, ``total_loss`` and
            ``trainable_weights``.
        inputv: Input data passed to ``model._standardize_user_data``.
        output: Target data passed to ``model._standardize_user_data``.

    Returns:
        The result of the backend function: one gradient per entry of
        ``model.trainable_weights``.
    """
    loss_grads = model.optimizer.get_gradients(model.total_loss, model.trainable_weights)
    placeholders = model._feed_inputs + model._feed_targets + model._feed_sample_weights
    grad_fn = K.function(placeholders, loss_grads)

    feed_x, feed_y, feed_w = model._standardize_user_data(inputv, output)
    return grad_fn(feed_x + feed_y + feed_w)

class lmmaes(object):
    """Evolution-strategy optimiser over the flattened weights of a model.

    The state kept here (step size ``sigma``, recombination weights ``w``,
    ``m`` direction vectors ``m_i`` with learning rates ``c_d`` / ``c_c``)
    appears to follow the LM-MA-ES scheme. Places where the visible code
    departs from that scheme are marked with ``NOTE(review)`` inline.
    """

    def __init__(self,model,inputv,output,n_candidates = None,sigma=1/10,function_budget=10000,function_target=None,layer_idx=None):
        """Set up the optimiser state from the model's current weights.

        Args:
            model: Model whose layers are flattened with ``nw_to_vec``.
            inputv: Unused here; kept for interface compatibility.
            output: Unused here; kept for interface compatibility.
            n_candidates: Number of candidate solutions per step, or ``None``
                to use ``4 + floor(3 * ln(n_dimensions))``.
            sigma: Initial step size.
            function_budget: Unused here; kept for interface compatibility.
            function_target: Unused here; kept for interface compatibility.
            layer_idx: Layer indices to optimise, or ``None`` for all layers.

        Raises:
            ValueError: If the selected layers hold no parameters, or if
                ``n_candidates`` is smaller than 2 (no solution would be
                selected for recombination).
        """
        self.sigma = sigma
        self.layer_idx = layer_idx

        # Current search point (flattened weights and biases) together with
        # the slice boundaries needed to write it back into the model.
        self.y, self.ind = nw_to_vec(model, layer_idx=self.layer_idx)

        # Number of layers to optimise.
        if self.layer_idx is None:
            self.n_layers = len(model.layers)
        else:
            self.n_layers = len(self.layer_idx)

        # nw_to_vec flattens exactly the weight/bias pairs being optimised,
        # so the vector length is the number of dimensions.
        self.n_dimensions = int(np.size(self.y))
        if self.n_dimensions == 0:
            raise ValueError("no layer with a (weights, biases) pair to optimise")

        # Number of candidate solutions generated per step.
        self.n_candidates = n_candidates
        if self.n_candidates is None:
            self.n_candidates = 4 + np.floor(3*np.log(self.n_dimensions))

        # Number of best solutions selected.
        self.mu = np.floor(self.n_candidates/2)
        if self.mu < 1:
            raise ValueError("n_candidates must be at least 2")

        # Log-rank weights for the selected solutions, normalised to sum 1.
        n_selected = int(self.mu)
        self.w = np.zeros((n_selected,))
        for rank in range(n_selected):
            self.w[rank] = np.log(self.mu+0.5)-np.log(rank+1)
        self.w = self.w/np.sum(self.w)

        self.mu_w = 1/(np.sum(np.square(self.w)))

        # Number of direction vectors (evolution paths) kept in m_i.
        self.m = 4 + np.floor(3*np.log(self.n_dimensions))
        n_paths = int(self.m)

        self.c_sigma = 2*self.n_candidates/self.n_dimensions
        # const1 is not read by train_on_batch below.
        self.const1 = np.sqrt(self.mu_w*self.c_sigma*(2-self.c_sigma))

        # Per-path learning rates and the matching normalisation constants.
        self.c_d = np.zeros((n_paths,))
        self.c_c = np.zeros((n_paths,))
        self.const2 = np.zeros((n_paths,))
        for i in range(n_paths):
            self.c_d[i] = 1/(self.n_dimensions*(1.5**i))
            self.c_c[i] = self.n_candidates/(self.n_dimensions*(4**i))
            self.const2[i] = np.sqrt(self.mu_w*self.c_c[i]*(2-self.c_c[i]))

        self.t = 0

        # Accumulator used for the step-size update; train_on_batch adds loss
        # gradients to it.
        self.p_sigma = np.zeros((self.n_dimensions,))

        # Vectors modelling the deviation of the transformation matrix from
        # the identity matrix.
        self.m_i = np.zeros((n_paths, self.n_dimensions))

    def train_on_batch(self,model,train_data,divide_data=False):
        """Run one optimisation step on a batch and update the model.

        Samples ``n_candidates`` perturbations of the current point, scores
        each with ``model.evaluate``, recombines the best ``mu`` of them into
        the new point, updates ``m_i``, ``p_sigma`` and ``sigma``, writes the
        new point into ``model`` and prints the best loss, step count and
        step size.

        Args:
            model: Compiled model providing ``evaluate``; also passed to
                ``get_gradients``.
            train_data: Pair ``(inputs, targets)``.
            divide_data: If true, candidate ``i`` is scored on the ``i``-th of
                ``n_candidates`` consecutive slices of the batch instead of
                on the whole batch.

        Returns:
            ``(model, func_calls, sigma)``: the updated model, the number of
            ``evaluate`` calls made in this step, and the new step size.
        """
        n_cand = int(self.n_candidates)
        n_paths = int(self.m)
        train_input = train_data[0]
        train_target = train_data[1]
        # Slice ends are exclusive, so the full length is used here; using
        # ``len - 1`` would leave the last sample out of every slice.
        num_samples = len(train_target)

        self.func_calls = 0
        z = np.random.randn(n_cand, self.n_dimensions)
        d = np.copy(z)

        # Only the first min(t, m) direction vectors are applied. Their
        # squared norms do not depend on the candidate, so compute them once.
        n_active = min(self.t, n_paths)
        sq_norms = [np.sum(self.m_i[j]**2) for j in range(n_active)]

        # One loss per candidate: the loop and f_list are sized by
        # n_candidates (not m) so that every sampled row of z/d is scored
        # even when the caller picks n_candidates != m.
        f_list = np.empty((n_cand, 1))
        for i in range(n_cand):
            for j in range(n_active):
                # NOTE(review): this rescales d[i] by ||m_j||^2; the published
                # LM-MA-ES update projects onto m_j (m_j * (m_j . d_i)).
                # Left as written - confirm which is intended.
                d[i] = (1-self.c_d[j])*d[i] + self.c_d[j]*sq_norms[j]*d[i]

            model = vec_to_nw(self.y+self.sigma*d[i], self.ind, model, layer_idx=self.layer_idx)

            if divide_data:
                lo = int(i*num_samples/self.n_candidates)
                hi = int((i+1)*num_samples/self.n_candidates)
                res = model.evaluate(x=train_input[lo:hi], y=train_target[lo:hi], verbose=0)
            else:
                res = model.evaluate(x=train_input, y=train_target, verbose=0)
            self.func_calls = self.func_calls+1
            f_list[i][0] = res[0]

        # Indices of the mu lowest-loss candidates, best first.
        best = f_list.argsort(axis=0)[0:int(self.mu), 0]

        # Weighted recombination of the selected steps (d) and of the raw
        # samples behind them (z).
        step_d = np.sum(self.w[:, None]*d[best], 0)
        step_z = np.sum(self.w[:, None]*z[best], 0)
        y_next = self.y + self.sigma*step_d

        # Flattened loss gradients. The model still holds the weights of the
        # last evaluated candidate at this point.
        # NOTE(review): the gradient covers all of model.trainable_weights;
        # presumably its length equals n_dimensions, which need not hold when
        # layer_idx selects a subset - confirm.
        p_sigma_add = np.empty((0,))
        output_grad = get_gradients(model, train_input, train_target)
        for grad in output_grad:
            p_sigma_add = np.append(p_sigma_add, grad.reshape(np.size(grad),))

        p_sigma_next = (1-self.c_sigma)*self.p_sigma + p_sigma_add
        mag_p_sigma_next = np.linalg.norm(p_sigma_next)

        for i in range(n_paths):
            self.m_i[i] = (1-self.c_c[i])*self.m_i[i] + self.const2[i]*step_z

        sigma_next = self.sigma*np.exp(self.c_sigma*(((mag_p_sigma_next**2)/self.n_dimensions)-1)/2)

        self.t = self.t+1
        self.sigma = sigma_next
        self.p_sigma = p_sigma_next
        self.y = y_next
        model = vec_to_nw(self.y, self.ind, model, layer_idx=self.layer_idx)

        print(f_list[int(best[0])],self.t,self.sigma)
        return model, self.func_calls, self.sigma

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

from keras.datasets import mnist
# Download the MNIST data, already split into train and test sets.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Append a trailing channel axis. The new shape is derived from each array
# rather than hard-coding the sample counts, so a subset of the data reshapes
# correctly too.
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))

from keras.utils import to_categorical
# One-hot encode the target column.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [0]:
# Two convolutional layers, flattened into a 10-way softmax classifier.
model = Sequential([
    Conv2D(64, kernel_size=3, activation='relu', input_shape=(28,28,1)),
    Conv2D(32, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(10, activation='softmax'),
])
# Compile with accuracy reported alongside the loss.
model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])

# Hold out one sixth of the training set for validation.
xtrain, ytrain, xval, yval = split_data(X_train, y_train, [5, 1])

In [11]:
import numpy as np
# Optimiser over every layer with the default candidate count; only the
# initial step size differs from the constructor defaults.
lm = lmmaes(model, xtrain, ytrain, sigma=1/100)
print(lm.n_candidates)
def unison_shuffled_copies(a, b):
    """Return copies of ``a`` and ``b`` reordered by one shared permutation.

    Both arguments must have the same length and accept an index array
    (as NumPy arrays do); pairs ``(a[i], b[i])`` stay aligned in the result.
    """
    assert len(a) == len(b)
    order = np.random.permutation(len(a))
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b


# Baseline on the validation split before any optimisation step.
res_current_best=model.evaluate(x=xval,y=yval)
# nw_to_vec returns (vector, ind): keep the vector as the best solution and
# the boundaries for writing it back, so the rollback below receives a flat
# vector and `ind` is defined before the loop.
y_current_best,ind=nw_to_vec(model,layer_idx=None)
validation_fail=0

for i in range(1):
  #xtrain,ytrain=unison_shuffled_copies(xtrain,ytrain)
  batch=int(1000)
  print(batch/lm.n_candidates)
  # Slice ends are exclusive, so use the full length; `len - 1` would drop
  # the last sample and yield one group fewer with uneven sizes.
  no_samples=len(ytrain)
  n_groups=max(1,no_samples//batch)

  for j in range(n_groups):
    # 1 step of optimisation on the j-th consecutive group of samples
    lo=int(j*no_samples/n_groups)
    hi=int((j+1)*no_samples/n_groups)
    model, func_calls, sigma = lm.train_on_batch(model,train_data=(xtrain[lo:hi],ytrain[lo:hi]),divide_data=False)

  # new solution
  y_new,ind=nw_to_vec(model,layer_idx=None)
  res_new=model.evaluate(x=xval,y=yval)
  print(res_new[1],res_current_best[1])

  # validation: keep the new weights only if validation accuracy improved,
  # otherwise restore the best weights seen so far
  if res_new[1]>res_current_best[1]:
    res_current_best=res_new
    y_current_best=y_new
    validation_fail=0
  else:
    model=vec_to_nw(y_current_best,ind,model,layer_idx=None)
    validation_fail=validation_fail+1

  # stop after 6 consecutive passes without improvement
  if validation_fail>=6:
    break

40.0
25.0
[12.72743938] 1 0.009998099598372919
[12.50035376] 2 0.009996236355074671
[12.25078573] 3 0.009994505718886255
[11.61682838] 4 0.009992971577433273
[11.09501905] 5 0.009991566839590986
[11.12395012] 6 0.00999051448410141
[10.19801176] 7 0.009989514364503628
[10.30637033] 8 0.00998867369346971
[10.23756462] 9 0.009987960203197898
[9.67799238] 10 0.009987603274644213
[9.34611914] 11 0.009987715524446513
[9.5652868] 12 0.009988106759716302
[9.6931947] 13 0.009988717317471068
[9.49166686] 14 0.009989632305654621
[8.48720883] 15 0.009990648672872122
[7.96050838] 16 0.009991730357617032
[8.30940911] 17 0.009992948434391693
[8.06754724] 18 0.00999450790884119
[7.50375308] 19 0.009996426911244648
[7.69662439] 20 0.009998687266917616
[7.33677328] 21 0.010001299288173037
[6.32226282] 22 0.010004320904383097
[7.21495565] 23 0.01000772425631196
[6.90141398] 24 0.010011713012728138
[6.61898367] 25 0.01001621728519207
[6.4724273] 26 0.010021238225234826
[6.67009572] 27 0.01002657615453147
