In [9]:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 26 23:20:19 2018

@author: outline by jpmaldonado, edited by Radek Bartyzal
"""

import numpy as np

solution = np.array([1, 1, -0.5])
# the function we want to optimize for sanity check
def f(theta):
  # Here would go the evaluation of the episode
  reward = -np.sum(np.square(solution - theta))
  return reward

def vecf(thetas):
    return [f(theta) for theta in thetas]

def evaluate_and_sort(cands, f):
    cands_and_scores = []
    for cand in cands:
        cands_and_scores.append(np.array([cand,f(cand)]))
        
    cands_and_scores = np.array(cands_and_scores)
    sorted_cands_and_scores = sorted(cands_and_scores,key=lambda x: x[1])[::-1] # sort according to 2nd column and then reverse the array
    return np.array(sorted_cands_and_scores)

def get_top_cands(sorted_cands_and_scores, selection_rate):
    #print("Sorted cands and scores:\n", sorted_cands_and_scores)
    n = (int)(selection_rate * len(sorted_cands_and_scores))
    top_cands = np.copy(sorted_cands_and_scores[0:n,0])
    size = (n,len(top_cands[0]))
    #print("Wanted size =", size)
    #print("Top cands:\n", top_cands)
    top_cands = np.concatenate(top_cands)
    top_cands = np.reshape(top_cands, size)
    #print("Top cands after reshape:\n", top_cands)
    return top_cands

In [6]:
#################################
# STARTER CODE - CEM
#################################

#batch_size = 25 # number of samples per batch = population size
#elite_frac = 0.2 # fraction of samples used as elite set


def cross_entropy(n_iter = 500, batch_size = 25, elite_frac = 0.2):
    dim_theta = 3
    score_best = -999999
    theta_best = [0,0,0]
    theta_mean = np.zeros(dim_theta)
    theta_std = np.ones(dim_theta)

    for it in range(n_iter):
        # Sample parameter vectors 
        samples = np.random.multivariate_normal(theta_mean, np.diag(theta_std), batch_size)   

        # Evaluate candidates = samples
        sorted_cands_and_scores = evaluate_and_sort(samples, f)

        best_cand_score = sorted_cands_and_scores[0,1]
        if best_cand_score > score_best:
            score_best = best_cand_score
            theta_best = sorted_cands_and_scores[0,0]
            print("NEW BEST:", score_best)

        # Get elite parameters
        top_cands = get_top_cands(sorted_cands_and_scores, elite_frac)

        # Update theta_mean, theta_std
        theta_mean = np.mean(top_cands, axis=0)
        theta_std = np.std(top_cands, axis=0)

        if it % 50==0: 
            print("Generation:", it)

print("Final best score:", score_best)
print("Final best theta:", theta_best)
print("Approximating   :", solution)

cross_entropy()

Final best score: -999999
Final best theta: [0, 0, 0]
Approximating   : [ 1.   1.  -0.5]
NEW BEST: -0.16937240671351786
Generation: 0
NEW BEST: -0.11667221813561474
NEW BEST: -0.03694935148319528
NEW BEST: -0.01672875435199179
NEW BEST: -0.0029797500860808245
NEW BEST: -0.0025757236744202716
Generation: 50
NEW BEST: -0.0009809090436006293
Generation: 100
Generation: 150
Generation: 200
Generation: 250
Generation: 300
Generation: 350
NEW BEST: -0.0008852779766783092
Generation: 400
Generation: 450


In [None]:
np.random.multivariate_normal(theta_mean, np.diag(theta_std), 3)

In [11]:
#################################
# STARTER CODE - NES
#################################
dim_theta = 3
w0 = np.random.randn(3) #initial guess
npop=50
n_iter=1
sigma=0.1
alpha=0.001

theta_mean = np.zeros(dim_theta)
theta_std = np.ones(dim_theta)
score_best = -999999
theta_best = [0,0,0]

# init population
population = np.random.multivariate_normal(theta_mean, np.diag(theta_std), npop) 

for _ in range(n_iter):
    # Sample vectors from a normal distribution
    noise_samples = np.random.multivariate_normal(theta_mean, np.diag(theta_std), npop) 
    
    # Sample function values by evaluating on the population
    evals = vecf(population)
    evals_with_noise = vecf(np.copy(population) + sigma*noise_samples)
    
    best_cand_score = np.max(evals)
    if best_cand_score > score_best:
        score_best = best_cand_score
        print("NEW BEST:", score_best)
    
    # Optional: standardize (substract mean and divide by std)
    
    # "Gradient" update
    for x in population:
        x = x + alpha * (1.0/npop) * np.sum((evals_with_noise - evals) / sigma, axis=0)
    
    


NEW BEST: -0.24212926292247144


TypeError: unsupported operand type(s) for -: 'list' and 'list'