# Natural Evolution Strategies (NES) implementations on OpenAI Gym environments using different ANN frameworks

## Load environment and model

In [3]:
import time
import copy
import numpy as np
import gym

import sys
sys.path.append('../')
from utils import create_agent 
from simulate import *

In [4]:
# --- Hyperparameters shared by the NES variants below ---
STD = 0.35               # standard deviation used when sampling the population
ALPHA = 0.2              # initial learning rate
EPOCHS = 300             # number of training epochs
NPOP = 30                # population size per epoch
DECAY = 0.99             # factor applied to the learning rate after each epoch
STOPCOND = 2500          # numpy NES stops once the best fitness reaches this
RUNS_PER_INDIVIDUAL = 5  # not read by any cell visible in this notebook

# Environment id -> `layers` argument handed to create_agent
# (presumably the network layer sizes -- TODO confirm in utils.create_agent).
environments = {
    'CartPole-v1': [4, 1],
    'MountainCar-v0': [2, 8, 3],
    'Pendulum-v0': [3, 32, 16, 1],
    'Acrobot-v1': [6, 32, 16, 1],
    'gym_custom:CartPoleSwingUp-v0': [1],
}

# Pick the environment by its position in the dict above.
envName = list(environments.keys())[0]  # env.unwrapped.spec.id
print(envName)

env = gym.make(envName)
layers = environments[envName]
framework = 'numpy'
model = create_agent(framework, envName, layers)

CartPole-v1


# Using numpy

In [None]:
# Initial state for the numpy NES run.
np.random.seed()  # no argument: reseed NumPy's global RNG with fresh entropy

# Best individual seen so far and its fitness (sentinel below any expected score).
muBest = None
mu_fitness = -9999

# Population mean weights, one entry per model parameter.
mu = np.random.randn(model.params_length)
muLen = mu.shape[0]

In [None]:
# Train with a plain-NumPy Natural Evolution Strategy, then load the best
# individual seen during the run into `model`.
print(f'Running numpy NES for {EPOCHS} epochs on environment {envName}\n')
start = time.perf_counter()  # wall-clock start; not read again within this cell
# Decay a local copy so the configured ALPHA stays intact for re-runs and for
# the other NES variants in this notebook.
alpha = ALPHA
for t in range(EPOCHS):
    # Draw NPOP candidate weight vectors around the current mean.
    sample = np.random.normal(mu, STD, (NPOP, muLen))
    fitnesses = simulate_batch(env, model, sample)
    best = fitnesses.max()
    # Standardise the fitnesses; the epsilon avoids a division by zero when
    # every individual scores the same.
    scaled_fitnesses = (fitnesses - fitnesses.mean()) / (fitnesses.std() + 1e-5)
    # scaled_fitnesses sums to zero, so sample.T @ scaled_fitnesses equals
    # (sample - mu).T @ scaled_fitnesses: the mean contributes nothing.
    mu += alpha / (NPOP * STD) * np.dot(sample.T, scaled_fitnesses)
    alpha *= DECAY
    if mu_fitness < best:
        mu_fitness = best
        # Copy the row: `sample` is replaced on the next epoch.
        muBest = sample[np.argmax(fitnesses)].copy()
        print(f" New best fitness: {mu_fitness}")
    if t % 10 == 0:
        print(f"epoch: {t:3.0f} | mean fitness: {fitnesses.mean():3.1f} | best fitness: {best:3.1f} | learning rate: {alpha:.4f}")
    if best >= STOPCOND:
        print(f'Best fitness above {STOPCOND}')
        break
print('Done')
# Fall back to the population mean if no individual was ever recorded
# (e.g. EPOCHS == 0).
model.set_weights(muBest if muBest is not None else mu)

In [None]:
# Show the trained policy; close the environment even if rendering is
# interrupted or raises.
try:
    render(env, model)
finally:
    env.close()

# Using evostra

In [None]:
from controllers.evostra import *

def get_reward(weights):
    """Return the fitness of `weights` for the evostra optimiser.

    Loads `weights` into the notebook-level `model` and returns the result of
    one `simulate_single` call on the notebook-level `env`.
    """
    # `model` and `env` are only read here, so no `global` statement is needed.
    model.set_weights(weights)
    reward = simulate_single(env, model)
    return reward

In [None]:
# Optimise the model weights with evostra's EvolutionStrategy, starting from
# the model's current weights and scoring candidates with get_reward.
RUNS = 2000
es = EvolutionStrategy(model.get_weights(), get_reward, population_size=NPOP, sigma=STD, learning_rate=0.03, decay=DECAY, num_threads=8)
# Report the environment name (the original referenced an undefined `e`).
print(f'Running evostra NES for {RUNS} epochs on environment {envName}\n')
es.run(RUNS, print_step=100)

In [None]:
# Copy the weights held by the evostra optimiser back into the shared model.
optimized_weights = es.get_weights()
model.set_weights(optimized_weights)

In [None]:
# Show the evostra-trained policy; close the environment even if rendering
# is interrupted or raises.
try:
    render(env, model)
finally:
    env.close()

# Using Torch
## EvoGrad

In [None]:
import torch
from evograd import expectation
from evograd.distributions import Normal

In [None]:
# Trainable mean of the search distribution, one entry per model weight;
# requires_grad=True so the backward() call in the training loop fills mu.grad.
mu = torch.randn(len(model.get_weights()), requires_grad=True)  # population mean
# evograd Normal distribution built from the mean `mu` and the spread STD.
p = Normal(mu, STD)

In [None]:
# Train with EvoGrad: estimate the expected standardised fitness under the
# search distribution `p` and ascend its gradient with respect to `mu`.
print(f'Running EvoGrad NES for {EPOCHS} epochs on environment {envName}\n')
# Decay a local copy so the configured ALPHA stays intact for re-runs and for
# the other NES variants in this notebook.
alpha = ALPHA
for t in range(EPOCHS):
    sample = p.sample(NPOP)
    fitnesses = torch.tensor(simulate_batch(env, model, sample))
    # The epsilon avoids a division by zero when every individual scores the same.
    scaled_fitnesses = (fitnesses - fitnesses.mean()) / (fitnesses.std() + 1e-5)
    mean = expectation(scaled_fitnesses, sample, p=p)
    mean.backward()
    with torch.no_grad():
        mu += alpha * mu.grad  # gradient ascent on the expected fitness
        mu.grad.zero_()        # gradients accumulate across backward() calls
    alpha *= DECAY
    print(f"epoch: {t} | mean fitness: {fitnesses.mean():0.5} | learning rate: {alpha:.4}")
# Hand over a tensor without autograd history so the model does not receive a
# tensor that still requires grad.
model.set_weights(mu.detach())

In [None]:
# Show the EvoGrad-trained policy; close the environment even if rendering
# is interrupted or raises.
try:
    render(env, model)
finally:
    env.close()

## Pure Torch

In [None]:
import torch

In [None]:
# Train with a hand-written NES loop using torch tensors, then load the best
# candidate seen during the run into `model`.
torch.random.seed()
fitnesses = torch.zeros(NPOP)
w = model.get_weights()
# Start below any reachable score so environments with negative rewards can
# still register a best candidate.
rbest = float('-inf')
w_best = None
# Decay a local copy so the configured ALPHA stays intact for re-runs and for
# the other NES variants in this notebook.
alpha = ALPHA

print(f'Running torch NES for {EPOCHS} epochs on environment {envName}\n')
for i in range(EPOCHS):
    # One noise matrix per epoch: row j perturbs candidate j, and the same
    # matrix must be used for the weight update below.
    N = torch.randn(NPOP, len(w))
    for j in range(NPOP):
        w_try = w + STD * N[j]  # jitter w using gaussian of STD
        model.set_weights(w_try)
        fitnesses[j] = simulate_single(env, model)
        score = fitnesses[j].item()
        # Double check with a second rollout (different initial condition)
        # before accepting a new best.
        if score > rbest and simulate_single(env, model) > rbest:
            rbest = score
            w_best = copy.deepcopy(w_try)
    # Standardize rewards to zero mean / unit std; the epsilon guards
    # against a zero standard deviation.
    scaled_fitnesses = (fitnesses - torch.mean(fitnesses)) / (torch.std(fitnesses) + 1e-5)
    w += alpha / (NPOP * STD) * torch.matmul(N.T, scaled_fitnesses)
    alpha *= DECAY
    print(f'epoch {i} | best: {rbest:.2f} | current best: {torch.max(fitnesses).item():.1f} | STD: {STD:.2f} | ALPHA: {alpha:.3f}')
# Fall back to the current mean weights if no candidate was ever accepted
# (e.g. EPOCHS == 0).
model.set_weights(w_best if w_best is not None else w)

In [None]:
# Show the torch-trained policy; close the environment even if rendering
# is interrupted or raises.
try:
    render(env, model)
finally:
    env.close()

# Save weights

In [None]:
import pickle
# Persist the model weights to "<env id>_<framework>_weights.pkl".
envName = env.unwrapped.spec.id
# pickle.dump needs a binary file opened for writing ('wb'); the context
# manager flushes and closes the file even if pickling fails.
with open(f"{envName}_{framework}_weights.pkl", 'wb') as weights_file:
    pickle.dump(model.weights, weights_file)