##### Based on https://colab.research.google.com/github/turing-club/info/blob/master/BipedalWalker_COGS_workshop.ipynb#scrollTo=DgzIRF71VxNm

In [None]:
import numpy as np
import gym
import time
from matplotlib import animation, pyplot as plt

In [None]:
# Normalizes the inputs
class Normalizer():
    """Track a running per-feature mean and variance and standardise inputs.

    The statistics are updated one sample at a time by ``observe`` using an
    incremental mean / sum-of-squared-deviations recurrence, so no history of
    past inputs is stored.

    Args:
        nb_inputs: Number of features in each observed vector.
        min_var: Lower bound applied to the variance so that ``normalize``
            never divides by (almost) zero.
    """

    def __init__(self, nb_inputs, min_var=1e-2):
        self.n = np.zeros(nb_inputs)          # number of samples seen so far
        self.mean = np.zeros(nb_inputs)       # running mean
        self.mean_diff = np.zeros(nb_inputs)  # running sum of squared deviations
        self.var = np.zeros(nb_inputs)        # floored variance, set by observe()
        self.min_var = min_var

    def observe(self, x):
        """Fold one sample ``x`` into the running mean and variance."""
        self.n += 1.0
        # The recurrence needs the mean from *before* this sample was added.
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min=self.min_var)

    def normalize(self, inputs):
        """Return ``inputs`` shifted by the running mean and scaled by the std.

        The variance floor is applied here as well, because ``self.var`` is
        still all zeros until ``observe`` has been called at least once;
        without the floor an early call would divide by zero.
        """
        obs_std = np.sqrt(np.maximum(self.var, self.min_var))
        return (inputs - self.mean) / obs_std

In [None]:
# Global experiment setup: RNG seed, environment, hyper-parameters and the
# zero-initialised weights of the two-layer linear policy.
np.random.seed(42)  # seeds NumPy's global RNG only
env = gym.make("BipedalWalker-v3")
anim_check = False # turn on for saving progress (not read anywhere in the visible code)
episode_length = 1000  # maximum number of env steps per rollout in explore()
nb_steps = 300  # the training loop runs nb_steps + 1 iterations
noise = 0.1  # scale applied to each random weight perturbation
learning_rate = 0.2  # step-size factor of the weight update
num_deltas = 16  # number of random perturbations evaluated per training step
input_size = env.observation_space.shape[0]
output_size = env.action_space.shape[0]
hidden_size = 4  # width of the intermediate (linear) layer
normalizer =  Normalizer(input_size)  # running statistics over observed states

# Policy weights: action = output_layer @ (hidden_layer @ normalized_state)
hidden_layer = np.zeros((hidden_size, input_size))
output_layer = np.zeros((output_size, hidden_size))
track_record = np.zeros(nb_steps + 1)  # evaluation reward recorded at each training step

In [None]:
# Roll out a single episode with the supplied weights
def explore(hidden_layer, output_layer):
    """Run one episode with the given policy weights and return its score.

    Each state is first fed to the global ``normalizer`` (updating its running
    statistics) and then normalised before being passed through the two
    linear layers. The per-step reward is clipped to [-1, 1] before being
    accumulated. The rollout stops when the environment reports ``done`` or
    after ``episode_length`` steps, whichever comes first.

    Args:
        hidden_layer: Weight matrix applied to the normalised state.
        output_layer: Weight matrix applied to the hidden activations.

    Returns:
        The sum of the clipped rewards collected during the episode.
    """
    total = 0.0
    state = env.reset()
    for _step in range(episode_length):
        normalizer.observe(state)
        features = normalizer.normalize(state)
        action = output_layer @ (hidden_layer @ features)
        state, reward, done, _info = env.step(action)
        # Clip the reward into [-1, 1] before accumulating it.
        capped = min(reward, 1)
        total += max(capped, -1)
        if done:
            break
    return total

In [None]:
# Random-search training loop: at every step the current weights are perturbed
# in num_deltas random directions, each direction is evaluated with a +delta and
# a -delta rollout, and the weights are moved along the reward-weighted sum of
# the perturbations.
init_time = time.time()
for step in range(nb_steps + 1):
    # initialize the random noise deltas and the positive/negative rewards
    positive_rewards, negative_rewards = np.zeros((2, num_deltas))
    deltas_hidden = np.random.randn(num_deltas, *hidden_layer.shape)
    deltas_output = np.random.randn(num_deltas, *output_layer.shape)

    # play an episode each with positive deltas and negative deltas, collect rewards
    for k, (delta_hidden, delta_output) in enumerate(zip(deltas_hidden, deltas_output)):
        hidden_positive = hidden_layer + delta_hidden * noise
        hidden_negative = hidden_layer - delta_hidden * noise
        output_positive = output_layer + delta_output * noise
        output_negative = output_layer - delta_output * noise
        positive_rewards[k] = explore(hidden_positive, output_positive)
        negative_rewards[k] = explore(hidden_negative, output_negative)

    # Compute the standard deviation of all 2 * num_deltas rewards.
    # Both reward holders are ndarrays, so `+` would add them element-wise
    # instead of joining them; concatenate explicitly.
    sigma_rewards = np.concatenate((positive_rewards, negative_rewards)).std()
    if sigma_rewards == 0.0:
        # Every rollout scored the same: avoid dividing by zero below and
        # fall back to an unscaled step.
        sigma_rewards = 1.0

    # compute the advantage of every addition
    diffs = positive_rewards - negative_rewards

    # Play an episode with the current (not yet updated) weights and record it
    track_record[step] = explore(hidden_layer, output_layer)

    # Update the policy; both layers share the same step-size factor.
    step_size = learning_rate / (num_deltas * sigma_rewards)
    # diffs is broadcast over the two weight axes so that each perturbation is
    # weighted by its reward difference; a small uniform random term is added.
    hidden_derivatives = np.sum((diffs[:, None, None] * deltas_hidden), 0) + np.random.random(hidden_layer.shape) * 0.02
    hidden_layer += step_size * hidden_derivatives
    output_derivatives = np.sum((diffs[:, None, None] * deltas_output), 0) + np.random.random(output_layer.shape) * 0.02
    output_layer += step_size * output_derivatives

    # and print the score
    print('Step: ', step, 'Reward: ', track_record[step])

    # Every 50 steps, render up to 500 frames of the current policy.
    if step % 50 == 0:
        state = env.reset()
        for i in range(500):
            env.render(mode='rgb_array')
            state, _, done, _ = env.step(output_layer @ (hidden_layer @ normalizer.normalize(state)))
            if done:
                break
        env.close()

KeyboardInterrupt: 

In [11]:
# Render the walking pattern produced by the current weights:
# up to 500 frames, stopping early if the episode finishes.
state = env.reset()
for frame in range(500):
    env.render(mode='rgb_array')
    features = normalizer.normalize(state)
    action = output_layer @ (hidden_layer @ features)
    state, _, done, _ = env.step(action)
    if done:
        break
env.close()

In [12]:
# Seconds elapsed since init_time was taken, i.e. since the training cell started.
time_diff = time.time() - init_time
print(time_diff)

1483.9563500881195
