In [3]:
import gym
import numpy as np

ENV = gym.make("Acrobot-v1")

NUM_FEATURES = 6
LAMBDA       = 10
SIGMA        = 0.01
NUM_TRIALS   = 1

def sample_population(mu):
    return mu + SIGMA * np.random.randn(LAMBDA, NUM_FEATURES)

def sample_action(sample, observation):
    y = np.dot(observation, sample)
    if y <  0: return -1
    if y <  1: return  0
    if y >= 1: return  1

def evaluate(sample):
    total_rewards = np.zeros(NUM_TRIALS)
    for trial in range(NUM_TRIALS):
        total_reward = 0.0
        observation = ENV.reset()
        for step in range(500):
            action = sample_action(sample, observation)
            observation, reward, done, _ = ENV.step(action)
            total_reward += reward
            if done: break
        total_rewards[trial] = total_reward
    return np.mean(total_rewards)

def update_mu(population):
    return population[0] # TODO, but this works fine for cartpole

def play(mu):
    observation = ENV.reset()
    for step in range(500):
        ENV.render()
        action = sample_action(mu, observation)
        observation, _, done, _ = ENV.step(action)
        if done:
            print("Episode finished after {} timesteps".format(step + 1))
            break

In [4]:
mu = np.zeros(NUM_FEATURES)

for episode in range(50):
    population = sample_population(mu)
    rewards = map(evaluate, population)
    rewards = np.fromiter(rewards, float)
    sort_idx = np.argsort(rewards)
    rewards = rewards[sort_idx][::-1]
    population = population[sort_idx][::-1]
    mu = update_mu(population)
    if (episode + 1) % 5 == 0:
        print("episode: {}, reward: {}".format(episode + 1, rewards))
print(mu)
print("rewards: {}".format(rewards))

episode: 5, reward: [ -67.  -72.  -74.  -78.  -79.  -89.  -92.  -93. -500. -500.]
episode: 10, reward: [ -65.  -65.  -73.  -74.  -77.  -88. -104. -500. -500. -500.]
episode: 15, reward: [ -76.  -77.  -86.  -89.  -91.  -96.  -97.  -98. -500. -500.]
episode: 20, reward: [ -65.  -84.  -85.  -88.  -88.  -96. -105. -120. -132. -500.]
episode: 25, reward: [ -77.  -78.  -79.  -79.  -82. -100. -114. -500. -500. -500.]
episode: 30, reward: [ -65.  -65.  -65.  -66.  -67.  -78.  -82.  -90.  -91. -500.]
episode: 35, reward: [ -72.  -74.  -85.  -86.  -87.  -87.  -88. -112. -500. -500.]
episode: 40, reward: [-72. -73. -74. -77. -78. -81. -85. -86. -92. -98.]
episode: 45, reward: [ -64.  -64.  -65.  -72.  -79.  -81.  -94.  -94. -101. -122.]
episode: 50, reward: [ -72.  -84.  -84.  -85.  -88.  -99. -102. -108. -143. -500.]
[ 0.14822704 -0.03093725 -0.11040434  0.00468852  0.04022745 -0.09917339]
rewards: [ -72.  -84.  -84.  -85.  -88.  -99. -102. -108. -143. -500.]


In [5]:
play(mu)

Episode finished after 79 timesteps
