In [1]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
 class ActorCritic(nn.Module):
    def __init__(self, env, hid_size=24):
        super(ActorCritic, self).__init__()
        self.env = env
        self.shape_size = env.observation_space.shape[0]
        self.hid_size = hid_size
        self.act_size = env.action_space.shape[0]
        self.FC_1 = nn.Linear(self.shape_size, self.hid_size)
        self.FC_2 = nn.Linear(self.hid_size, self.act_size)
        
    def forward(self, x):
        x = F.relu(self.FC_1(x))
        x = F.tanh(self.FC_2(x))
        return x.cpu().data
        
    def evaluate(self, weights, gamma=0.98, max_t=6000):
        self.set_weights(weights)
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            state = torch.from_numpy(state).float().to(device)
            action = self.forward(state)
            state, reward, done, _ = self.env.step(action)
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return
        
    def set_weights(self, weights):
        shape_size = self.shape_size
        hid_size = self.hid_size
        act_size = self.act_size
        FC_1_end = (shape_size*hid_size)+hid_size
        FC_1_W = torch.from_numpy(weights[:shape_size*hid_size].reshape(shape_size, hid_size))
        FC_1_b = torch.from_numpy(weights[shape_size*hid_size:FC_1_end])
        FC_2_W = torch.from_numpy(weights[FC_1_end:FC_1_end+(hid_size*act_size)].reshape(hid_size, act_size))
        FC_2_b = torch.from_numpy(weights[FC_1_end+(hid_size*act_size):])
        self.FC_1.weight.data.copy_(FC_1_W.view_as(self.FC_1.weight.data))
        self.FC_1.bias.data.copy_(FC_1_b.view_as(self.FC_1.bias.data))
        self.FC_2.weight.data.copy_(FC_2_W.view_as(self.FC_2.weight.data))
        self.FC_2.bias.data.copy_(FC_2_b.view_as(self.FC_2.bias.data))
    
    def get_weights_dim(self):
        return (self.shape_size+1)*self.hid_size + (self.hid_size+1)*self.act_size

In [3]:
device = torch.device("cpu")

env = gym.make('MountainCarContinuous-v0')
env.seed(111)
np.random.seed(111)

model = ActorCritic(env).to(device)

In [4]:
def cem(max_episodes=3000, max_t=1000, gamma=0.98, print_interval=20, pp_sz=50, top=0.23, sigma=0.5):
    scores_deque = deque(maxlen=100)
    scores = []
    best_weight = sigma*np.random.randn(model.get_weights_dim())

    for n_epi in range(1, max_episodes + 1):
        weights_pop = [best_weight + (sigma*np.random.randn(model.get_weights_dim())) for i in range(pp_sz)]
        rewards = np.array([model.evaluate(weights, gamma, max_t) for weights in weights_pop])

        elite_idxs = rewards.argsort()[-pp_sz*top:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        best_weight = np.array(elite_weights).mean(axis=0)

        reward = model.evaluate(best_weight, gamma=1.0)
        scores_deque.append(reward)
        scores.append(reward)
        
        if n_epi % print_interval == 0:
            print('CartPole-v1 # of episode : {}\t, avg score : {:.2f}'.format(n_epi, np.mean(scores_deque)))
    return scores

In [5]:
scores = cem()

MountainCar-v0 # of episode : 200	, avg score : -5.85
MountainCar-v0 # of episode : 400	, avg score : -0.84
MountainCar-v0 # of episode : 600	, avg score : 2.10
MountainCar-v0 # of episode : 800	, avg score : 10.15
MountainCar-v0 # of episode : 1000	, avg score : 13.65
MountainCar-v0 # of episode : 1200	, avg score : 23.90
MountainCar-v0 # of episode : 1400	, avg score : 45.34
MountainCar-v0 # of episode : 1600	, avg score : 57.37
MountainCar-v0 # of episode : 1800	, avg score : 75.27
MountainCar-v0 # of episode : 2000	, avg score : 82.39
MountainCar-v0 # of episode : 2200	, avg score : 89.20
MountainCar-v0 # of episode : 2400	, avg score : 90.54
MountainCar-v0 # of episode : 2600	, avg score : 91.40
MountainCar-v0 # of episode : 2800	, avg score : 91.56
MountainCar-v0 # of episode : 3000	, avg score : 92.01
