In [None]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import os 
import torch
import torch.nn as neural
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
# Run on the first CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
import gym
from time import sleep
       
# Create the continuous-action MountainCar task and seed both the environment
# and NumPy's global RNG with the same fixed value (101).
environment = gym.make('MountainCarContinuous-v0')
environment.seed(101)
np.random.seed(101)

# Read the space descriptors once; the action bounds come from the action space itself.
action_space = environment.action_space
action_space_low, action_space_high = action_space.low, action_space.high
observation_space_shape = environment.observation_space.shape

# Report the observation shape, the action space and its lower/upper bounds.
print("MountainCar Environment's Observation Space", observation_space_shape)
print("MountainCar Environment's Action Space", action_space)
print("MountainCar Environment's Action Space Min", action_space_low)
print("MountainCar Environment's Action Space Max", action_space_high)

MountainCar Environment's Observation Space (2,)
MountainCar Environment's Action Space Box(1,)
MountainCar Environment's Action Space Min [-1.]
MountainCar Environment's Action Space Max [1.]


In [None]:
# Select the compute device: the first CUDA GPU if PyTorch can use one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
class MountainCarAgent(neural.Module):
    """Deterministic two-layer policy network whose weights are set externally.

    The network maps an observation to an action through one ReLU hidden layer
    and a tanh output layer. Its parameters are not trained by backpropagation
    here: they are overwritten from a flat vector (see ``assign_weight_dims``)
    and scored by rolling out an episode (see ``cem_policy_evaluate``).

    Args:
        environment: Gym-style environment exposing ``observation_space.shape``,
            ``action_space.shape``, ``reset()`` and ``step(action)``.
        hidden_layer (int): Number of units in the hidden layer.
    """

    def __init__(self, environment, hidden_layer=16):
        super().__init__()
        self.environment = environment
        # state, hidden layer, action sizes
        self.input_layer = environment.observation_space.shape[0]
        self.hidden_layer = hidden_layer
        self.output_layer = environment.action_space.shape[0]
        # define layers
        self.fullyconnected1 = neural.Linear(self.input_layer, self.hidden_layer)
        self.fullyconnected2 = neural.Linear(self.hidden_layer, self.output_layer)

    def assign_weight_dims(self, weights):
        """Overwrite all layer parameters from a flat weight vector.

        The vector is consumed in this order, each piece filling its target
        tensor in row-major order:
        ``fullyconnected1.weight``, ``fullyconnected1.bias``,
        ``fullyconnected2.weight``, ``fullyconnected2.bias``.

        Args:
            weights: Array-like holding exactly ``glean_weight_dims()`` values.

        Raises:
            ValueError: If ``weights`` does not hold the expected number of values.
        """
        flat = np.asarray(weights).reshape(-1)
        expected = self.glean_weight_dims()
        if flat.size != expected:
            raise ValueError(
                "expected {} weights, got {}".format(expected, flat.size)
            )

        targets = (
            self.fullyconnected1.weight,
            self.fullyconnected1.bias,
            self.fullyconnected2.weight,
            self.fullyconnected2.bias,
        )
        offset = 0
        for param in targets:
            count = param.numel()
            # Reshape straight to the parameter's own shape; copy_ handles the
            # dtype conversion and the transfer to the parameter's device.
            chunk = flat[offset:offset + count].reshape(tuple(param.shape))
            param.data.copy_(torch.from_numpy(chunk))
            offset += count

    def glean_weight_dims(self):
        """Return the total number of scalar parameters (weights and biases) of both layers."""
        return (self.input_layer+1)*self.hidden_layer + (self.hidden_layer+1)*self.output_layer

    def forward(self, x):
        """Compute the action for observation tensor ``x``.

        Returns:
            torch.Tensor: tanh-squashed output, detached from autograd and moved to the CPU.
        """
        x = F.relu(self.fullyconnected1(x))
        # torch.tanh replaces the deprecated F.tanh; the result is identical.
        x = torch.tanh(self.fullyconnected2(x))
        return x.detach().cpu()

    def cem_policy_evaluate(self, weights, gamma=1.0, max_t=5000):
        """Load ``weights`` into the network and roll out one episode.

        Args:
            weights: Flat weight vector accepted by ``assign_weight_dims``.
            gamma (float): Discount factor applied as ``gamma ** t`` to the reward at step ``t``.
            max_t (int): Maximum number of environment steps to take.

        Returns:
            float: Discounted sum of the rewards collected until ``done`` or ``max_t`` steps.
        """
        self.assign_weight_dims(weights)
        # Use the device the parameters live on instead of a module-level global,
        # so the method works wherever the module has been moved with .to().
        device = self.fullyconnected1.weight.device
        episodic_return = 0.0
        state = self.environment.reset()
        # Evaluation only: no gradients are needed for the rollout.
        with torch.no_grad():
            for t in range(max_t):
                state = torch.from_numpy(state).float().to(device)
                # Hand the environment a NumPy array rather than a torch tensor.
                action = self.forward(state).numpy()
                state, reward, done, _ = self.environment.step(action)
                episodic_return += reward * math.pow(gamma, t)
                if done:
                    break
        return episodic_return
    
# Build the policy for the MountainCar environment, then move its parameters to the selected device.
CEMAgent = MountainCarAgent(environment)
CEMAgent = CEMAgent.to(device)


def cem_rl_algorithm(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, top_frac=0.2, sigma=0.5):
    """Optimise the module-level ``CEMAgent`` with the cross-entropy method.

    Every iteration samples ``pop_size`` weight vectors around the current mean,
    scores each with ``CEMAgent.cem_policy_evaluate``, and replaces the mean with
    the average of the best-scoring ("elite") vectors. The new mean is then scored
    once more, recorded, and the agent's ``state_dict`` is written to
    ``checkpoint.pth``.

    Args:
        n_iterations (int): Maximum number of iterations.
        max_t (int): Step limit passed to the evaluation of each sampled candidate.
        gamma (float): Discount factor passed to the evaluation of each sampled candidate.
        print_every (int): Print the running average every this many iterations.
        pop_size (int): Number of candidates sampled per iteration (must be >= 1).
        top_frac (float): Fraction of the population kept as elites (at least one is kept).
        sigma (float): Standard deviation of the Gaussian sampling noise.

    Returns:
        list: Score of the mean elite weights for each iteration that was run.

    Raises:
        ValueError: If ``pop_size`` is smaller than 1.
    """
    if pop_size < 1:
        raise ValueError("pop_size must be at least 1, got {}".format(pop_size))

    # Keep at least one elite: with a count of 0, ``argsort()[-0:]`` would select
    # the entire population and turn the update into a plain average.
    top_list = max(1, int(pop_size*top_frac))

    # The parameter count is fixed for the agent, so look it up once.
    n_weights = CEMAgent.glean_weight_dims()

    scores_deque = deque(maxlen=100)
    scores = []
    top_weight = sigma*np.random.randn(n_weights)

    for i_iteration in range(1, n_iterations+1):
        weights_pop = [top_weight + (sigma*np.random.randn(n_weights)) for _ in range(pop_size)]
        rewards = np.array([CEMAgent.cem_policy_evaluate(weights, gamma, max_t) for weights in weights_pop])

        # argsort is ascending, so the last ``top_list`` indices are the best candidates.
        top_indices = rewards.argsort()[-top_list:]
        top_weights = [weights_pop[i] for i in top_indices]
        top_weight = np.array(top_weights).mean(axis=0)

        # Score the new mean undiscounted (gamma fixed at 1.0), using
        # cem_policy_evaluate's own default step limit rather than ``max_t``.
        reward = CEMAgent.cem_policy_evaluate(top_weight, gamma=1.0)
        scores_deque.append(reward)
        scores.append(reward)

        # The evaluation above loaded ``top_weight`` into the agent, so this
        # checkpoint holds the current mean elite weights.
        torch.save(CEMAgent.state_dict(), 'checkpoint.pth')

        average_score = np.mean(scores_deque)

        if i_iteration % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, average_score))

        if average_score>=90.0:
            # Report the iteration at which the averaging window began; clamp at 0
            # so an early solve (window not yet full) never prints a negative count.
            solved_at = max(i_iteration-scores_deque.maxlen, 0)
            print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(solved_at, average_score))
            break
    return scores

# Run the CEM training loop with its default hyperparameters; `scores` receives
# the list of per-iteration scores that cem_rl_algorithm returns.
scores = cem_rl_algorithm()



Episode 10	Average Score: -1.44
Episode 20	Average Score: -3.98
Episode 30	Average Score: -4.18
Episode 40	Average Score: 2.57
Episode 50	Average Score: 18.74
Episode 60	Average Score: 29.35
Episode 70	Average Score: 38.69
Episode 80	Average Score: 45.65
Episode 90	Average Score: 47.98
Episode 100	Average Score: 52.56
Episode 110	Average Score: 62.09
Episode 120	Average Score: 72.28
Episode 130	Average Score: 82.21
Episode 140	Average Score: 89.48

Environment solved in 47 iterations!	Average Score: 90.83
