In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch import optim

import numpy as np

import gym
from gym.wrappers import Monitor
from pprint import pprint
# from pyvirtualdisplay import Display
# from IPython import display as ipythondisplay
# from IPython.display import clear_output
# import base64

from pathlib import Path

In [2]:
import utils

## Setup the environment
See https://github.com/openai/gym/wiki/Leaderboard for description of OpenAIGym environments

Discrete control settings : 
* "Taxi-v3" : episodic & discrete setting both for states and actions (S/N/E/W/Pickup/Dropoff)
* __"CartPole-v0" / "CartPole-v1" : discrete action and continuous state space__
* __"MountainCar-v0" : discrete action and continuous state space__

Continuous control settings :
* __"MountainCarContinuous-v0" (both state and action continuous spaces)__
* "LunarLanderContinuous-v2" (requires box2D)
* "CarRacing-v0" : "Discrete control is reasonable in this environment as well" (requires box2D)

* "MsPacman-v0" (requires the Atari dependency)

Settings used by the authors (requires the MuJoCo dependencies, 30 days free trial) : https://github.com/openai/mujoco-py
* "HalfCheetah-v1" (use v2 now?)
* "Hopper-v1"
* etc.

## Configuration 

In [264]:
# Experiment settings gathered in a single dict; the environment cell reads config['env'].
# Other environments that were tried: 'CartPole-v1', 'MountainCarContinuous-v0'.
config = {
    'env': 'MountainCar-v0',
    'gamma': 0.99,  # Discount rate
    'value_network': {'lr': 1e-3},
    'actor_network': {'lr': 1e-3},
    # Options below are currently disabled:
    # 'eps_clipping': 0.2,
    # 'd_targ': 0.01,
    # 'beta_KL': 3,
    # 'entropy': 1e-3,
    # 'epochs': 10,
    # 'batch_size': 10,
    'seed': 1,
}

print("Training config : \n", config)

Training config : 
 {'env': 'MountainCar-v0', 'gamma': 0.99, 'value_network': {'lr': 0.001}, 'actor_network': {'lr': 0.001}, 'seed': 1}


In [263]:
## Define environment
env = gym.make(config['env'])

## Save episode
# env = Monitor(env, "./gym-results", force=True)

## Action and state spaces
# A Box space is continuous and carries low/high bounds; every other space
# type is reported as discrete.
print("State Space : {}".format(env.observation_space))
if isinstance(env.observation_space, gym.spaces.Box):
    print("continuous state space")
    print("Lower bound : ", env.observation_space.low)
    print("Upper bound : ", env.observation_space.high)
else:
    print("discrete state space")

print("\nAction Space : {}".format(env.action_space))
if isinstance(env.action_space, gym.spaces.Box):
    print("continuous action space")
    print("Lower bound : ", env.action_space.low)
    print("Upper bound : ", env.action_space.high)
else:
    print("discrete action space")

print("\nReward range", env.reward_range)

## Reset the environment
env.reset()
print("\nInitial state : ",env.state)
# Peel off the outermost wrapper returned by gym.make (presumably the
# time-limit wrapper -- TODO confirm) and keep working with the inner env.
env = env.env
# env.render()

## Sample randomly one action
action = env.action_space.sample()
print("Sampled action : " , action)
# Pass the action positionally: environments name their step() argument
# differently, and a positional call works for all of them without having to
# swallow exceptions.
state, reward, done, info = env.step(action)
# env.render()
print("\nEpisode over : ",done)

State Space : Box(2,)
continuous state space
Lower bound :  [-1.2  -0.07]
Upper bound :  [0.6  0.07]

Action Space : Discrete(3)
discrete action space

Reward range (-inf, inf)

Initial state :  [-0.55824552  0.        ]
Sampled action :  1

Episode over :  False


## Actor and critic networks 


Open question : __/!\ the actor and critic network architectures are not defined precisely in the paper ??__

NB : no parameter sharing between policy and value function

In [252]:
class CustomValueNetwork(nn.Module):
    """
    Critic network: approximates the value function V of a state given as input.

    Fully connected network with two hidden layers of identical width, each
    followed by a ReLU, and a final linear output layer.

    Inputs :
    input_size : dimension of each state
    hidden_size : number of units in each hidden layer
    output_size : dimension of the value function estimate (1 by default)
    """

    def __init__(self, input_size, hidden_size, output_size = 1):
        super().__init__()
        # Layers are declared in input -> output order.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the value estimate for `x` as a tensor (input is cast to float32)."""
        hidden = x.float()
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # No activation on the output layer: the estimate is unbounded.
        return self.fc3(hidden)

    def predict(self, x):
        """Return the first entry (along the leading axis) of the detached estimate as NumPy."""
        estimate = self(x).detach()
        return estimate.numpy()[0]

In [253]:
# Smoke test: build a critic sized for the current environment's state space
# and evaluate it on the environment's current state.
critic = CustomValueNetwork(
    input_size=env.observation_space.shape[0],
    hidden_size=16,
    output_size=1,
)
critic(torch.tensor(env.state))

tensor([-0.0777], grad_fn=<AddBackward0>)

In [257]:
class CustomDiscreteActorNetwork(nn.Module):
    """
    Custom policy model network for discrete action space.

    Fully connected network with two tanh hidden layers of identical width and
    a softmax output, so the forward pass returns one probability per action.

    Inputs :
    input_size : state space dimension
    hidden_size : number of units in each hidden layer (64 in author's paper for continuous action space)
    action_size : action space dimension (number of discrete actions)
    """
    def __init__(self, input_size, hidden_size, action_size):
        super(CustomDiscreteActorNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the action probabilities for state(s) `x`."""
        # Cast to float32 so that float64 states (e.g. NumPy arrays) are accepted
        # by the float32 layers.
        out = torch.tanh(self.fc1(x.float()))
        # Each layer is applied exactly once.
        out = torch.tanh(self.fc2(out))
        out = F.softmax(self.fc3(out), dim=-1)
        return out

    def select_action(self, x):
        """Sample one action index per state from the policy; returns a NumPy integer array."""
        return torch.multinomial(self(x), 1).detach().numpy()

__Questions :__
* Is a Gaussian distribution appropriate for bounded actions ?
* How should the std parameter be set ?

In [255]:
class ContinuousActorNetwork(nn.Module):
    """
    Policy model network for continuous action space (from the paper).

    Fully connected network with two tanh hidden layers of identical width.
    The output layer is also passed through tanh, so the predicted mean lies
    in (-1, 1). Actions are sampled from a Gaussian centred on that mean with
    a fixed standard deviation.

    Inputs :
    input_size : state space dimension
    hidden_size : number of units in each hidden layer
    action_size : action space dimension
    """
    def __init__(self, input_size, hidden_size, action_size):
        super(ContinuousActorNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)
        # Fixed (non-trainable) per-dimension std, exp(-0.5) for every action
        # dimension. The choice of this value is an open question in the notebook.
        self.std = torch.tensor(np.exp(-0.5 * np.ones(action_size))).float() #???

    def forward(self, x):
        """Return the mean of the Gaussian policy for state(s) `x`."""
        out = torch.tanh(self.fc1(x.float()))
        # Each layer is applied exactly once.
        out = torch.tanh(self.fc2(out))
        out = torch.tanh(self.fc3(out))
        return out

    def select_action(self, x):
        """Sample an action from N(mean(x), std) and return it as a NumPy array."""
        mean = self(x)
        # Expand std to the shape of the mean so batched states are supported too.
        return torch.normal(mean, self.std.expand_as(mean)).detach().numpy()

In [259]:
# test
# Pick the actor matching the action space: a Discrete space gets the softmax
# policy, anything else gets the Gaussian policy.
if isinstance(env.action_space, gym.spaces.Discrete):
    actor = CustomDiscreteActorNetwork(env.observation_space.shape[0], 64, env.action_space.n)
else:
    actor = ContinuousActorNetwork(env.observation_space.shape[0], 64, env.action_space.shape[0])

# Cast explicitly to float32: depending on whether the environment was just
# reset or stepped, env.state may come back as float64, which the float32
# layers reject.
state_tensor = torch.as_tensor(env.state, dtype=torch.float32)
print("Mean : ", actor(state_tensor))
print("Sampled action : ", actor.select_action(state_tensor))

Mean :  tensor([0.3051, 0.3573, 0.3377], grad_fn=<SoftmaxBackward>)
Sampled action :  [1]
