In [1]:
# Environment setup: installs gym and pyvirtualdisplay via pip, plus the system
# packages xvfb, python-opengl, ffmpeg and cmake via apt-get.
# Remove " > /dev/null 2>&1" to see what is going on under the hood.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# NOTE(review): `apt-get update` runs after the first `apt-get install` above;
# presumably it should come first — confirm.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[all] > /dev/null 2>&1

# NOTE(review): the two lines below differ only in letter case; pip presumably
# resolves both to the same package, so one is likely redundant — confirm.
!pip install Box2D
!pip install box2d

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (46.1.3)
Collecting Box2D
[?25l  Downloading https://files.pythonhosted.org/packages/a9/0b/d48d42dd9e19ce83a3fb4eee074e785b6c6ea612a2244dc2ef69427d338b/Box2D-2.3.10-cp36-cp36m-manylinux1_x86_64.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 9.2MB/s 
[?25hInstalling collected packages: Box2D
Successfully installed Box2D-2.3.10


In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T  

In [2]:
# True when the active matplotlib backend name contains 'inline';
# plot() reads this flag to decide whether to clear the cell output.
is_ipython = 'inline' in matplotlib.get_backend()
# if is_ipython: from IPython import display

# Start a non-visible 1400x900 virtual display (backed by Xvfb, per the cell
# output) — presumably so the gym environment can render without a real screen.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1007'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1007'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  """Embed a recorded episode video in the notebook output.

  Searches ``video/*.mp4`` relative to the current working directory. If at
  least one file matches, the first match returned by ``glob`` is read,
  base64-encoded and displayed as an inline HTML ``<video>`` element through
  ``ipythondisplay.display``. If nothing matches, prints
  "Could not find video" instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is all that is needed (write access would fail on
    # read-only files), and the context manager closes the handle promptly.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")
    

def wrap_env(env):
  """Return ``env`` wrapped in a gym ``Monitor`` writing to ``./video``.

  The monitor is created with ``force=True``.
  """
  return Monitor(env, './video', force=True)

The “Critic” estimates the value function. This could be the action-value (the Q value) or state-value (the V value).

The “Actor” updates the policy distribution in the direction suggested by the Critic (such as with policy gradients).

In [0]:
class ActorCriticNetwork(nn.Module):
  """Fully connected actor-critic network with a shared two-layer trunk.

  Two ReLU-activated linear layers feed two linear heads:
  ``pi`` (``n_actions`` raw, unnormalised action scores) and ``v`` (a single
  state-value estimate). The module owns its Adam optimizer and moves itself
  to CUDA when available, otherwise CPU.

  Args:
    alpha: learning rate passed to Adam.
    input_dims: sequence that is unpacked into ``nn.Linear`` as the input
      size(s) of the first layer (e.g. ``[8]``).
    fc1_dims: width of the first hidden layer.
    fc2_dims: width of the second hidden layer.
    n_actions: number of outputs of the policy head.
  """
  def __init__(self,alpha,input_dims,fc1_dims,fc2_dims,n_actions):
    super().__init__()
    self.alpha = alpha
    self.input_dims = input_dims
    self.fc1_dims = fc1_dims
    self.fc2_dims = fc2_dims
    self.n_actions = n_actions
    self.fc1 = nn.Linear(*self.input_dims,self.fc1_dims)
    self.fc2 = nn.Linear(self.fc1_dims,self.fc2_dims)

    # Separate heads on top of the shared trunk.
    self.pi = nn.Linear(self.fc2_dims,self.n_actions)
    self.v = nn.Linear(self.fc2_dims,1)

    self.optimizer = optim.Adam(self.parameters(),lr=self.alpha)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.to(self.device)

  def forward(self, observation):
    """Run the network on ``observation``.

    Args:
      observation: array-like, NumPy array or tensor of observations; it is
        converted to a float32 tensor on ``self.device``.

    Returns:
      Tuple ``(pi, v)``: the policy head output and the value head output.
    """
    # torch.as_tensor (rather than the legacy torch.Tensor constructor) gives
    # an explicit dtype/device conversion for lists, arrays and tensors alike,
    # and never interprets an integer argument as a size.
    state = torch.as_tensor(observation, dtype=torch.float32, device=self.device)
    x = F.relu(self.fc1(state))
    x = F.relu(self.fc2(x))
    pi = self.pi(x)
    v = self.v(x)

    return pi,v

Here the agent chooses an action on the basis of the evaluation that the network makes when presented with the current state of the environment.

The Critic evaluates the actor's choice of action by evaluating the value function.

In [0]:
class NewAgent(object):
  """One-step actor-critic agent.

  Holds an ``ActorCriticNetwork`` and performs one gradient update per
  environment transition: ``choose_action`` samples an action and remembers
  its log-probability, ``learn`` then uses the TD error of that transition to
  update both the policy and the value estimate.

  Args:
    alpha: learning rate forwarded to the network's optimizer.
    input_dims: observation dimensions forwarded to the network.
    gamma: discount factor applied to the next state's value.
    layer1_size: width of the first hidden layer.
    layer2_size: width of the second hidden layer.
    n_actions: number of discrete actions.
  """
  def __init__(self,alpha,input_dims,gamma=0.99,layer1_size=256,layer2_size=256,n_actions=4):
    self.gamma = gamma
    self.actor_critic = ActorCriticNetwork(alpha,input_dims,layer1_size,layer2_size,n_actions=n_actions)
    # Log-probability of the most recently sampled action; set by choose_action.
    self.log_probs = None

  def choose_action(self,observation):
    """Sample an action for ``observation`` and store its log-probability.

    Returns:
      The sampled action as a Python number (``action.item()``).
    """
    logits,_ = self.actor_critic.forward(observation)
    # Categorical(logits=...) normalises over the last dimension itself, which
    # avoids the deprecated implicit-dim softmax call.
    action_probs = torch.distributions.Categorical(logits=logits)
    action = action_probs.sample()
    self.log_probs = action_probs.log_prob(action)

    return action.item()

  def learn(self, state, reward, state_, done):
    """Apply one actor-critic update for the transition just taken.

    Args:
      state: observation the last action was chosen from.
      reward: scalar reward received.
      state_: resulting observation.
      done: whether the episode ended; masks the bootstrap term when true.

    Raises:
      RuntimeError: if ``choose_action`` has not been called yet, so there is
        no stored log-probability to build the actor loss from.
    """
    if self.log_probs is None:
      raise RuntimeError("learn() called before choose_action(): no log-probability stored")

    self.actor_critic.optimizer.zero_grad()

    _,critic_value = self.actor_critic.forward(state)
    _,critic_value_ = self.actor_critic.forward(state_)

    reward = torch.tensor(reward, dtype=torch.float).to(self.actor_critic.device)
    # TD error: r + gamma * V(s') * (1 - done) - V(s)
    delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value

    # The TD error acts as a fixed weight on the policy gradient, so it is
    # detached here; otherwise the actor term would also push gradients into
    # the value head. Negated because the optimizer minimises.
    actor_loss = -self.log_probs*delta.detach()
    critic_loss = delta**2

    (actor_loss+critic_loss).backward()
    self.actor_critic.optimizer.step()


In [0]:
def plot(values,moving_avg_period):
  """Redraw figure 2 with ``values`` and their moving average.

  Clears the figure, plots the raw series and the series returned by
  ``get_moving_average(moving_avg_period, values)``, prints the number of
  entries and the latest moving-average value, and, when ``is_ipython`` is
  set, asks IPython to clear the cell output once new output arrives.
  """
  plt.figure(2)
  plt.clf()
  plt.xlabel('Episode')
  plt.ylabel('Duration')
  plt.title('Training..')
  plt.plot(values)
  smoothed = get_moving_average(moving_avg_period,values)
  plt.plot(smoothed)
  plt.pause(0.001)
  latest_avg = smoothed[-1]
  print("Episode",len(values),"\n",moving_avg_period,"episode moving avg:",latest_avg)
  if is_ipython:
    ipythondisplay.clear_output(wait=True)

def get_moving_average(period,values):
  """Return the trailing ``period``-point moving average of ``values``.

  The result is a float32 NumPy array with one entry per input value. The
  first ``period - 1`` entries are zero; if there are fewer than ``period``
  values, every entry is zero.
  """
  series = torch.tensor(values,dtype=torch.float)
  count = len(series)

  # Not enough data for a single full window yet.
  if count < period:
    return torch.zeros(count).numpy()

  windows = series.unfold(dimension=0, size=period, step=1)
  window_means = windows.mean(dim=1).flatten(start_dim=0)
  # Left-pad with zeros so the output lines up with the input index-for-index.
  padded = torch.cat((torch.zeros(period-1), window_means))
  return padded.numpy()

In [0]:
# Train the actor-critic agent on LunarLander-v2, updating after every step
# and re-plotting the score history after every episode.
agent = NewAgent(alpha=0.00001, input_dims=[8], gamma=0.99, n_actions=4,layer1_size=2048,layer2_size=512)

# NOTE(review): `.unwrapped` drops whatever wrappers gym.make() applied —
# presumably including any episode step limit; confirm episodes always end.
env = wrap_env(gym.make('LunarLander-v2').unwrapped)
score_history = []
num_episodes = 2000

# try/finally so the environment (and its video monitor) is closed even when
# training is interrupted or raises part-way through.
try:
  for i in range(num_episodes):
    done = False
    observation = env.reset()
    score = 0
    while not done:
      action = agent.choose_action(observation)
      observation_,reward,done,info = env.step(action)
      # One online update per transition, using the log-prob stored by choose_action.
      agent.learn(observation,reward,observation_,done)
      observation = observation_
      score+=reward

    score_history.append(score)
    # print("episode",i,"score %.2f" %score)
    plot(score_history,100)
finally:
  env.close()

show_video()

  if __name__ == '__main__':
