In [0]:
import argparse
import gym
import numpy as np
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import time

In [0]:
# Setup adapted from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#forceEdit=true&sandboxMode=true&scrollTo=8A-1LTSH88EE
# Installs the Python packages (gym, pyvirtualdisplay) and the system packages
# (xvfb, python-opengl, ffmpeg) that the notebook presumably needs for the
# virtual display and the video recording used below.
# Output is discarded; remove " > /dev/null 2>&1" from a command to see what it does.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (46.1.3)


In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay


In [0]:
from pyvirtualdisplay import Display
# Create and start a pyvirtualdisplay Display (visible=0, 1400x900); the recorded
# output below shows it launching Xvfb.
# NOTE: this binds the global name `display`; IPython's display module is
# imported under the alias `ipythondisplay` in this notebook.
display = Display(visible=0, size=(1400, 900))
display.start()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Display a recorded episode inline in the notebook.

  Looks for MP4 files matching 'video/*.mp4', takes the first one in sorted
  (deterministic) order, base64-encodes it and displays it as an HTML
  <video> element with a data URI source. Prints "Could not find video" and
  returns when no MP4 file is present.
  """
  # glob returns matches in arbitrary order; sort so the choice is reproducible.
  mp4list = sorted(glob.glob('video/*.mp4'))
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is sufficient, and the context manager closes the handle.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a gym Monitor writing to './video' (force=True)."""
  return Monitor(env, './video', force=True)

In [0]:
def _build_parser():
    """Build the command-line parser holding the notebook's hyper-parameters."""
    cli = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    cli.add_argument(
        '--gamma', type=float, default=0.99, metavar='G',
        help='discount factor (default: 0.99)',
    )
    cli.add_argument(
        '--seed', type=int, default=543, metavar='N',
        help='random seed (default: 543)',
    )
    cli.add_argument(
        '--render', action='store_true',
        help='render the environment',
    )
    cli.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='interval between training status logs (default: 10)',
    )
    return cli


parser = _build_parser()
# parse_known_args collects unrecognised argv entries in `unknown` instead of
# raising an error on them.
args, unknown = parser.parse_known_args()


# Let's create an OpenAI Gym environment

In [0]:
# Smoke test: build a recorded CartPole environment, take one random action and
# show the resulting observation (the cell's last expression).
env = wrap_env(gym.make('CartPole-v0'))
env.reset()
action = env.action_space.sample()
observation, reward, done, info = env.step(action)
# Close explicitly so the Monitor wrapper is finalised here rather than whenever
# `env` happens to be rebound and garbage-collected.
env.close()
observation


array([-0.02686649, -0.23127172,  0.01375064,  0.2740873 ])

In [0]:
# Random-agent demo: the environment exposes env.reset() and env.step().
env = wrap_env(gym.make('CartPole-v0'))
env.seed(47)
torch.manual_seed(47)
env.reset()

# Render and act with uniformly sampled actions until the episode reports done.
done = False
while not done:
    env.render()
    # your agent goes here
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)

env.close()
show_video()

# Create actor and critic networks

In [0]:
class Actor(nn.Module):
    """Policy network: a two-layer MLP mapping an observation to action probabilities.

    Args:
        state_dim: size of the observation vector (default 4).
        hidden_dim: width of the hidden layer (default 128).
        n_actions: number of discrete actions (default 2).
    """

    def __init__(self, state_dim=4, hidden_dim=128, n_actions=2):
        super(Actor, self).__init__()
        self.affine1 = nn.Linear(state_dim, hidden_dim)
        self.affine2 = nn.Linear(hidden_dim, n_actions)

    def forward(self, x):
        """Return action probabilities for `x`.

        `x` has the observation features on its last axis, either a batch
        `(B, state_dim)` or a single observation `(state_dim,)`. The output has
        the same leading shape with `n_actions` on the last axis, and each row
        sums to 1.
        """
        x = F.relu(self.affine1(x))
        action_scores = self.affine2(x)
        # Normalise over the action axis (the last one) so that both batched and
        # unbatched inputs work.
        prob = F.softmax(action_scores, dim=-1)
        return prob

class Critic(nn.Module):
    """State-value network: a two-layer MLP mapping an observation to a scalar value.

    Args:
        state_dim: size of the observation vector (default 4).
        hidden_dim: width of the hidden layer (default 128).
    """

    def __init__(self, state_dim=4, hidden_dim=128):
        super(Critic, self).__init__()
        self.affine1 = nn.Linear(state_dim, hidden_dim)
        self.affine2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the value estimate for `x`.

        A batch `(B, state_dim)` yields shape `(B,)`; a single observation
        `(state_dim,)` yields a 0-d tensor.
        """
        x = F.relu(self.affine1(x))
        # Drop only the trailing size-1 output axis. A bare squeeze() would also
        # drop the batch axis when B == 1.
        v = self.affine2(x).squeeze(-1)
        return v

In [0]:
def select_action(state):
    """Sample an action for one observation from the current policy.

    Args:
        state: NumPy array holding a single observation.

    Returns:
        A pair `(action, log_prob)`: the sampled action as a Python number, and
        the log-probability tensor of that action. The tensor keeps the batch
        axis added here, so callers index it with `[0]`.
    """
    # The module-level `actor` is fed a batch of one observation.
    obs_batch = torch.from_numpy(state).float().unsqueeze(0)
    policy = Categorical(actor(obs_batch))
    sampled = policy.sample()
    return sampled.item(), policy.log_prob(sampled)
    
def rollout():
    """Play one episode in the module-level `env` with the current policy.

    Returns:
        `(states, rewards, log_probs)`, three lists of equal length with one
        entry per step: the observation the action was chosen from (as a list),
        the reward received, and the log-probability of the chosen action.
    """
    states, rewards, log_probs = [], [], []

    state = env.reset()
    done = False
    while not done:
        # Choose an action for the current observation and record both.
        action, log_prob = select_action(state)
        states.append(list(state))
        log_probs.append(log_prob[0])

        # Advance the environment and record the reward it returned.
        state, reward, done, _ = env.step(action)
        rewards.append(reward)

    return states, rewards, log_probs

In [0]:
def train(states,rewards,log_probs):
    """Take one REINFORCE step on the actor using the whole-episode return.

    Args:
        states: per-step observations (unused by this variant).
        rewards: per-step rewards; their undiscounted sum weights the update.
        log_probs: per-step log-probability tensors of the chosen actions.

    Updates the module-level actor through `actor_optim`. The critic is not
    trained here.
    """
    episode_return = np.sum(rewards)
    # Log-probability of the whole trajectory, i.e. the sum over steps.
    trajectory_log_prob = torch.stack(log_probs).sum()

    # Baseline variant, kept disabled:
    # value = critic(torch.tensor(states[0]))
    # actor_loss = - log_probs_paths*(R-value.detach())

    # take a backward step for actor
    actor_loss = -trajectory_log_prob * episode_return
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # Critic update for the baseline variant, kept disabled:
    # critic_loss = (value-R)**2
    # critic_optim.zero_grad()
    # critic_loss.backward()
    # critic_optim.step()


In [0]:
# Train with whichever `train` function is currently defined.
# A fresh, unwrapped environment is created here so that training does not depend
# on whatever env an earlier cell left behind (the random-agent demo closes its env).
env = gym.make('CartPole-v0')
env.seed(347)
torch.manual_seed(347)

# create actor and critic network
actor = Actor()
critic = Critic()

# create optimizers
actor_optim = optim.Adam(actor.parameters(), lr=1e-2)
critic_optim = optim.Adam(critic.parameters(), lr=1e-2)


# Exponential moving average of the episode length, used as the "solved" criterion.
running_reward = 10
for i_episode in range(1000):
    states, rewards, log_probs = rollout()
    t = len(rewards)
    running_reward = running_reward * 0.9 +  t * 0.1
    train(states, rewards, log_probs)
    if i_episode % args.log_interval == 0:
        print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
            i_episode, t, running_reward))
    if running_reward > env.spec.reward_threshold:
        print("Solved! Running reward is now {} and "
              "the last episode runs to {} time steps!".format(running_reward, t))
        break

Episode 0	Last length:    14	Average length: 10.40
Episode 10	Last length:    14	Average length: 21.08
Episode 20	Last length:    28	Average length: 21.69
Episode 30	Last length:    15	Average length: 19.59
Episode 40	Last length:    17	Average length: 17.73
Episode 50	Last length:    11	Average length: 16.97
Episode 60	Last length:    25	Average length: 21.27
Episode 70	Last length:    21	Average length: 22.48
Episode 80	Last length:    15	Average length: 23.35
Episode 90	Last length:    15	Average length: 18.42
Episode 100	Last length:    14	Average length: 18.94
Episode 110	Last length:    17	Average length: 18.43
Episode 120	Last length:    42	Average length: 30.50
Episode 130	Last length:    52	Average length: 38.98
Episode 140	Last length:    56	Average length: 52.03
Episode 150	Last length:    50	Average length: 47.93
Episode 160	Last length:    30	Average length: 46.52
Episode 170	Last length:    35	Average length: 45.93
Episode 180	Last length:    42	Average length: 41.54
Epis

# Using Causality Trick

In [0]:
def train(states,rewards,log_probs):
    """Take one actor and one critic step using discounted reward-to-go.

    Each step's log-probability is weighted only by the rewards that follow it
    (the "causality trick"), discounted by `args.gamma`.

    Args:
        states: per-step observations, a list of equal-length float sequences.
        rewards: per-step rewards, same length as `states`.
        log_probs: per-step log-probability tensors of the chosen actions.

    Reads the module-level `args`, `critic`, `actor_optim` and `critic_optim`.
    The actor loss is `-mean((G - V(s)) * log_prob)` with the critic output
    detached. The critic is regressed onto the same returns `G` with an MSE loss.
    """
    # Reward-to-go G_t = r_t + gamma * G_{t+1}, accumulated from the last step
    # backwards, then flipped once. This is linear, where repeated insert(0, ...)
    # would be quadratic.
    returns = []
    running = 0.0
    for step_reward in reversed(rewards):
        running = step_reward + args.gamma * running
        returns.append(running)
    returns.reverse()

    rewards_path = torch.tensor(returns, dtype=torch.float32)
    # Standardise the returns (additional tuning). std() of a single element is
    # NaN, which would poison every parameter, so one-step episodes are left as is.
    if rewards_path.numel() > 1:
        rewards_path = (rewards_path - rewards_path.mean()) / (rewards_path.std() + 1e-8)
    log_probs_paths = torch.stack(list(log_probs))

    # Explicit float32 matches the network weights whatever scalar type the
    # observations carry. reshape(-1) keeps one value per step even for a batch of one.
    value = critic(torch.tensor(states, dtype=torch.float32)).reshape(-1)

    # take a backward step for actor
    advantage = rewards_path - value.detach()
    actor_loss = -torch.mean(advantage * log_probs_paths)
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # take a backward step for critic
    loss_fn = torch.nn.MSELoss()
    critic_loss = loss_fn(value, rewards_path)
    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()


In [0]:
# Train with whichever `train` function is currently defined.
# A fresh, unwrapped environment is created here so that training does not depend
# on whatever env an earlier cell left behind (the random-agent demo closes its env).
env = gym.make('CartPole-v0')
env.seed(347)
torch.manual_seed(347)
# create actor and critic network
actor = Actor()
critic = Critic()

# create optimizers
actor_optim = optim.Adam(actor.parameters(), lr=1e-2)
critic_optim = optim.Adam(critic.parameters(), lr=1e-2)



# Exponential moving average of the episode length, used as the "solved" criterion.
running_reward = 10
for i_episode in range(1000):
    states, rewards, log_probs = rollout()
    t = len(rewards)
    running_reward = running_reward * 0.9 +  t * 0.1
    train(states, rewards, log_probs)
    if i_episode % args.log_interval == 0:
        print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
            i_episode, t, running_reward))
    if running_reward > env.spec.reward_threshold:
        print("Solved! Running reward is now {} and "
              "the last episode runs to {} time steps!".format(running_reward, t))
        break

Episode 0	Last length:    14	Average length: 10.40
Episode 10	Last length:    40	Average length: 28.51
Episode 20	Last length:    20	Average length: 38.09
Episode 30	Last length:    29	Average length: 44.47
Episode 40	Last length:    96	Average length: 110.93
Episode 50	Last length:   200	Average length: 137.88
Episode 60	Last length:   200	Average length: 168.94
Episode 70	Last length:   200	Average length: 189.17
Solved! Running reward is now 195.3375189744868 and the last episode runs to 200 time steps!


In [0]:
# Run the current policy for one recorded episode and show the video.
env = wrap_env(gym.make('CartPole-v0'))
state = env.reset()
it = 0  # number of steps taken in this episode
done = False
# Nothing is trained here, so no autograd graph is needed and no training
# buffers are touched.
with torch.no_grad():
    while not done:
        # select an action
        action, _ = select_action(state)

        # take the action and move to next state
        state, reward, done, _ = env.step(action)
        env.render()
        it += 1

print(it)
env.close()
show_video()

200


## More interesting games...


https://gym.openai.com/envs/#atari

In [0]:
# Pick one Atari environment to record; the alternatives are left commented out
# so they can be swapped in.
# env = wrap_env(gym.make("MsPacman-v0"))
# env = wrap_env(gym.make("Pong-v0"))
# env = wrap_env(gym.make("Bowling-v0"))
env = wrap_env(gym.make("Breakout-v0"))
# Print the action space of the chosen environment.
print(env.action_space)

Discrete(4)


In [0]:
# Reset the environment and show the shape of the first observation
# (the cell's last expression).
observation = env.reset()
observation.shape

(210, 160, 3)

In [0]:

# Random agent on the current `env`; this cell does not call env.reset() itself.
done = False
while not done:
    env.render()

    # your agent goes here
    action = env.action_space.sample()

    observation, reward, done, info = env.step(action)

env.close()
show_video()

In [0]:
# S_t = [s_t, s_{t-1}, s_{t-2}]
# S_0 = [s_0, s_0, s_0]