<a href="https://colab.research.google.com/github/ArshT/Reinforcement_Learning_Basic/blob/master/PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab setup: install gym plus the system packages used for headless rendering.
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y

# Atari extras for gym.
!pip install gym[atari]

# pyvirtualdisplay supplies the Display wrapper that is started below.
!pip install pyvirtualdisplay
# NOTE(review): `piglet` may be a typo for `pyglet` (the windowing library) - confirm before changing.
!pip install piglet

# Start an invisible 1400x900 virtual display; this only needs to run once
# per session, before any agent is trained.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()


# Fallback: when DISPLAY is still unset or empty, start Xvfb through the helper
# script expected at ../xvfb (not part of this notebook) and point DISPLAY at :1.
# If you are running locally with a real display, this branch is skipped.
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

#
# Import libraries
#
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

"""
Utility functions for recording a video of a gym environment and displaying it.
To enable video recording, wrap the environment: `env = wrap_env(env)`
"""

def show_video():
  """Embed the first recorded episode video in the notebook output.

  Looks for ``*.mp4`` files in the ``video/`` directory, base64-encodes the
  first match and displays it as an inline, looping HTML5 ``<video>``
  element. Prints "Could not find video" when no file matches.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Open read-only inside a context manager so the handle is closed promptly
  # and no write permission on the recording is required.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` targeting the ./video directory.

  The monitor is created with `force=True` (presumably so that recordings
  already present in the directory are overwritten - confirm against the
  gym version in use).
  """
  return Monitor(env, './video', force=True)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  libgle3
The following NEW packages will be installed:
  python-opengl
0 upgraded, 1 newly installed, 0 to remove and 14 not upgraded.
Need to get 496 kB of archives.
After this operation, 5,416 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+dfsg-1 [496 kB]
Fetched 496 kB in 0s (3,074 kB/s)
Selecting previously unselected package python-opengl.
(Reading database ... 144793 files and directories currently installed.)
Preparing to unpack .../python-opengl_3.1.0+dfsg-1_all.deb ...
Unpacking python-opengl (3.1.0+dfsg-1) ...
Setting up python-opengl (3.1.0+dfsg-1) ...
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 14 not upgraded.
Need to get 783 kB o

In [2]:
# Install spinningup on CoLab
!git clone https://github.com/openai/spinningup.git
!cd spinningup
#!pip install -e . # this will incur error: File "setup.py" not found. Directory cannot be installed in editable mode: /content
!pip install -e spinningup

Cloning into 'spinningup'...
remote: Enumerating objects: 1263, done.[K
remote: Total 1263 (delta 0), reused 0 (delta 0), pack-reused 1263[K
Receiving objects: 100% (1263/1263), 31.02 MiB | 15.54 MiB/s, done.
Resolving deltas: 100% (590/590), done.
Obtaining file:///content/spinningup
Collecting cloudpickle==1.2.1
  Downloading https://files.pythonhosted.org/packages/09/f4/4a080c349c1680a2086196fcf0286a65931708156f39568ed7051e42ff6a/cloudpickle-1.2.1-py2.py3-none-any.whl
Collecting gym[atari,box2d,classic_control]~=0.15.3
[?25l  Downloading https://files.pythonhosted.org/packages/e0/01/8771e8f914a627022296dab694092a11a7d417b6c8364f0a44a8debca734/gym-0.15.7.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 17.1MB/s 
Collecting matplotlib==3.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/57/4f/dd381ecf6c6ab9bcdaa8ea912e866dedc6e696756156d8ecc087e20817e2/matplotlib-3.1.1-cp36-cp36m-manylinux1_x86_64.whl (13.1MB)
[K     |████████████████████████████████| 

In [3]:
import torch
import torch.nn as nn
from torch.distributions import Categorical
import gym
import torch.nn.functional as F

# Use the first CUDA GPU when torch reports one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ActorCritic(nn.Module):
  """Actor and critic MLPs bundled in one module (no shared weights).

  Both networks take the same observation and have two tanh hidden layers.
  The actor ends in one unnormalised score (logit) per action; the critic
  ends in a single state-value estimate.

  Args:
    input_dims: size of the observation vector.
    n_actions: number of discrete actions (width of the actor output).
    fc1_dims: width of the first hidden layer of both networks.
    fc2_dims: width of the second hidden layer of both networks.
  """

  def __init__(self,input_dims,n_actions,fc1_dims,fc2_dims):
    super().__init__()

    # Actor hidden layers.
    self.fc1_action = nn.Linear(input_dims,fc1_dims)
    self.fc2_action = nn.Linear(fc1_dims,fc2_dims)

    # Critic hidden layers.
    self.fc1_value = nn.Linear(input_dims,fc1_dims)
    self.fc2_value = nn.Linear(fc1_dims,fc2_dims)

    # Output heads: one score per action, and a scalar state value.
    self.action_layer = nn.Linear(fc2_dims,n_actions)
    self.value_layer = nn.Linear(fc2_dims,1)

    # `device` is the notebook-level torch.device.
    self.to(device)

  def forward(self,observation):
    """Compute action logits and the state value for `observation`.

    Args:
      observation: either a torch.Tensor, which is used as-is (so it must
        already have a floating dtype and live on the module's device), or
        anything `torch.as_tensor` accepts (NumPy array, list, ...), which
        is converted to float32 on the module's device.

    Returns:
      (action_logits, state_value): raw, un-softmaxed action scores with
      last dimension `n_actions`, and the critic output with last
      dimension 1. Callers apply the softmax themselves.
    """
    if isinstance(observation, torch.Tensor):
      state = observation
    else:
      # Explicit conversion instead of a try/bare-except, so genuine input
      # errors surface here. The target device is read from the parameters,
      # which keeps this correct if the module is moved after construction.
      state = torch.as_tensor(observation, dtype=torch.float32,
                              device=self.fc1_action.weight.device)

    actor_hidden = torch.tanh(self.fc1_action(state))
    actor_hidden = torch.tanh(self.fc2_action(actor_hidden))
    action_logits = self.action_layer(actor_hidden)

    critic_hidden = torch.tanh(self.fc1_value(state))
    critic_hidden = torch.tanh(self.fc2_value(critic_hidden))
    state_value = self.value_layer(critic_hidden)

    return action_logits,state_value


class PPO(object):
  """Clipped-surrogate PPO agent for discrete action spaces.

  Holds two `ActorCritic` networks: `policy`, which is optimised, and
  `policy_old`, a copy used to collect experience and re-synchronised at the
  end of every `update`. Transitions are buffered in plain lists: `act` fills
  `states`, `actions` and `logprobs`; the caller appends to `rewards` and
  `is_terminals` after each environment step.

  Args:
    input_dims, n_actions, fc1_dims, fc2_dims: forwarded to `ActorCritic`.
    lr: Adam learning rate.
    betas: Adam beta coefficients.
    gamma: discount factor used for the Monte Carlo returns.
    K_epochs: number of gradient steps taken per `update` call.
    eps_clip: half-width of the clipping interval around a ratio of 1.
  """

  def __init__(self,input_dims,n_actions,fc1_dims,fc2_dims,lr,betas,gamma,K_epochs,eps_clip):
    self.lr = lr
    self.betas = betas
    self.gamma = gamma
    self.eps_clip = eps_clip
    self.K_epochs = K_epochs

    self.policy = ActorCritic(input_dims,n_actions,fc1_dims,fc2_dims)
    self.optimizer = torch.optim.Adam(self.policy.parameters(),lr = lr,betas=betas)
    self.policy_old = ActorCritic(input_dims,n_actions,fc1_dims,fc2_dims).to(device)
    # Start both networks from identical weights.
    self.policy_old.load_state_dict(self.policy.state_dict())

    self.MseLoss = nn.MSELoss()

    # Rollout buffer (one entry per environment step).
    self.actions = []
    self.states = []
    self.logprobs = []
    self.rewards = []
    self.is_terminals = []

  def clear_memory(self):
    """Empty every rollout list in place (existing references stay valid)."""
    for buffer in (self.actions, self.states, self.logprobs,
                   self.rewards, self.is_terminals):
      del buffer[:]

  def act(self,state):
    """Sample an action from `policy_old` and record the step.

    Appends the state (as a float32 tensor), the sampled action and its
    log-probability to the rollout buffer.

    Returns:
      The sampled action as a Python int.
    """
    # torch.tensor copies, so the buffered state never aliases the caller's array.
    state = torch.tensor(state, dtype=torch.float32, device=device)

    # Pure data collection: the stored values are only ever used detached,
    # so skip building an autograd graph.
    with torch.no_grad():
      action_logits,_ = self.policy_old(state)
      dist = Categorical(logits=action_logits)
      action = dist.sample()
      logprob = dist.log_prob(action)

    self.states.append(state)
    self.actions.append(action)
    self.logprobs.append(logprob)

    return action.item()

  def evaluate(self,state,action):
    """Score `action` under the current `policy`.

    Returns:
      (action_logprobs, state_values, dist_entropy). The trailing size-1
      dimension of the critic output is removed; the batch dimension is
      kept even for a batch of one.
    """
    action_logits,state_value = self.policy(state)
    # `logits=` normalises over the last dimension, i.e. the same
    # distribution as softmax followed by Categorical(probs), without the
    # implicit-dim softmax call.
    dist = Categorical(logits=action_logits)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()

    return action_logprobs, state_value.squeeze(-1), dist_entropy

  def _discounted_returns(self):
    """Return the discounted return for each buffered reward as a list.

    The running return is reset wherever `is_terminals` is true, so rewards
    after an episode boundary do not contribute to earlier steps.
    """
    returns = []
    running_return = 0
    for reward, is_terminal in zip(reversed(self.rewards), reversed(self.is_terminals)):
      if is_terminal:
        running_return = 0
      running_return = reward + (self.gamma * running_return)
      returns.append(running_return)
    # Built back-to-front with O(1) appends; flip into chronological order.
    returns.reverse()
    return returns

  def update(self):
    """Run `K_epochs` PPO gradient steps on the buffered rollout.

    Afterwards `policy_old` is synchronised with `policy` and the buffer is
    cleared. Does nothing when no transitions have been collected.
    """
    if not self.states:
      return

    rewards = torch.tensor(self._discounted_returns(), dtype=torch.float32).to(device)
    # Standardise the returns; with a single sample the standard deviation
    # is undefined (NaN), so leave the value untouched in that case.
    if rewards.numel() > 1:
      rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

    old_states = torch.stack(self.states).to(device).detach()
    old_actions = torch.stack(self.actions).to(device).detach()
    old_logprobs = torch.stack(self.logprobs).to(device).detach()

    for _ in range(self.K_epochs):

      logprobs, state_values, dist_entropy = self.evaluate(old_states, old_actions)

      # pi_theta(a|s) / pi_theta_old(a|s), computed in log space.
      ratios = torch.exp(logprobs - old_logprobs)

      advantages = rewards - state_values.detach()
      surr1 = ratios * advantages
      surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
      # Clipped policy loss + 0.5 * value MSE - 0.01 * entropy bonus.
      loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

      self.optimizer.zero_grad()
      loss.mean().backward()
      self.optimizer.step()

    self.policy_old.load_state_dict(self.policy.state_dict())
    self.clear_memory()

In [4]:
def main():
    """Train a PPO agent on LunarLander-v2, printing progress to stdout.

    Runs up to `max_episodes` episodes of at most `max_timesteps` steps each,
    calling `ppo.update()` every `update_timestep` environment steps. The
    score is printed after every episode and averages every `log_interval`
    episodes. Training stops early, saving the policy weights to
    ./PPO_<env_name>.pth, once the reward accumulated since the last log
    reset exceeds `log_interval * solved_reward`.
    """
    ############## Hyperparameters ##############
    env_name = "LunarLander-v2"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n     # read from the env instead of hard-coding 4
    render = False
    solved_reward = 230         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 50000        # max training episodes
    max_timesteps = 500        # max timesteps in one episode
    fc1_dims = 64
    fc2_dims = 64         # number of variables in hidden layer
    update_timestep = 2000      # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    #############################################

    # `is not None` so that a seed of 0 is honoured too.
    if random_seed is not None:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO(state_dim, action_dim,fc1_dims,fc2_dims, lr, betas, gamma, K_epochs, eps_clip)
    print(lr,betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    try:
        # training loop
        for i_episode in range(1, max_episodes+1):
            state = env.reset()
            score = 0
            episode_length = 0
            for t in range(max_timesteps):
                timestep += 1
                episode_length = t + 1

                # Running policy_old:
                action = ppo.act(state)
                state, reward, done, _ = env.step(action)
                score += reward
                ppo.rewards.append(reward)
                # Flag the step-cap cut-off as an episode boundary as well;
                # otherwise the discounted return of this episode's last
                # steps would absorb rewards from the next episode.
                hit_step_cap = t == max_timesteps - 1
                ppo.is_terminals.append(done or hit_step_cap)

                # update if its time (update() empties the rollout buffer itself)
                if timestep % update_timestep == 0:
                    ppo.update()
                    timestep = 0

                running_reward += reward
                if render:
                    env.render()
                if done:
                    break

            # Number of steps actually taken (t is a zero-based index).
            avg_length += episode_length
            print('Episode {} \t Score: {}'.format(i_episode,score))

            # stop training if avg_reward > solved_reward
            if running_reward > (log_interval*solved_reward):
                print("########## Solved! ##########")
                torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
                break

            # logging
            if i_episode % log_interval == 0:
                avg_length = int(avg_length/log_interval)
                running_reward = int((running_reward/log_interval))

                print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
                running_reward = 0
                avg_length = 0
    finally:
        # Release the environment even if training is interrupted.
        env.close()

if __name__ == '__main__':
    main()

0.002 (0.9, 0.999)
Episode 1 	 Score: -475.7356605651851




Episode 2 	 Score: -140.35056769254385
Episode 3 	 Score: -90.74914708313892
Episode 4 	 Score: -193.4951918784476
Episode 5 	 Score: -372.2508790111144
Episode 6 	 Score: -117.39304443534395
Episode 7 	 Score: -293.1968228627816
Episode 8 	 Score: -148.6676720449912
Episode 9 	 Score: -40.43094540073869
Episode 10 	 Score: -116.02562135909241
Episode 11 	 Score: -112.46168258357555
Episode 12 	 Score: -121.13273030912292
Episode 13 	 Score: -354.8056201415535
Episode 14 	 Score: -323.6678175097352
Episode 15 	 Score: 12.102133884687944
Episode 16 	 Score: -118.39611273729581
Episode 17 	 Score: -85.35353393972426
Episode 18 	 Score: -122.59928219918349
Episode 19 	 Score: -141.68549328263776
Episode 20 	 Score: -130.6229210581168
Episode 20 	 avg length: 88 	 reward: -174
Episode 21 	 Score: -163.21181892241248




Episode 22 	 Score: -116.97699733872147
Episode 23 	 Score: -249.34750500295308
Episode 24 	 Score: -150.65575706307698
Episode 25 	 Score: -221.3340311957482
Episode 26 	 Score: -154.208109796072
Episode 27 	 Score: -182.43813906042243
Episode 28 	 Score: -355.9410288684329
Episode 29 	 Score: -113.85082379388663
Episode 30 	 Score: -137.29268996860299
Episode 31 	 Score: -65.63572092396012
Episode 32 	 Score: -168.665926013045
Episode 33 	 Score: -125.07257419683921
Episode 34 	 Score: -121.09132118396681
Episode 35 	 Score: -123.18350033292603
Episode 36 	 Score: -315.65999825996437
Episode 37 	 Score: -100.35840255126111
Episode 38 	 Score: -205.88547379380026
Episode 39 	 Score: -192.13872376723495
Episode 40 	 Score: -103.51307569436791
Episode 40 	 avg length: 93 	 reward: -168
Episode 41 	 Score: -125.95466034513775
Episode 42 	 Score: -136.79008809153186
Episode 43 	 Score: -191.16415752717097
Episode 44 	 Score: -84.85874315575708
Episode 45 	 Score: -9.674244583770388
Episod