In [1]:
import tensorflow as tf
import numpy as np
import gym
import os

  from ._conv import register_converters as _register_converters


# Tensorflow PPO

Begin by selecting an environment. In this case LunarLander v2.

Gather from the environment the state and action dimensions required.

Set up some constants that will go into our PPO implementation.

In [2]:
env = gym.make('LunarLander-v2')
sdim = env.observation_space.shape[0] #state dimension
adim = env.action_space.n #action space dimension
LR = 0.00025         #learning rate
CLIP = 0.2           #clip parameter
GAMMA = 0.99         #discount gamma
LAYERSIZE = 64       #layer size of networks
vscale = 0.5         #scale constant for value function loss
escale = 0.03        #scale constant for entropy loss
print('action',adim, ' state',sdim)

action 4  state 8


The following block is a ppo class written in tensorflow.

References:
    - https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py
    - https://github.com/DavidCastilloAlvarado/PPO_reinforcement_learning/blob/master/PPO_pendulum.py
    - https://blog.varunajayasiri.com/ml/ppo_pytorch.html
    - (paper) https://arxiv.org/abs/1707.06347
    
   

In [24]:
class PPO:
    def __init__(self):
        
        #----------------------------------------------------------––#
        # input tensors
        tf.reset_default_graph()
        
        self.session = tf.Session()
        
        self.state = tf.placeholder(dtype=tf.float32, shape = [None, sdim], name='state')
        
        self.action = tf.placeholder(dtype=tf.float32, shape = [None], name='actions')
        
        self.advantage = tf.placeholder(dtype=tf.float32, shape = [None], name='advantages')
        
        self.reward = tf.placeholder(dtype=tf.float32, shape = [None], name='values')
        
        self.old_logprob = tf.placeholder(dtype=tf.float32, shape = [None], name='old_action_logprobs')
        
        #----------------------------------------------------------––#
        # neural networks
        with tf.variable_scope("V_net"):
            # neural network for value function estimation
            h1 = tf.layers.dense(self.state,LAYERSIZE,activation=tf.nn.tanh)
            h2 = tf.layers.dense(h1,LAYERSIZE,activation=tf.nn.tanh)
            self.v = tf.squeeze(tf.layers.dense(h2, 1))
        
        with tf.variable_scope('policy_net'):
            # neural network for policy distribution and output
            h1 = tf.layers.dense(self.state,LAYERSIZE,activation=tf.nn.tanh)
            h2 = tf.layers.dense(h1,LAYERSIZE,activation=tf.nn.tanh)
            self.p_out = tf.nn.softmax(tf.layers.dense(h2, adim),axis=-1)  #output policy probabilities
            self.p_dist = tf.distributions.Categorical(probs=self.p_out)   #converts these into a distribution type
            self.p_action = tf.squeeze(self.p_dist.sample(1))              #samples action from distribution
        
        #given an action and the current network params, what is its probability?
        self.a_logprob = self.p_dist._log_prob(self.action) 

        #----------------------------------------------------------––#
        # PPO loss
        
        self.diff = self.a_logprob - self.old_logprob
        self.ratio = tf.exp(self.diff)
        self.unclipped = self.ratio * self.advantage
        self.clipped = tf.clip_by_value(self.ratio, 1. - CLIP, 1. + CLIP) * self.advantage
        self.minclip = tf.minimum(self.clipped,self.unclipped)
        
        self.ploss = -1 * self.minclip                         # L_clip (note sign)

        self.eloss = -1 * self.p_dist.entropy() * escale       # L_entropy (note sign)
        
        self.verr = tf.square(self.reward - self.v)            # L_value
        self.vloss = self.verr * vscale
        
        self.policy_loss = self.ploss + self.eloss
        
        self.loss = self.ploss + self.eloss + self.vloss       # L = L_clip + L_entropy + L_value
        
        self.policy_train_op = tf.train.AdamOptimizer(LR).minimize(self.policy_loss)      # train policy
        
        self.value_train_op = tf.train.AdamOptimizer(LR).minimize(self.vloss)             # train value
        
        self.train_op = tf.train.AdamOptimizer(LR).minimize(self.loss)                    # train everything
        
        self.session.run(tf.global_variables_initializer())
        
        self.saver = tf.train.Saver()

        
    def check(self,states,actions,rewards,advantages,old_logprobs):
        """
        Used to check some outputs of the networks
        """
        return self.session.run([
            self.p_out,self.v,self.verr,self.minclip
        ],
                                {
                                    self.state:states,
                                     self.reward:rewards,
                                     self.advantage:advantages,
                                     self.action:actions,
                                     self.old_logprob:old_logprobs
        })

    def get_action(self,states):
        """
        Given a state (or collection of states) and current state of networks, output an action sampled from the
        resulting distribution
        """
        return self.session.run(self.p_action, {self.state:states})
    
    def get_action_prob(self,states, actions):
        """
        Returns the probability of taking the action given the state.
        """
        return self.session.run(self.a_logprob, {self.state:states, self.action:actions})
    
    def get_value(self,states):
        """
        Returns the estimated value of a state
        """
        return self.session.run(self.v, {self.state:states})
    
    def __train_policy(self, states, actions, advantages, old_logprobs):
        """
        Train the policy only
        """
        self.session.run(self.policy_train, {self.state:states, self.action:actions, self.advantage:advantages,
                                            self.old_logprob:old_logprobs})
        
    def __train_value(self, states, rewards):
        """
        Train the value estimator only
        """
        self.session.run(self.value_train, {self.state:states, self.reward:rewards})
        
    def __train(self, states, actions, rewards, advantages, old_logprobs):
        """
        Train the ppo loss
        L_clip + L_entropy + L_value
        """
        self.session.run(self.train_op, {
            self.state:states,
            self.action:actions,
            self.advantage:advantages,
            self.old_logprob:old_logprobs,
            self.reward:rewards
        })
        
    def get_policy_loss(self,states,actions,advantages,old_logprobs):
        """
        Given a bunch of states, actions, advantages and old_action probabilities, 
        returns the policy loss L_clip+L_entropy
        """
        return self.session.run(self.policy_loss, {self.state:states,self.action:actions,
                                           self.advantage:advantages, self.old_logprob:old_logprobs})
        
    def get_value_loss(self,states, rewards):
        """
        Given states and rewards, returns the value loss (L_value)
        """
        return self.session.run(self.vloss, {self.state:states,self.reward:rewards})
    
    @staticmethod
    def discount(rewards, dones, norm=True):
        """
        Static method: given some rewards, and dones (a list of true/falses indicating when an episode ended)
        it returns the discounted rewards according to GAMMA
        """
        drs = []
        discounted_reward = 0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                discounted_reward = 0
            discounted_reward = reward + (GAMMA * discounted_reward)
            drs.insert(0, discounted_reward)
        drs = np.squeeze(np.array(drs))
        if norm:
            drs = (drs - np.mean(drs)) / (np.std(drs) + 1e-8)
        return drs
    
    @staticmethod
    def batcher(iterable, n=1):
        """static method, creates batches out of lists"""
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]
            
    def save(self, path='/tmp', name='model.ckpt', verbose=False):
        """
        Save the model in target path/folder
        """
        if '.ckpt' not in name:
            name = '{}.ckpt'.format(name)
        self.saver.save(self.session, os.path.join(path, name))
        if verbose:
            print('saved in {}/{}'.format(path, name))

    def restore(self, path='/tmp', name='model.ckpt', verbose=False):
        """
        Restore model from target path
        """
        self.saver.restore(self.session, os.path.join(path, name))
        if verbose:
            print('model restored from {}/{}'.format(path, name))

        
    def train(self, states, actions, rewards, dones, train_iters, minibatch):
        """
        Main method:
        Trains the ppo loss.
        
        Requires:
            - train_iters: number of training loops
            - minibatch (commented out): minibatches to run adam with.
            - states, actions, rewards and dones (for loss)
        
        Before training we do the following:
            1) Using rewards and dones we determine the discounted rewards.
            2) Using states and actions we get the log probabilities of the action given the state 
               given the network BEFORE IT TRAINS. These are our 'old' probabilities
        While we train we do:
            1) get estimated values given states
            2) use discounted rewards and these estiamted values to get an advantage.
            3) train with states, actions, rewards, advantages and old probabilities
            
        Returns the losses after training occurred.
        
        """
        discounted_rewards = self.discount(rewards,dones)
        old_logprobs = self.get_action_prob(states,actions)
        vlosses = []
        plosses = []
        for i in range(train_iters):
            values = self.get_value(states)
            advantages = discounted_rewards - values
#             for batch_idx in self.batcher(range(0, actions.shape[0]), minibatch):
#                 state_batch = states[batch_idx]
#                 action_batch = actions[batch_idx]
#                 old_logprob_batch = old_logprobs[batch_idx]
#                 advantage_batch = advantages[batch_idx]
#                 reward_batch = discounted_rewards[batch_idx]
#                 self.__train(state_batch,action_batch,reward_batch,advantage_batch,old_logprob_batch)
            self.__train(states,actions,rewards,advantages,old_logprobs)
            vlosses.append(self.get_value_loss(states,discounted_rewards))
            plosses.append(self.get_policy_loss(states,actions,advantages,old_logprobs))
        return np.mean(vlosses), np.mean(plosses)
        

Here we run the main loop.

In [5]:
#create env
env = gym.make('LunarLander-v2')

#set some params
iters = 100                      # sampling iterations
episode_per_iter = 300           # number of episodes to run for each iteration
max_episode_steps = 300          # maximum steps allowed per episode
minibatch = 32                   # minibatch size
train_iters = 4                  # n of training loops per iteration

# start ppo
ppo = PPO()

for i in range(iters):
    rewards = []      # all rewards this iter
    dones = []        # all dones this iter
    actions = []      # all actions this iter
    states = []       # all states this iter
    
    e_lengths = []    # keeps track of this iteration's episode lengths
    e_rewards = []    # keep track of this iteration's episode rewards
    for e in range(episode_per_iter):
        # run episode
        state = env.reset().reshape(1,-1)  #reset
        done = False
        el = 0                                                    #total episode length
        er = 0                                                    #total episode reward
        while not done:
            el +=1                                                
            action = ppo.get_action(state)                        # sample action from ppo net
            next_state, reward, done, _ = env.step(action)        # run a step in the env
            er += reward
            states.append(state)                                  # record old state
            state = next_state.reshape(1,-1)                      # set new state as state
            actions.append(action)                                # record action taken
            rewards.append(reward)                                # record reward obtained
            if el > max_episode_steps:
                done = True                                       # if we exceed max allowed steps: stop
            dones.append(done)
        e_lengths.append(el)
        e_rewards.append(er)
            
    
    
    states = np.squeeze(np.array(states))
    actions = np.squeeze(np.array(actions))
    
    #train
    vloss,ploss = ppo.train(states, actions, rewards, dones, train_iters, minibatch)

    #print progress
    print('iter: {0} | Ploss: {1:.4f} | Vloss: {2:.4f} | avg R: {3:.2f} | Best R: {4:.2f} | Avg episode Length: {5:.2f}'.format(
        i,
        np.nan if not ploss else ploss,
        np.nan if not vloss else vloss,
        np.mean(e_rewards),
        np.max(e_rewards),
        np.mean(e_lengths)
        
    ))

Instructions for updating:
keep_dims is deprecated, use keepdims instead
iter: 0 | Ploss: -0.0717 | Vloss: 0.4431 | avg R: -333.20 | Best R: 45.02 | Avg episode Length: 87.50
iter: 1 | Ploss: -0.1588 | Vloss: 0.4689 | avg R: -279.69 | Best R: 30.89 | Avg episode Length: 85.72
iter: 2 | Ploss: -0.2464 | Vloss: 0.4966 | avg R: -254.80 | Best R: 14.82 | Avg episode Length: 83.16
iter: 3 | Ploss: -0.3386 | Vloss: 0.5296 | avg R: -222.29 | Best R: 137.63 | Avg episode Length: 85.04
iter: 4 | Ploss: -0.4252 | Vloss: 0.5788 | avg R: -212.34 | Best R: 39.35 | Avg episode Length: 84.66
iter: 5 | Ploss: -0.5071 | Vloss: 0.6194 | avg R: -193.99 | Best R: 38.93 | Avg episode Length: 85.24
iter: 6 | Ploss: -0.5825 | Vloss: 0.6688 | avg R: -179.00 | Best R: 35.39 | Avg episode Length: 86.02
iter: 7 | Ploss: -0.6490 | Vloss: 0.7086 | avg R: -173.99 | Best R: 36.54 | Avg episode Length: 86.83
iter: 8 | Ploss: -0.7166 | Vloss: 0.7557 | avg R: -168.03 | Best R: 49.01 | Avg episode Length: 86.63
iter: 9 

iter: 77 | Ploss: -0.0391 | Vloss: 1.0777 | avg R: 46.55 | Best R: 173.43 | Avg episode Length: 210.37
iter: 78 | Ploss: -0.0483 | Vloss: 1.1035 | avg R: 52.96 | Best R: 187.05 | Avg episode Length: 216.65
iter: 79 | Ploss: -0.0457 | Vloss: 1.0807 | avg R: 54.48 | Best R: 200.74 | Avg episode Length: 220.27
iter: 80 | Ploss: -0.0475 | Vloss: 1.0728 | avg R: 59.79 | Best R: 200.58 | Avg episode Length: 228.67
iter: 81 | Ploss: 0.0364 | Vloss: 1.0152 | avg R: 53.54 | Best R: 201.94 | Avg episode Length: 213.19
iter: 82 | Ploss: 0.0322 | Vloss: 1.0024 | avg R: 61.60 | Best R: 189.15 | Avg episode Length: 228.46
iter: 83 | Ploss: 0.0461 | Vloss: 0.9704 | avg R: 63.57 | Best R: 201.56 | Avg episode Length: 226.65
iter: 84 | Ploss: 0.1081 | Vloss: 0.9389 | avg R: 56.00 | Best R: 187.77 | Avg episode Length: 220.31
iter: 85 | Ploss: 0.1473 | Vloss: 0.9727 | avg R: 54.15 | Best R: 194.89 | Avg episode Length: 212.17
iter: 86 | Ploss: 0.1515 | Vloss: 0.9618 | avg R: 53.27 | Best R: 173.00 | Avg

#### From the logs its clear that learning is occurring. 

Average full-episode-reward per iteration started around -200 and finished after 100 iters at ~+65

Algorithm can easily be improved by changing the hyper parametes

Also, the value estimator and policy networks can be changed to use the same hidden layers.

Below you can use code to see how the algorithm performs in the environment

In [None]:
from gym import wrappers

env = gym.make('LunarLander-v2')
env = wrappers.Monitor(env, "./gym-results", force=True)
state = env.reset()
episode_steps = 0
episode_reward = 0
done = False
actions = []
rewards = []
c = 0
while not done:
    state = state.reshape(1,-1)
    action = ppo.get_action(state)
    actions.append(action)
    new_state,reward,done,_ = env.step(action)
    rewards.append(reward)
    episode_reward += reward
    state = new_state
    if episode_steps > 200:
        done = True

print(episode_reward)
env.close()


In [16]:
import io
import base64
from IPython.display import HTML

video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))