<a href="https://colab.research.google.com/github/DanielOe21/Tutorial-Actor-critic/blob/main/PolicyGradients_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies to Render OpenAI Gym Environment

In [1]:
#pip install --upgrade tensorflow==2.8

In [2]:
# Run this asap since it takes 30 seconds
%%capture
!pip install pyglet==1.3.2
!pip install box2d-py
!pip install gym pyvirtualdisplay
!apt-get install -y xvfb python-opengl ffmpeg
!pip install tensorflow==2.8.*
!pip install --upgrade tensorflow-probability
import gym
from gym.wrappers import Monitor
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import random
import glob
import io
import time
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Create a pyvirtualdisplay Display with visible=0 and start it.
display = Display(visible=0, size=(1400, 900))
# The trailing ';' keeps the return value of start() out of the cell output.
display.start();

# Helper functions to visualize the performance of the agent

In [3]:
def show_video():
  """Embed the first recorded ``video/*.mp4`` file in the notebook output.

  The matching files are sorted by name and the first one is base64-encoded
  and rendered as an HTML ``<video>`` element through ``ipythondisplay``.
  When no ``.mp4`` file exists, "Video not found" is printed instead.
  """
  # Sort the matches: glob order depends on the filesystem, so without this
  # the video that gets shown could differ from run to run.
  mp4list = sorted(glob.glob('video/*.mp4'))
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the handle is closed
    # right away and no write permission on the file is required.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    # Inline the whole clip as a data URI so the output is self-contained.
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Video not found")
    
def wrap_env(env):
  """Return `env` wrapped in a `Monitor` targeting the './video' directory.

  The wrapper is constructed with `force=True`.
  """
  return Monitor(env, './video', force=True)

#### Check that there is a GPU available

In [4]:
# List the GPUs TensorFlow can see, using the stable tf.config API rather
# than the older `experimental` alias, and report how many were found.
gpu_list = tf.config.list_physical_devices('GPU')
print('Number of GPUS available is {}'.format(len(gpu_list)))

Number of GPUS available is 1


# Start the Environment and Build the Policy Gradient Agent

In [5]:
# Build the monitored Acrobot environment.
env = wrap_env(gym.make('Acrobot-v1'))

# Problem dimensions: size of the discrete action set and length of the
# observation vector. `num_actions` sizes the policy network's output layer.
num_actions = env.action_space.n
num_features = env.observation_space.shape[0]

print('Number of state features: {}'.format(num_features))
print('Number of possible actions: {}'.format(num_actions))

Number of state features: 6
Number of possible actions: 3


In [6]:
# Create Neural Network for Policy Gradient-based Agent
class Network(tf.keras.Model):
  """Policy network: a 32-unit ReLU layer followed by a linear logits layer.

  The logits layer has `num_actions` units; `num_actions` is read from the
  notebook namespace when the network is constructed.
  """

  def __init__(self):
    super().__init__()
    self.dense1 = tf.keras.layers.Dense(32, activation='relu')
    self.out = tf.keras.layers.Dense(num_actions)
    # Distribution class instantiated from the logits on every forward pass.
    self.dist = tfp.distributions.Categorical

  def call(self, x):
    """Run the policy on `x`.

    Returns:
      A tuple `(logits, action, probs, log_probs)`: the raw logits, an action
      sampled from a categorical distribution over those logits, and the
      softmax / log-softmax of the logits along the last axis.
    """
    logits = self.out(self.dense1(x))
    probs = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    action = self.dist(logits=logits).sample()
    return logits, action, probs, log_probs

# Instantiate the policy network and the Adam optimizer that updates it.
LEARNING_RATE = 2e-2
net = Network()
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Set Up Function to Perform a Training Step

In [7]:
@tf.function
def train_step(batch_states, batch_actions, batch_returns):
  """Apply one policy-gradient update to `net` and return the loss.

  Args:
    batch_states: batch of states passed to `net`.
    batch_actions: integer action indices, one-hot encoded against
      `num_actions` to pick out the log-probability of each taken action.
    batch_returns: per-sample weights multiplied with those log-probabilities.

  Returns:
    The loss, i.e. the negated mean of `batch_returns` times the
    log-probability of the taken action.
  """
  with tf.GradientTape() as tape:
    # Only the log-probabilities are needed; the other outputs are discarded.
    _, _, _, log_probs = net(batch_states)
    taken = tf.one_hot(batch_actions, num_actions)
    taken_log_probs = tf.reduce_sum(taken * log_probs, axis=-1)
    weighted = batch_returns * taken_log_probs
    loss = -tf.reduce_mean(weighted)
  variables = net.trainable_variables
  grads = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(grads, variables))
  return loss

# Start running the algorithm and see how it learns

In [8]:
# Total number of episodes the training loop plays.
num_episodes = 1000 # @param {type:"integer"}
# Every this many episodes the environment is closed and re-created.
viz_update_freq = 50 # @param {type: "integer"}
# Minimum number of collected environment steps before a training step runs.
steps_per_train_step = 5000 # @param {type: "integer"}

In [9]:
# On-policy training loop: play episodes with the current policy, and once at
# least `steps_per_train_step` steps are collected run one policy-gradient
# update on them, report progress, and start a fresh batch.
# `last_100_ep_ret` is not used below; it is kept so the name stays defined.
last_100_ep_ret, text = [], ''
batch_states, batch_actions, batch_returns = [], [], []
# One entry per episode in the current batch, used only for reporting.
batch_episode_returns = []
try:
  for episode in range(num_episodes):
    if episode % viz_update_freq == 0: # Needed for updating the visualization.
      env.close()
      env = wrap_env(gym.make('Acrobot-v1'))

    # Start a new episode and reset the environment.
    state = env.reset()
    done, ep_rew = False, []
    while not done:
      state_in = np.expand_dims(state, 0)
      # Sample action from policy and take that action in the env.
      logits, action, probs, log_probs = net(state_in)
      # Convert the sampled action once and store a plain int, so the batch
      # is a list of ints rather than a list of TensorFlow tensors.
      action_taken = int(action[0].numpy())
      next_state, reward, done, info = env.step(action_taken)
      batch_states.append(state)
      batch_actions.append(action_taken)
      ep_rew.append(reward)
      state = next_state

    # Create episode returns for policy gradient step: every step of the
    # episode is weighted by the episode's total return.
    episode_ret = sum(ep_rew)
    episode_len = len(ep_rew)
    batch_returns += [episode_ret] * episode_len
    batch_episode_returns.append(episode_ret)

    # Keep collecting experience with the current policy.
    if len(batch_states) >= steps_per_train_step:
      # Now that we have enough experience for this policy, train it on-policy.
      loss = train_step(np.array(batch_states),
                        np.array(batch_actions, dtype=np.int32),
                        np.array(batch_returns, dtype=np.float32))
      # Print the performance of the policy. The reported return is the mean
      # over episodes; averaging `batch_returns` would weight each episode by
      # its length because it holds one entry per step.
      ipythondisplay.clear_output()
      text += f"Episode: {episode}, Loss: {loss:.2f}, "\
              f"Return: {np.mean(batch_episode_returns):.2f}\n"
      print(text)
      print('Current agent performance:')
      show_video()
      batch_states, batch_actions, batch_returns = [], [], []
      batch_episode_returns = []
finally:
  # Close the environment even when training is interrupted.
  env.close()

Episode: 9, Loss: -543.28, Return: -500.00
Episode: 19, Loss: -538.82, Return: -500.00
Episode: 29, Loss: -522.80, Return: -500.00
Episode: 39, Loss: -506.49, Return: -500.00
Episode: 49, Loss: -492.96, Return: -500.00
Episode: 59, Loss: -481.03, Return: -500.00
Episode: 69, Loss: -478.06, Return: -500.00
Episode: 79, Loss: -478.44, Return: -500.00
Episode: 89, Loss: -458.65, Return: -500.00
Episode: 99, Loss: -466.19, Return: -500.00
Episode: 109, Loss: -443.51, Return: -500.00
Episode: 119, Loss: -441.46, Return: -500.00
Episode: 129, Loss: -420.90, Return: -500.00
Episode: 139, Loss: -418.42, Return: -500.00
Episode: 149, Loss: -422.98, Return: -500.00
Episode: 160, Loss: -371.34, Return: -490.66
Episode: 170, Loss: -364.83, Return: -500.00
Episode: 180, Loss: -391.02, Return: -500.00
Episode: 190, Loss: -387.84, Return: -500.00
Episode: 200, Loss: -391.58, Return: -500.00
Episode: 210, Loss: -394.69, Return: -500.00
Episode: 220, Loss: -403.55, Return: -500.00
Episode: 230, Loss: -

KeyboardInterrupt: ignored

# Visualize performance of fully trained agent
### Run multiple times to play the game again and display the result.

In [None]:
# Play a single episode with the current policy in a freshly wrapped
# environment, then print the episode return and show the recorded video.
env = wrap_env(gym.make('Acrobot-v1'))
state = env.reset()
ret, done = 0, False
while not done:
  env.render()
  # Add a leading batch axis before feeding the state to the network.
  state_in = tf.expand_dims(state, axis=0)
  logits, action, probs, log_probs = net(state_in)
  state, reward, done, info = env.step(action[0].numpy())
  ret += reward
env.close()
print('Return on this episode: {}'.format(ret))
show_video()