# Install Dependencies to Render OpenAI Gym Environment

In [1]:
# Run this asap since it takes 30 seconds.
%%capture
!apt-get update
!pip install pyglet
!pip install gym pyvirtualdisplay
!pip install xvfbwrapper
!apt-get install -y xvfb python-opengl ffmpeg
!pip install tensorflow==2.1.* 


In [2]:
import gym

from gym.wrappers.record_video import RecordVideo
from collections import deque
import tensorflow as tf
import numpy as np
import random
import math
import time
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Start an invisible (visible=0) 1400x900 virtual display; presumably this is
# what lets the gym environment render without a physical screen.
display = Display(
    visible=0,
    size=(1400, 900),
)
display.start()

<pyvirtualdisplay.display.Display at 0x7f97e0a71310>

# Build DQN Agent and Helper Functions

In [3]:
# Create the CartPole environment, then read the number of discrete actions
# and the length of the observation vector from its spaces.
env = gym.make('CartPole-v1')
num_actions = env.action_space.n
num_features = env.observation_space.shape[0]
print('Number of state features: {}'.format(num_features))
print('Number of possible actions: {}'.format(num_actions))

Number of state features: 4
Number of possible actions: 2


  and should_run_async(code)
  deprecation(
  deprecation(


In [4]:
class DQN(tf.keras.Model):
  """Fully connected Q-network.

  Two 32-unit ReLU hidden layers feed a linear output layer with
  `num_actions` units (`num_actions` is read from the notebook's global
  scope when the model is constructed).
  """

  def __init__(self):
    super().__init__()
    hidden_units = 32
    self.dense1 = tf.keras.layers.Dense(hidden_units, activation="relu")
    self.dense2 = tf.keras.layers.Dense(hidden_units, activation="relu")
    # Output layer has no activation: it emits one unbounded value per action.
    self.dense3 = tf.keras.layers.Dense(num_actions, dtype=tf.float32)

  def call(self, x):
    """Run `x` through both hidden layers and return the output layer's values."""
    hidden = self.dense2(self.dense1(x))
    return self.dense3(hidden)

# Two separately constructed networks: `main_nn` is the one whose variables
# the optimizer updates in train_step, `target_nn` supplies the next-state
# values for the targets and is overwritten with main_nn's weights periodically.
main_nn = DQN()
target_nn = DQN()

# Adam with a 1e-4 learning rate, and a mean-squared-error loss object.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
mse = tf.keras.losses.MeanSquaredError()

In [5]:
class ReplayBuffer(object):
  """Experience replay buffer that samples uniformly.

  Transitions are kept as (state, action, reward, next_state, done) tuples in
  a bounded deque, so once `size` transitions are stored each new one evicts
  the oldest.
  """
  def __init__(self, size):
    """Create an empty buffer holding at most `size` transitions."""
    self.buffer = deque(maxlen=size)

  def add(self, state, action, reward, next_state, done):
    """Append one transition to the buffer."""
    self.buffer.append((state, action, reward, next_state, done))

  def __len__(self):
    """Return the number of transitions currently stored."""
    return len(self.buffer)

  def sample(self, num_samples):
    """Draw `num_samples` transitions uniformly at random, with replacement.

    Returns:
      A tuple `(states, actions, rewards, next_states, dones)` of NumPy arrays
      whose first dimension is `num_samples`. `rewards` and `dones` are cast
      to float32; the other arrays keep the dtype NumPy infers from the data.

    Raises:
      ValueError: raised by `np.random.choice` when the buffer is empty and
        `num_samples` is greater than zero.
    """
    states, actions, rewards, next_states, dones = [], [], [], [], []
    idx = np.random.choice(len(self.buffer), num_samples)
    for i in idx:
      state, action, reward, next_state, done = self.buffer[i]
      # np.asarray copies only when it has to. The previous
      # np.array(..., copy=False) raises ValueError on NumPy >= 2.0 whenever a
      # copy is unavoidable (e.g. for Python ints or lists).
      states.append(np.asarray(state))
      actions.append(np.asarray(action))
      rewards.append(reward)
      next_states.append(np.asarray(next_state))
      dones.append(done)
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards, dtype=np.float32)
    next_states = np.array(next_states)
    dones = np.array(dones, dtype=np.float32)
    return states, actions, rewards, next_states, dones

In [6]:
def select_epsilon_greedy_action(state, epsilon):
  """Pick an action epsilon-greedily.

  A uniform random number is drawn; if it falls below `epsilon` the action is
  sampled from the environment's action space, otherwise the action with the
  largest `main_nn` output for `state` is returned.

  Args:
    state: input passed straight to `main_nn`; only row 0 of the output is
      used, so a leading batch dimension of size 1 is expected.
    epsilon: probability of taking a random action.
  """
  explore = tf.random.uniform((1,)) < epsilon
  if explore:
    # Exploration: random action (left or right).
    return env.action_space.sample()
  # Exploitation: index of the highest network output for this state.
  q_values = main_nn(state)
  return tf.argmax(q_values[0]).numpy()

In [7]:
@tf.function
def train_step(states, actions, rewards, next_states, dones):
  """Run one gradient update of `main_nn` on a sampled batch.

  The regression target for each transition is
  `reward + (1 - done) * discount * max_a target_nn(next_state)[a]`, and the
  loss is the mean squared error between that target and the `main_nn` output
  for the action that was actually taken.

  Returns:
    The scalar loss for the batch.
  """
  # Targets come from the target network and are computed outside the tape,
  # so they are not differentiated.
  best_next_q = tf.reduce_max(target_nn(next_states), axis=-1)
  target = rewards + (1. - dones) * discount * best_next_q
  with tf.GradientTape() as tape:
    # A one-hot mask keeps only the output belonging to the taken action.
    taken = tf.one_hot(actions, num_actions)
    predicted = tf.reduce_sum(taken * main_nn(states), axis=-1)
    loss = mse(target, predicted)
  variables = main_nn.trainable_variables
  optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
  return loss

# Start running the DQN algorithm and see how the algorithm learns.

In [8]:
# Hyperparameters.
num_episodes = 1000
epsilon = 1.0
batch_size = 32
discount = 0.99
buffer = ReplayBuffer(100000)
cur_frame = 0

# Training: act in the environment one step at a time, store every transition,
# and run one gradient step per environment step once the buffer holds at
# least one batch.
last_100_ep_rewards = []
for episode in range(num_episodes+1):
  state = env.reset()
  ep_reward = 0
  done = False
  while not done:
    state_in = tf.expand_dims(state, axis=0)
    action = select_epsilon_greedy_action(state_in, epsilon)
    next_state, reward, done, info = env.step(action)
    ep_reward += reward
    # Save to experience replay.
    buffer.add(state, action, reward, next_state, done)
    state = next_state
    cur_frame += 1
    # Every 2000 frames, overwrite target_nn's weights with main_nn's.
    if cur_frame % 2000 == 0:
      target_nn.set_weights(main_nn.get_weights())

    # Skip training until a full batch can be sampled.
    if len(buffer) < batch_size:
      continue
    states, actions, rewards, next_states, dones = buffer.sample(batch_size)
    loss = train_step(states, actions, rewards, next_states, dones)

  # Linear epsilon decay of 0.001 per episode for the first 950 episodes.
  if episode < 950:
    epsilon -= 0.001

  # Keep only the rewards of the most recent 100 episodes.
  last_100_ep_rewards = (last_100_ep_rewards + [ep_reward])[-100:]

  # Report progress every 50 episodes.
  if episode % 50 != 0:
    continue
  print(f'Episode {episode}/{num_episodes}. Epsilon: {epsilon:.3f}. '
        f'Reward in last 100 episodes: {np.mean(last_100_ep_rewards):.3f}')
env.close()

Episode 0/1000. Epsilon: 0.999. Reward in last 100 episodes: 25.000
Episode 50/1000. Epsilon: 0.949. Reward in last 100 episodes: 19.000
Episode 100/1000. Epsilon: 0.899. Reward in last 100 episodes: 20.370
Episode 150/1000. Epsilon: 0.849. Reward in last 100 episodes: 22.450
Episode 200/1000. Epsilon: 0.799. Reward in last 100 episodes: 22.750
Episode 250/1000. Epsilon: 0.749. Reward in last 100 episodes: 25.000
Episode 300/1000. Epsilon: 0.699. Reward in last 100 episodes: 34.770
Episode 350/1000. Epsilon: 0.649. Reward in last 100 episodes: 43.630
Episode 400/1000. Epsilon: 0.599. Reward in last 100 episodes: 57.980
Episode 450/1000. Epsilon: 0.549. Reward in last 100 episodes: 76.860
Episode 500/1000. Epsilon: 0.499. Reward in last 100 episodes: 106.710
Episode 550/1000. Epsilon: 0.449. Reward in last 100 episodes: 139.260
Episode 600/1000. Epsilon: 0.399. Reward in last 100 episodes: 171.220
Episode 650/1000. Epsilon: 0.349. Reward in last 100 episodes: 204.250
Episode 700/1000. E

# Display Result of Trained DQN Agent on Cartpole Environment

In [9]:
def show_video():
  """Display the first MP4 found under `video/` inline in the notebook.

  Globs `video/*.mp4`, base64-encodes the first match and embeds it in an
  HTML <video> element rendered through IPython's display machinery. When no
  file matches, prints "Video not found" instead. This function only plays
  back an existing recording; it does not enable recording itself.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Video not found")
    return
  # Open read-only inside a context manager so the file handle is closed
  # deterministically and no write permission on the file is required.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in RecordVideo, writing to './video'.

  The episode trigger returns True for every episode number.
  """
  def record_every_episode(episode_number):
    return True

  return RecordVideo(env, './video', episode_trigger=record_every_episode)

In [10]:
# Play one episode with the trained network (epsilon=0.01, so actions are
# almost always greedy) in an environment wrapped for video recording, then
# show the recording inline.
env = wrap_env(gym.make('CartPole-v1', render_mode="rgb_array"))
state = env.reset()
done = False
ep_rew = 0
while not done:
  env.render()
  # Add a leading batch dimension for the network; `state` itself keeps
  # holding the raw observation returned by the environment.
  state_in = tf.expand_dims(state, axis=0)
  action = select_epsilon_greedy_action(state_in, epsilon=0.01)
  state, reward, done, info = env.step(action)
  ep_rew += reward
print('Episode reward was {}'.format(ep_rew))
# Close the environment before looking for the recorded file.
env.close()
show_video()

Episode reward was 242.0


  and should_run_async(code)
