<a href="https://colab.research.google.com/github/DeepLearningVision-2019/DLV-Course-Material/blob/master/Notebooks/Reinforcement/Policy_gradients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#remove " > /dev/null 2>&1" to see what is going on under the hood
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install gym[box2d] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (41.0.1)


In [2]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor

import matplotlib
import matplotlib.pyplot as plt

import cv2
import numpy as np
import random, math

from keras import models, layers, optimizers

from collections import deque

import glob, io, base64

from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

gymlogger.set_level(40) #error only
%matplotlib inline

Using TensorFlow backend.


In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Embed a recorded episode video in the notebook output.

  Searches ``video/*.mp4`` and displays the first match returned by
  ``glob`` as a base64-encoded HTML5 ``<video>`` element. Prints
  "Could not find video" when no mp4 file is present.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is sufficient (no write permission needed), and the
    # context manager guarantees the file handle is closed after reading.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return `env` wrapped in a gym ``Monitor`` that writes to ``./video``.

  The Monitor is created with ``force=True``; the wrapped environment is
  what callers should step and close from then on.
  """
  return Monitor(env, './video', force=True)

In [4]:
# Start a non-visible (visible=0) 1400x900 virtual display through
# pyvirtualdisplay; the value of display.start() is shown as the cell output.
display = Display(visible=0, size=(1400, 900))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [5]:
# Loads the Pong environment (PongDeterministic-v4), wrapped for video recording
env = wrap_env(gym.make('PongDeterministic-v4'))

# First dimension of the observation shape and the number of discrete actions.
# NOTE: state_size is the frame height (printed as 210 below), not the length
# of the 80*80 preprocessed vector the network takes as input.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print(state_size, action_size)

# Human-readable action names, listed in action-index order
actions = env.unwrapped.get_action_meanings()

# right is up, left is down
print(actions)

# NOTE(review): batch_size is not referenced by any other cell shown here.
batch_size = 32

# Number of episodes the training cell iterates over
n_episodes = 10000

# Sanity check: draws one of the two action ids used by the agent
print(np.random.choice([2,3]))

# Action ids used by the agent: index 2 is 'RIGHT' and index 3 is 'LEFT'
# in the action list printed above.
up_action = 2
down_action = 3

210 6
['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
3


In [0]:
# Play one episode with uniformly random up/down actions and show the recording.
env = wrap_env(gym.make('PongDeterministic-v4'))

try:
    observation = env.reset()
    done = False

    while not done:

        env.render()

        #your agent goes here
        action = np.random.choice([up_action, down_action])
        #action = env.action_space.sample()

        observation, reward, done, info = env.step(action)

finally:
    # Always release the environment (and its video recorder), even when
    # render/step raises part-way through the episode.
    env.close()

show_video()

In [0]:
def discount_rewards(reward, gamma):
  """Turn per-step rewards into normalized discounted returns.

  Walking backwards through the episode, each step receives its own reward
  plus `gamma` times the return of the following step. The running sum is
  reset at every non-zero reward, so returns do not leak across the point
  boundaries of a Pong game.

  Args:
    reward: sequence of per-step rewards for one episode.
    gamma: discount factor applied per step.

  Returns:
    Float array of the same length as `reward`: discounted returns with the
    mean subtracted and, when the standard deviation is non-zero, divided by
    it. An empty input yields an empty array; constant returns yield zeros
    instead of NaN.
  """
  r = np.array(reward)
  # Integer rewards would give an integer buffer below, truncating the
  # discounted values and making the in-place normalization raise.
  if not np.issubdtype(r.dtype, np.floating):
    r = r.astype(np.float64)
  discounted_r = np.zeros_like(r)

  if r.size == 0:
    return discounted_r

  running_add = 0.0

  for t in reversed(range(0, r.size)):

    if r[t] != 0: running_add = 0.0 # a point was scored (in Pong), reset the reward sum
    running_add = running_add * gamma + r[t] # Horner's method: one multiply-add per step
    discounted_r[t] = running_add

  discounted_r -= np.mean(discounted_r) #normalizing the result
  spread = np.std(discounted_r)
  # Skip the division when every return is identical: dividing by zero would
  # turn all sample weights into NaN.
  if spread > 0:
    discounted_r /= spread

  return discounted_r

In [0]:
class DQNAgent:
    """Agent holding a small Keras network with a single sigmoid output.

    The network maps a flattened 80*80 input to one probability and is fitted
    with binary cross-entropy, each sample weighted by its normalized
    discounted return.
    """

    def __init__(self, state_size, action_size):
        """Record the given sizes, set the discount factor and build the network."""
        self.state_size, self.action_size = state_size, action_size
        self.gamma = 0.99
        self.model = self._build_model()

    def _build_model(self):
        """Create and compile the two-layer dense network."""
        hidden = layers.Dense(200, input_dim=80*80, activation='relu', kernel_initializer='glorot_uniform')
        output = layers.Dense(1, activation='sigmoid', kernel_initializer='RandomNormal')

        net = models.Sequential()
        for layer in (hidden, output):
            net.add(layer)

        net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return net

    def train(self, states, labels, rewards):
        """Fit the network on one batch of collected steps.

        `states` and `labels` are stacked row-wise; `rewards` is converted by
        `discount_rewards` into the per-sample weights passed to `fit`.
        """
        inputs = np.vstack(states)
        targets = np.vstack(labels)
        # sample_weight scales each sample's contribution to the loss (training only)
        weights = discount_rewards(rewards, self.gamma)
        self.model.fit(x=inputs, y=targets, verbose=0, sample_weight=weights)

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Write network weights to the file `name`."""
        self.model.save_weights(name)

In [11]:
# Build the agent (this constructs and compiles its Keras network) and print
# the layer/parameter summary.
agent = DQNAgent(state_size, action_size)
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 200)               1280200   
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 201       
Total params: 1,280,401
Trainable params: 1,280,401
Non-trainable params: 0
_________________________________________________________________


In [0]:
def preprocessFrame(image):
  """Convert a 210x160x3 uint8 frame into a 6400-long (80x80) 1D float vector.

  Rows 35..194 are kept, every second row and column of channel 0 is
  sampled, and the result is binarized: pixels equal to 0, 144 or 109
  (the two background colours) become 0.0 and everything else (paddles,
  ball) becomes 1.0.

  The input array is left unmodified.

  Args:
    image: frame array indexable as [row, column, channel].

  Returns:
    Flattened float64 array of zeros and ones.
  """
  # Crop and downsample by a factor of 2 in a single slice; this is a view,
  # so it must not be written to.
  frame = image[35:195:2, ::2, 0]
  # Build the mask as a new array instead of assigning into the view, which
  # would overwrite pixels of the caller's frame.
  foreground = (frame != 0) & (frame != 144) & (frame != 109)

  # np.float was removed from NumPy; use the explicit float64 dtype.
  return foreground.astype(np.float64).ravel()

In [0]:
# Policy-gradient training: play one full episode with the current policy,
# then fit the network on that episode's (frame difference, action) pairs
# weighted by the normalized discounted returns.
env = wrap_env(gym.make('PongDeterministic-v4'))
running_reward = None

try:
    for e in range(n_episodes):

        states_train, labels_train, rewards = [], [], []
        total_reward = 0

        next_state = env.reset()
        prev_state = None

        done = False

        while not done:

            #env.render()
            current_state = preprocessFrame(next_state)

            # The network input is the difference between consecutive
            # preprocessed frames; the first step of an episode uses zeros.
            delta_state = current_state - prev_state if prev_state is not None else np.zeros(80*80)

            prev_state = current_state

            prob_up = agent.model.predict(np.expand_dims(delta_state, axis = 1).T)

            # Sample the action from the predicted probability of moving up.
            action = up_action if np.random.uniform() < prob_up else down_action

            # The training label is the action actually taken (1 = up).
            label = 1 if action == up_action else 0

            states_train.append(delta_state)
            labels_train.append(label)

            next_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            total_reward += reward

        # Update the exponential moving average after EVERY episode; updating it
        # only on logging episodes would ignore 99% of the episode scores.
        running_reward = total_reward if running_reward is None else running_reward * 0.99 + total_reward * 0.01

        if e % 100 == 0:
            print('Episode: {} Reward {}'.format(e, total_reward))
            agent.save('model_weights_{}_{}_.hdf5'.format(e, running_reward))

        agent.train(states_train, labels_train, rewards)

finally:
    env.close()

Episode: 0 Reward -21.0
Episode: 100 Reward -21.0
Episode: 200 Reward -21.0
Episode: 300 Reward -21.0
Episode: 400 Reward -21.0
Episode: 500 Reward -21.0
Episode: 600 Reward -21.0
Episode: 700 Reward -21.0
Episode: 800 Reward -21.0
Episode: 900 Reward -21.0
Episode: 1000 Reward -21.0
Episode: 1100 Reward -21.0
Episode: 1200 Reward -21.0
Episode: 1300 Reward -21.0
Episode: 1400 Reward -21.0
Episode: 1500 Reward -21.0
Episode: 1600 Reward -21.0
Episode: 1700 Reward -21.0
Episode: 1800 Reward -21.0
Episode: 1900 Reward -21.0
Episode: 2000 Reward -21.0
Episode: 2100 Reward -21.0
Episode: 2200 Reward -21.0
Episode: 2300 Reward -21.0
Episode: 2400 Reward -21.0
Episode: 2500 Reward -21.0
Episode: 2600 Reward -21.0
Episode: 2700 Reward -21.0
Episode: 2800 Reward -21.0
Episode: 2900 Reward -21.0
Episode: 3000 Reward -21.0
Episode: 3100 Reward -21.0
Episode: 3200 Reward -21.0
Episode: 3300 Reward -21.0
Episode: 3400 Reward -21.0
Episode: 3500 Reward -21.0
Episode: 3600 Reward -21.0
Episode: 3700

In [0]:
import os
from google.colab import files

# The training cell writes checkpoints named
# 'model_weights_<episode>_<running_reward>_.hdf5'; a plain 'model_weights.hdf5'
# is never created. Download the most recently written checkpoint instead.
checkpoints = sorted(glob.glob('model_weights_*_.hdf5'), key=os.path.getmtime)
if checkpoints:
    files.download(checkpoints[-1])
else:
    print("Could not find any saved model weights")

In [0]:
# Play one recorded episode with the trained agent and show the video.
env = wrap_env(gym.make('PongDeterministic-v4'))
#agent.load('0700hdf5')

try:
    observation = env.reset()
    prev_state = None

    total_reward = 0
    done = False

    while not done:

        env.render()

        # Feed the network the same input it was trained on: the difference
        # between consecutive preprocessed frames (zeros on the first step).
        current_state = preprocessFrame(observation)
        delta_state = current_state - prev_state if prev_state is not None else np.zeros(80*80)
        prev_state = current_state

        # Sample the action from the predicted probability of moving up,
        # exactly as the training loop does.
        prob_up = agent.model.predict(np.expand_dims(delta_state, axis = 1).T)
        action = up_action if np.random.uniform() < prob_up else down_action

        observation, reward, done, info = env.step(action)

        total_reward += reward

finally:
    env.close()
    show_video()