## Ezana N. Beyenne
##MSDS 462 - Module 2 - Open AI MsPacman

# install dependancies, takes around 45 seconds

Rendering Dependancies



In [2]:
#remove " > /dev/null 2>&1" to see what is going on under the hood
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

Pacman Dependancies

In [3]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.7/dist-packages (54.2.0)


# Imports and Helper functions


In [8]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [9]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7fccc60c1950>

In [11]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

# Pacman!

In [35]:
env1 = wrap_env(gym.make("MsPacman-v0"))

In [36]:
# 9 discrete actions that can be taken by the joystick
# and the meanings of these 9 discrete actions
print(env1.action_space)
print(env1.env.get_action_meanings())
print(env1.observation_space)

Discrete(9)
['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT']
Box(0, 255, (210, 160, 3), uint8)


In [37]:
observation = env1.reset()

while True:
  
    env1.render()
    
    #your agent goes here
    action = env1.action_space.sample() 
         
    observation, reward, done, info = env1.step(action) 
        
    if done: 
      break;

       
env1.close()
show_video()

# Pacman 2 with Q learning and greedy epsilon strategy
References:
<ul>
 <ol>1. https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/</ol>
 <ol>2. https://github.com/mcgovey/openai-gym-pacman-q-learning/blob/master/Rendering_OpenAi_Gym_in_Colaboratory.ipynb</ol>
</ul>

In [30]:
env = wrap_env(gym.make("MsPacman-v0"))

In [31]:
# 9 discrete actions that can be taken by the joystick
# and the meanings of these 9 discrete actions
print(env.action_space)
print(env.env.get_action_meanings())
print(env.observation_space)

Discrete(9)
['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT']
Box(0, 255, (210, 160, 3), uint8)


In [32]:
# Set up Q table using Numpy Array and the value of the observeration space ( 210 * 160, action space)
Q = np.zeros([210*160, env.action_space.n])
# setup the learning rate, default is 0.618
alpha = 0.618
# Epsilon greedy strategy
epsilon = 0.8
# Accumulated reward defined as G
G = 0
state = env.reset()

In [33]:
for episode in range(1,101):
    done = False
    G, reward = 0,0
    state = env.reset()
    while done != True:
            #1. Choose action with the highest Q value, initial values will be zero
            #   Argmax will return the index/action with the highest value for that state
            #   take less random steps as you learn more about the game
            #   epsilon greedy strategy
            if random.random() < (1 - epsilon): 
              action = random.randint(0,env.action_space.n-1)
            else:
              action = np.argmax(Q[state]) 
            #2.The agent then takes action and we store the future state as state2 (St+1). 
            #  This will allow the agent to compare the previous state to the new state.
            state2, reward, done, info = env.step(action)
            #3 We update the state-action pair
            Q[state,action] += alpha * (reward + np.max(Q[state2]) - Q[state,action]) #3
            G += reward
            state = state2   
    if episode % 5 == 0:
        print('Episode {} Total Reward: {}'.format(episode,G))

Episode 5 Total Reward: 1100.0
Episode 10 Total Reward: 140.0
Episode 15 Total Reward: 240.0
Episode 20 Total Reward: 320.0
Episode 25 Total Reward: 630.0
Episode 30 Total Reward: 280.0
Episode 35 Total Reward: 520.0
Episode 40 Total Reward: 430.0
Episode 45 Total Reward: 330.0
Episode 50 Total Reward: 450.0
Episode 55 Total Reward: 1110.0
Episode 60 Total Reward: 650.0
Episode 65 Total Reward: 550.0
Episode 70 Total Reward: 900.0
Episode 75 Total Reward: 310.0
Episode 80 Total Reward: 850.0
Episode 85 Total Reward: 230.0
Episode 90 Total Reward: 460.0
Episode 95 Total Reward: 810.0
Episode 100 Total Reward: 980.0


In [34]:
env.close()
show_video()