# Libraries

In [6]:
import numpy as np 
import tensorflow as tf 
import gym 
from gym import wrappers
import pygame 
from gym.utils import play # manual play 
import matplotlib.pyplot as plt 
import seaborn as sns 
import io
import base64
from IPython.display import HTML
import random
from IPython.display import clear_output

# Constants 

In [7]:
# Gym environment id for the random-play demo (frame skip defaults to 4).
ATARI_GAME = "BreakoutDeterministic-v4"
# Number of random-action episodes to roll out.
NUM_OF_EPISODES = 10

# OpenAI Gym Environment Setup

In [8]:
# Create the Atari environment and wrap it in a Monitor that writes its
# recordings under ./gym-results (the video is read back from there later).
env = wrappers.Monitor(gym.make(ATARI_GAME), "./gym-results", force=True)

# To play the game as a human instead, uncomment the line below.
# play.play(env, zoom=3)

# Roll out NUM_OF_EPISODES episodes, sampling a random action at every
# timestep until the environment reports the episode has finished.
for episode in range(NUM_OF_EPISODES):
    env.reset()
    is_done = False
    while not is_done:
        random_action = env.action_space.sample()
        observation, reward, is_done, info = env.step(random_action)

env.close()

# Display offline mp4 simulation
Source: https://kyso.io/eoin/openai-gym-jupyter?utm_campaign=News&utm_medium=Community&utm_source=DataCamp.com

In [9]:
# Load the first recorded episode written by the Monitor wrapper, base64-encode
# it, and embed it inline as an HTML5 <video> element.
# The file is opened read-only ('rb' rather than 'r+b', so no write permission
# is needed) inside a `with` block so the handle is always closed.
video_path = './gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix
with io.open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
# Keep HTML(...) as the final expression so the notebook renders it.
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))

# Q-Learning Example

In [None]:
env = gym.make('FrozenLake-v0')

# Human-readable action names; within this cell they are only printed.
actions = {"LEFT": 0, "DOWN": 1, "RIGHT": 2, "UP": 3}
action_space_size = env.action_space.n
state_space_size = env.observation_space.n  # number of positions
print(f"Action space: {action_space_size}\n State space: {state_space_size}\n Actions: {actions}")

# Q-table: one row per state, one column per action, initialised to zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

# --- Hyperparameters ---
NUM_EPISODES = 10000          # games played
MAX_STEPS_PER_EPISODE = 100   # step cap per game
LEARNING_RATE = 0.1           # alpha in the Q update
DISCOUNT_RATE = 0.9           # gamma in the Q update

# Exploration schedule: the rate starts at the maximum and is decayed
# towards the minimum after every episode.
MAX_EXPLORATION_RATE = 1
MIN_EXPLORATION_RATE = 0.01
EXPLORATION_DECAY_RATE = 0.01
EXPLORATION_RATE = MAX_EXPLORATION_RATE

# Total reward collected in each episode, in play order.
rewards_all_episodes = list()

# Tabular Q-learning: play NUM_EPISODES games, updating q_table after each step.
for episode in range(NUM_EPISODES):
    state = env.reset()
    is_done = False
    rewards_current_episode = 0

    for step in range(MAX_STEPS_PER_EPISODE):

        # Explore/exploit trade-off: a uniform draw at or below the current
        # exploration rate picks a random action, otherwise the greedy one.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold <= EXPLORATION_RATE:
            action = env.action_space.sample()  # explore
        else:
            action = np.argmax(q_table[state, :])  # exploit

        new_state, reward, is_done, _ = env.step(action)

        # Q update: Q_new(s,a) = (1 - alpha) * Q_old(s,a) + alpha * [r + y * max_a' Q(s',a')]
        old_value = q_table[state, action]
        learned_value = reward + DISCOUNT_RATE * np.max(q_table[new_state, :])
        q_table[state, action] = (1 - LEARNING_RATE) * old_value + LEARNING_RATE * learned_value

        state = new_state
        rewards_current_episode += reward

        if is_done:
            break

    # Exponentially decay the exploration rate towards its minimum as the
    # episode index grows.
    decay_factor = np.exp(-EXPLORATION_DECAY_RATE * episode)
    EXPLORATION_RATE = MIN_EXPLORATION_RATE + (MAX_EXPLORATION_RATE - MIN_EXPLORATION_RATE) * decay_factor
    rewards_all_episodes.append(rewards_current_episode)
env.close()

# Calculate and print the average reward per thousand episodes.
# The section count passed to np.split is computed with integer division so it
# is an int; np.split raises ValueError if the rewards cannot be divided into
# that many equal chunks.
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), NUM_EPISODES // 1000)
count = 1000
print("***** Average reward per thousand episodes ***** \n")
for r in rewards_per_thousand_episodes:
    # Average over the chunk itself. The previous sum(r/10000) divided a
    # 1000-episode chunk by 10000 and under-reported every average tenfold.
    print(f"{count}: {str(r.mean())}")
    count += 1000

# Print the learned Q-table under a banner line.
print("\n\n ***** Q-table \n", q_table, sep="\n")

Action space: 4
 State space: 16
 Actions: {'LEFT': 0, 'DOWN': 1, 'RIGHT': 2, 'UP': 3}
