#Brute Force Agent
This is an agent that utilizes the brute force algorithm provided by the Gym Retro examples.

The brute force algorithm relies on the determinism of the environment in order to brute force the best sequence of actions. It does not save emulator state but saves the action sequence that led to the best outcome.

#Mount Google Drive

In [0]:
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    %tensorflow_version 2.x
    COLAB = True
    print("Note: using Google CoLab")
except:
    print("Note: not using Google CoLab")
    COLAB = False

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
TensorFlow 2.x selected.
Note: using Google CoLab


#Install Retro & Imports

In [0]:
!pip3 install gym-retro # install OpenAI Gym-Retro

Collecting gym-retro
[?25l  Downloading https://files.pythonhosted.org/packages/73/6d/2c9f009663b74bcf66a2306c2b8a819a1ac6b0d3090e342720291b527446/gym_retro-0.7.1-cp36-cp36m-manylinux1_x86_64.whl (162.0MB)
[K     |████████████████████████████████| 162.0MB 57kB/s 
Installing collected packages: gym-retro
Successfully installed gym-retro-0.7.1


In [0]:
import gym
import retro
import numpy as np
import matplotlib.pyplot as plt
import imageio
import cv2
import os
from datetime import datetime, timedelta
from time import time

#Install Roms

In [0]:
# Register every file found in the Drive ROM folder with Gym Retro
romPath = '/content/drive/My Drive/Colab Notebooks/Roms/'
for rom in os.listdir(romPath):
  # romPath ends with '/', so join() yields the same path as concatenation
  retro.data.merge(os.path.join(romPath, rom), quiet=False)

Importing SpaceInvaders-Nes
Imported 1 games
Importing Joust-Nes
Imported 1 games
Importing SuperMarioBros-Nes
Imported 1 games
Importing MsPacMan-Nes
Imported 1 games


#Environment Wrappers

In [0]:
# Map a fixed list of button combos onto a discrete action space
class Discretizer(gym.ActionWrapper):
  """
  Action wrapper that replaces a MultiBinary action space with a Discrete one.

  `combos` is a list of button-name lists. Discrete action i is translated
  into a boolean array in which exactly the buttons named in combos[i] are
  set, indexed by their position in `env.unwrapped.buttons`.
  """
  def __init__(self, env, combos):
    super().__init__(env)
    assert isinstance(env.action_space, gym.spaces.MultiBinary)
    button_names = env.unwrapped.buttons
    self._decode_discrete_action = [
      self._to_button_mask(button_names, combo, env.action_space.n)
      for combo in combos
    ]
    self.action_space = gym.spaces.Discrete(len(self._decode_discrete_action))

  @staticmethod
  def _to_button_mask(button_names, combo, size):
    """Build the boolean array of length `size` that presses `combo`."""
    mask = np.array([False] * size)
    for name in combo:
      mask[button_names.index(name)] = True
    return mask

  def action(self, act):
    """Translate discrete action index `act` into a copy of its button mask."""
    button_mask = self._decode_discrete_action[act]
    return button_mask.copy()

# Limit the episode length
class TimeLimit(gym.Wrapper):
  """
  Wrapper that ends an episode after a fixed number of calls to step().

  Args:
    env: environment to wrap.
    max_episode_steps: number of steps after which `done` is forced to True.
      None (the default) disables the limit.
  """
  def __init__(self, env, max_episode_steps=None):
    super().__init__(env)
    self._max_episode_steps = max_episode_steps
    self._elapsed_steps = 0

  def step(self, action):
    """Step the wrapped env; force `done` once the step budget is used up."""
    obs, reward, done, info = self.env.step(action)
    self._elapsed_steps += 1
    # Guard against the default None: `int >= None` raises TypeError.
    limit_reached = (self._max_episode_steps is not None
                     and self._elapsed_steps >= self._max_episode_steps)
    if limit_reached:
      done = True
      info['TimeLimit.truncated'] = True
    return obs, reward, done, info

  def reset(self, **kwargs):
    """Restart the step counter and reset the wrapped env."""
    self._elapsed_steps = 0
    return self.env.reset(**kwargs)

# Skip frames
class SkipFrames(gym.Wrapper):
  """
  Wrapper that repeats each action for up to `n` consecutive env steps.

  The rewards of the repeated steps are summed; the observation, done flag
  and info returned are those of the last step actually executed.
  """
  def __init__(self, env, n = 4):
    super().__init__(env)
    self.n = n

  def step(self, action):
    """Apply `action` up to n times, stopping early when the episode ends."""
    accumulated = 0.0
    done = False
    for _ in range(self.n):
      obs, reward, done, info = self.env.step(action)
      accumulated += reward
      if not done:
        continue
      break
    return obs, accumulated, done, info

# combine the wrappers
def make_env(game, state, scenario, record, obs_type, combos, timeLimit):
  """
  Create a Gym Retro environment and stack the notebook's wrappers on it.

  Wrapping order, innermost first: Discretizer (button combos), TimeLimit
  (step budget `timeLimit`), SkipFrames (default frame skip).
  """
  base_env = retro.make(
    game=game,
    state=state,
    scenario=scenario,
    record=record,
    obs_type=obs_type,
  )
  discrete_env = Discretizer(base_env, combos=combos)
  limited_env = TimeLimit(discrete_env, timeLimit)
  return SkipFrames(limited_env)

#Utility functions

In [0]:
# Render avi or gif
def renderFrames(frame_array, savePath, fileName, fps, otype='AVI'):
  """
  Write a sequence of frames to disk as an AVI video or an animated GIF.

  Args:
    frame_array: sequence of image frames. For AVI output the first frame's
      shape must unpack as (height, width, channels). Frames are treated as
      RGB, which is what the agent collects via render(mode='rgb_array').
    savePath: directory prefix. It is concatenated directly with the file
      name, so it must end with a path separator.
    fileName: output file name without extension.
    fps: frames per second of the output.
    otype: 'AVI' or 'GIF'. Any other value only prints an error.
  """
  print('Converting frames to .{} ...'.format(otype), end=' ')
  if otype not in ('AVI', 'GIF'):
    print('Error: Invalid type, must be GIF or AVI.')
    return
  if len(frame_array) == 0:
    # Nothing to encode; frame_array[0] below would raise IndexError.
    print('Error: no frames to render.')
    return

  if otype == 'AVI':
    outFile = savePath + fileName + '.avi'
    height, width, layers = frame_array[0].shape
    is_color = layers != 1  # single-channel frames are written as grayscale
    out = cv2.VideoWriter(outFile, cv2.VideoWriter_fourcc(*'DIVX'), fps,
                          (width, height), is_color)
    try:
      for frame in frame_array:
        if layers == 3:
          # VideoWriter expects BGR channel order; the frames are RGB.
          frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        out.write(frame)
    finally:
      # Release the writer even if a frame fails to encode.
      out.release()
  else:
    outFile = savePath + fileName + '.gif'
    # Write the frames passed in (previously an undefined name was used here).
    imageio.mimwrite(outFile, frame_array, fps=fps)
  print('Done. Saved to {}'.format(outFile))

# Get the current date/time formatted as a string
def Now():
  """Return the local clock time shifted back six hours as 'MM_DD_YYYY_HHMMSS'."""
  # fixed -6 hour shift, used here as a hard-coded time-zone correction
  shifted = datetime.now() + timedelta(hours=-6)
  return '{:%m_%d_%Y_%H%M%S}'.format(shifted)

# save the log to file
def saveLog(savePath, fileName, log):
  """
  Write the log lines to `savePath + fileName + '.log'`, one per line,
  followed by the current Now() timestamp (no trailing newline).

  Args:
    savePath: directory prefix; concatenated directly with the file name.
    fileName: log file name without the '.log' extension.
    log: iterable of strings to write.
  """
  print('Saving log ...', end=' ')
  logFile = savePath + fileName + '.log'
  # The context manager closes the file even if one of the writes raises.
  with open(logFile, 'w') as f:
    for line in log:
      f.write(line + '\n')
    f.write(Now())
  print('Done. Saved to {}'.format(logFile))

#Agent
Brute force agent modeled after code found here: [Brute Algorithm](https://github.com/openai/retro/blob/master/retro/examples/brute.py)

In [0]:
class Node:
  """
  One node of the action tree built by the brute force agent.

  Attributes:
    value: best total reward recorded for any rollout passing through this
      node (-inf until the node is updated).
    visits: number of times the node has been updated.
    children: dict mapping an action to the child Node reached by it.
  """
  def __init__(self, value=-np.inf, children=None):
    self.value = value
    self.visits = 0
    # A fresh dict per node; a shared mutable default would alias children.
    self.children = {} if children is None else children

  def __repr__(self):
    # str.format needs '{:f}' / '{:d}'; the old '{%f}' style raised KeyError.
    return '<Node value={:f} visits={:d} len(children)={:d}>'.format(
      self.value,
      self.visits,
      len(self.children) )
    
class Results:
  """Record of the best episode found so far, plus the run log."""
  def __init__(self):
    # per-run text log, one entry per line
    self.log = []
    # best episode: its actions and rendered frames
    self.actions = []
    self.frames = []
    # best score starts below any real reward; episode is unset until then
    self.score = -np.inf
    self.episode = None

class Agent:
  """
  Brute force agent that searches for the best action sequence.

  The agent keeps a tree of the action sequences tried so far. Every node
  stores the best total reward of any rollout that passed through it. Each
  episode replays the environment from reset along a mostly greedy path
  through the tree, so it relies on the environment being deterministic.

  Args:
    env: environment exposing action_space (with `n` and `sample()`),
      reset(), step(action) and render(mode='rgb_array').
    max_episode_steps: number of actions selected for each episode.
  """
  def __init__(self, env, max_episode_steps):
    self.node_count = 1
    self._root = Node()
    self._env = env
    self._max_episode_steps = max_episode_steps
    self._EXPLORATION_PARAM = 0.005
    self.results = Results() # store the results of the best episode

  def _select_actions(self):
    """
    Select actions from the tree.
    Select the greedy action that has the highest reward for its subtree.
    Small chance to select a random action based on exploration param and
    visit count of current node at each step.
    We select actions for the longest possible episode, but normally these
    will not all be used.  They will instead be truncated to the length
    of the actual episode and then used to update the tree.
    """
    node = self._root
    action_space = self._env.action_space
    actions = []
    steps = 0
    while steps < self._max_episode_steps:
      if node is None:
        # fallen off the explored area of tree, just select random actions
        action = action_space.sample()
      else:
        # exploration chance shrinks as the node is visited more often
        epsilon = self._EXPLORATION_PARAM / np.log(node.visits + 2)
        if np.random.random() < epsilon: # random action
          action = action_space.sample()
        else: # greedy action
          # unexplored actions count as -inf so explored ones are preferred
          action_value = {}
          for candidate in range(action_space.n):
            if candidate in node.children:
              action_value[candidate] = node.children[candidate].value
            else:
              action_value[candidate] = -np.inf
          best_value = max(action_value.values())
          best_actions = [
            candidate for candidate, value in action_value.items()
            if value == best_value
          ]
          # ties are broken uniformly at random
          action = np.random.choice(best_actions)

        # descend; None marks that the path has left the explored tree
        node = node.children.get(action)
      actions.append(action)
      steps += 1
    return actions

  def _rollout(self, actions):
    """
    Perform a rollout using a preset collection of actions.

    Resets the environment, then applies the actions in order until they run
    out or the environment reports done. Returns the number of steps taken,
    the summed reward and the list of rendered frames (one per step).
    """
    frames = []
    totalReward = 0
    self._env.reset()
    steps = 0
    for action in actions:
      obs, reward, done, _info = self._env.step(action)
      steps += 1
      totalReward += reward
      frames.append(self._env.render(mode='rgb_array'))
      if done:
        break
    return steps, totalReward, frames

  def _update_tree(self, executed_actions, totalReward):
    """
    Given the tree, a list of actions that were executed before the game ended,
    and a reward, update the tree so that the path formed by the executed
    actions are all updated to the new reward.

    Returns the number of nodes that had to be created along the path.
    """
    self._root.value = max(totalReward, self._root.value)
    self._root.visits += 1
    new_nodes = 0
    node = self._root
    for action in executed_actions:
      if action not in node.children:
        node.children[action] = Node()
        new_nodes += 1
      node = node.children[action]
      node.value = max(totalReward, node.value)
      node.visits += 1
    return new_nodes

  def learn(self, numEpisodes = 1, verbose = 2):
    """
    Run `numEpisodes` episodes, growing the tree and tracking the best one.

    The best episode's number, score, actions and frames are stored in
    self.results; every log line is appended to self.results.log.

    Args:
      numEpisodes: number of episodes to run.
      verbose: 0 prints nothing, 1 prints only the final summary, 2 also
        prints one line per episode.
    """
    sTime = time()
    for i in range(numEpisodes):
      actions = self._select_actions()
      steps, totalReward, frames = self._rollout(actions)
      # only the actions that were actually played are added to the tree
      executed_actions = actions[:steps]
      self.node_count += self._update_tree(executed_actions, totalReward)
      logStr = 'Episode {:d}: actions={:d}, score={}, nodes={:d}'.format(
          i+1, len(executed_actions), totalReward, self.node_count)
      self.results.log.append(logStr)
      if verbose > 1:
        print(logStr)
      if totalReward > self.results.score:
        self.results.score = totalReward
        self.results.episode = i+1
        self.results.actions = executed_actions
        self.results.frames = frames
    eTime = round(time() - sTime, 4)
    logStr1 = '{} episodes ran and took {} seconds'.format(
        numEpisodes, eTime)
    if self.results.episode is None:
      # e.g. numEpisodes == 0: '{:d}'.format(None) would raise TypeError
      logStr2 = 'No best episode was recorded.'
    else:
      logStr2 = 'Episode {:d} had the best score of {}.'.format(
          self.results.episode, self.results.score)
    self.results.log.append(logStr1)
    self.results.log.append(logStr2)
    if verbose > 0:
      if verbose > 1:
        print('-'*80)
      print(logStr1)
      print(logStr2)

#Setup

In [0]:
class Args:
  """Static run configuration shared by the training cell."""
  # games selectable by index in the training cell
  game = [
    'SpaceInvaders-Nes',
    'Joust-Nes',
    'SuperMarioBros-Nes',
    'MsPacMan-Nes',
  ]
  # button combos selectable by index; each inner list is one discrete action
  combos = [
    [['LEFT'], ['RIGHT'], ['A'], ['LEFT', 'A'], ['RIGHT', 'A']],
    [['LEFT'], ['RIGHT'], ['UP'], ['DOWN']],
  ]
  # arguments forwarded to retro.make() through make_env()
  state = retro.State.DEFAULT
  scenario = 'scenario'
  record = False
  obs_type = retro.Observations.IMAGE
  # output folder for the rendered video and the log file
  savePath = '/content/drive/My Drive/Colab Notebooks/'

args = Args()

#Train

In [0]:
# Not strictly needed, but useful while debugging: close an environment left
# open by an earlier run of this cell that did not reach its own close().
try:
  if env:
    env.close()
except NameError:
  # first run in this kernel: no environment exists yet
  pass

# gameIndex:  0: Space Invaders  1: Joust  2: Super Mario Bros  3: Ms. Pac-Man (all NES)
# comboIndex: 0: [LEFT, RIGHT, A]  1: [LEFT, RIGHT, UP, DOWN]
gameIndex, comboIndex, numEpisodes, maxSteps = 3, 1, 250, 10000

env = make_env(args.game[gameIndex], args.state, args.scenario, args.record,
               args.obs_type, args.combos[comboIndex], maxSteps)

# Close the environment even when training raises, so a failed run does not
# leave it open.
try:
  agent = Agent(env, maxSteps)
  agent.learn(numEpisodes=numEpisodes)
finally:
  env.close()

# Save the best episode as a video and the full run log next to it
fileName = 'Brute_' + args.game[gameIndex] + '_E{}_{}'.format(
    agent.results.episode, Now())
renderFrames(agent.results.frames, args.savePath, fileName, 60, otype='AVI')
fileName = 'Brute_' + args.game[gameIndex]
saveLog(args.savePath, fileName, agent.results.log)

Episode 1: actions=1007, score=300.0, nodes=1008
Episode 2: actions=868, score=320.0, nodes=1415
Episode 3: actions=772, score=310.0, nodes=1985
Episode 4: actions=868, score=320.0, nodes=1985
Episode 5: actions=979, score=460.0, nodes=2391
Episode 6: actions=986, score=460.0, nodes=2640
Episode 7: actions=936, score=370.0, nodes=3425
Episode 8: actions=778, score=160.0, nodes=4120
Episode 9: actions=926, score=550.0, nodes=4901
Episode 10: actions=729, score=440.0, nodes=5321
Episode 11: actions=926, score=550.0, nodes=5349
Episode 12: actions=795, score=300.0, nodes=6022
Episode 13: actions=963, score=420.0, nodes=6971
Episode 14: actions=805, score=460.0, nodes=7409
Episode 15: actions=758, score=400.0, nodes=8165
Episode 16: actions=806, score=440.0, nodes=8688
Episode 17: actions=850, score=440.0, nodes=9228
Episode 18: actions=735, score=280.0, nodes=9903
Episode 19: actions=820, score=300.0, nodes=10596
Episode 20: actions=838, score=450.0, nodes=11058
Episode 21: actions=851, s