# Dependencies 

In [22]:
## Setup
# import environment
import gym_super_mario_bros
# import joypad
from nes_py.wrappers import JoypadSpace
# import controls
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

## Preprocessing
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, SubprocVecEnv
from matplotlib import pyplot as plt

## Agent training
import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
# from stable_baselines.common import set_global_seeds
from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv
# from stable_baselines.common.policies import MlpPolicy
# callback
import numpy as np
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor

#### Agent actions
These are the possible actions that the agent can take in the environment.

In [23]:
# Output locations used by the rest of the notebook.
CHECKPOINT_PATH = "./checkpoints_tutorialcpy2"  # folder for periodic model checkpoints
LOG_PATH = "./logs"  # folder for Monitor results, tensorboard logs and the best model

# Keep this expression last: a notebook cell only renders its final expression,
# so listing it first (as before) never displayed the action list.
SIMPLE_MOVEMENT

In [24]:

# Make sure the log folder exists before anything writes into it
# (exist_ok=True: no error when the folder is already there).
os.makedirs(LOG_PATH, exist_ok=True)

# Setup environment
For this project we will be making use of the gym-super-mario-bros environment.

In [25]:
# Build the base Super Mario Bros environment and display its raw action space.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env.action_space

Discrete(256)

In [26]:
# Wrap the environment in a Monitor pointed at LOG_PATH; SaveOnBestTrainingRewardCallback
# later reads the Monitor's results file back from that same folder.
env = Monitor(env, LOG_PATH)

Simplify the action space to limit the number of possible actions. This will make it easier for the agent to learn the optimal policy as there will be fewer possible actions to choose from.

In [27]:
# wrap environment with controls: restrict the agent to two button combinations,
# "right" and "right" + "A", then display the resulting (much smaller) action space
env = JoypadSpace(env, [["right"], ["right", "A"]])
env.action_space

Discrete(2)

The observation space serves as the input to the agent. The agent will use this information to learn the optimal policy.
In our case the observation space is a frame from the game.

In [28]:
# env.observation_space.shape

Test the environment with random actions.

In [29]:
# env_demo = env

# done = True # reset the env
# # loop X steps
# for step in range(200):
#     if done:
#         # start the env
#         env_demo.reset()
#     # pass a random action to the env
#     state, reward, done, info = env_demo.step(env_demo.action_space.sample()) 
#     # render the env
#     env_demo.render()
# env_demo.close()


Each step that the agent takes in the environment will return a new state. This state is the observation for the next step. In our case, it is an image of the game:

In [30]:
# state.shape
# # plt.imshow(state)

The reward function from the environment assumes that the objective is to move as far to the right as possible, as fast as possible and without dying. The value of the reward reflects this. More on how this reward is calculated can be found in the gym-super-mario-bros documentation: https://pypi.org/project/gym-super-mario-bros/

In [31]:
# reward

To check whether or not the game is still running we use the `done` variable.

In [32]:
# done

Miscellaneous information about the environment is found in the info variable.

In [33]:
# info

# Preprocessing Environment
To make use of the data that the environment returns we need to preprocess it. The two steps we will take are:
1. Convert the image to grayscale - This will reduce the size of the observation space and make it easier for the agent to learn the optimal policy.
2. Frame stacking - This gives the agent a sense of motion and context and helps it understand the dynamics of the game.

### Wrap the environment:

In [34]:
# Grayscale: collapse the RGB observation to a single channel.
# keep_dim=True keeps a trailing channel axis of size 1, as the printed shapes show.
print("Input shape before grayscale: ", env.observation_space.shape)
env = GrayScaleObservation(env, keep_dim=True)
print("Input shape after grayscale: ", env.observation_space.shape)


Input shape before grayscale:  (240, 256, 3)
Input shape after grayscale:  (240, 256, 1)


In [35]:
# Wrap the environment with the wrapper
env = DummyVecEnv([lambda: env])


In [36]:
# skip 4 frames
env = MaxAndSkipEnv(env, skip=4)

In [37]:
# FrameStack: stack 4 consecutive observations so the agent can perceive motion;
# channels_order='last' tells the wrapper the channel axis is the last one.
env = VecFrameStack(env, 4, channels_order='last')

# Training the agent


We use a callback function to save the model during training; this is good practice and avoids losing the model in case of a crash or any other problem.

In [38]:
#TODO - add early stopping callback
class TrainLoggingCallback(BaseCallback):
    """
    Periodically checkpoint the model that is being trained.

    Every ``freq`` calls of the callback, the current model is saved as
    ``model_<n_calls>`` inside ``path``.

    :param freq: (int) Number of callback calls between two consecutive saves.
    :param path: (str) Path to the folder where the model will be saved.
    :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
    """
    def __init__(self, freq, path, verbose=1):
        super().__init__(verbose)
        self.path = path
        self.freq = freq

    def _init_callback(self) -> None:
        """Create the checkpoint folder, unless no path was given."""
        if self.path is None:
            return
        os.makedirs(self.path, exist_ok=True)

    def _on_step(self) -> bool:
        """Save a checkpoint on every ``freq``-th call; always returns True."""
        # NOTE(review): path=None is tolerated in _init_callback, but
        # os.path.join below would raise on None — confirm None is never passed.
        due_for_save = self.n_calls % self.freq == 0
        if due_for_save:
            checkpoint_name = f"model_{self.n_calls}"
            self.model.save(os.path.join(self.path, checkpoint_name))
        return True
    
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Adapted from the stable-baselines3 example.

    Every ``check_freq`` calls, the results found in ``log_dir`` are loaded and
    the mean reward over the most recent episodes (at most 100) is compared with
    the best mean seen so far. When it improves, the model is saved to
    ``<log_dir>/best_model`` (in practice, ``EvalCallback`` is the recommended
    alternative).

    :param check_freq: Number of callback calls between two checks.
    :param log_dir: Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    """
    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super().__init__(verbose)
        self.log_dir = log_dir
        self.check_freq = check_freq
        # Start below any real reward so the first measured mean always wins.
        self.best_mean_reward = -np.inf
        self.save_path = os.path.join(log_dir, "best_model")

    def _init_callback(self) -> None:
        """Create the folder at ``save_path`` if needed."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Check the training reward every ``check_freq`` calls; always returns True."""
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve the training reward recorded so far
        timesteps, episode_rewards = ts2xy(load_results(self.log_dir), "timesteps")
        if len(timesteps) == 0:
            # Nothing has been recorded yet, so there is nothing to compare.
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(episode_rewards[-100:])
        if self.verbose >= 1:
            print(f"Num timesteps: {self.num_timesteps}")
            print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

        # New best model: remember the score and persist the agent
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose >= 1:
                print(f"Saving new best model to {self.save_path}")
            self.model.save(self.save_path)

        return True

In [39]:
# Check the training reward every 1000 callback calls and keep the best model under LOG_PATH.
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=LOG_PATH)

1. The learning rate controls how fast we want our model to learn: if the learning rate is too high the model will not learn anything, if it is too low it will take too long to learn. Here it is set to 2e-4 (the Stable-Baselines3 default for PPO is 3e-4).
2. `n_steps` is the number of steps that the agent will take in the environment before updating the weights of the model. We use 512 in our case (the Stable-Baselines3 default for PPO is 2048).

In [40]:
lr = 2e-4  # learning rate handed to PPO below
n_steps = 512  # number of environment steps collected before each PPO update

The agent will be trained using the Proximal Policy Optimization (PPO) algorithm.

In [41]:
# Build the PPO agent with a CNN policy on the preprocessed environment.
#TODO change to CnnPolicylstm policy
model = PPO(
    "CnnPolicy",
    env,
    learning_rate=lr,
    n_steps=n_steps,
    tensorboard_log=LOG_PATH,
    verbose=1,
)


Using cuda device
Wrapping the env in a VecTransposeImage.


In [42]:
# # load the agent
# model = PPO.load(CHECKPOINT_PATH + "/model_70000")

In [43]:
# Train the agent for five million timesteps; the callback keeps the best model on disk.
model.learn(callback=callback, total_timesteps=5_000_000)

Logging to ./logs\PPO_21
----------------------------
| time/              |     |
|    fps             | 255 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 512 |
----------------------------


IndexError: string index out of range

# Testing the agent

In [None]:
# load the agent from a saved checkpoint
# NOTE(review): no cell visible in this notebook writes to CHECKPOINT_PATH (training above
# uses SaveOnBestTrainingRewardCallback with LOG_PATH) — confirm this checkpoint exists.
model = PPO.load(f"{CHECKPOINT_PATH}/model_1000000")

In [None]:
# Watch the trained agent play.
# The loop has no natural end, so stop it with "Interrupt kernel": the interrupt is
# caught for a clean exit, and the finally-clause releases the environment
# (emulator / render window) instead of leaking it.
state = env.reset()
try:
    while True:
        action, _ = model.predict(state)
        state, reward, done, info = env.step(action)
        env.render()
except KeyboardInterrupt:
    # Manual stop requested from the notebook; nothing else to do.
    pass
finally:
    env.close()

#