# Dependencies 

In [1]:
## Setup
# import environment
import gym_super_mario_bros
# import joypad
from nes_py.wrappers import JoypadSpace
# import controls
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

## Preprocessing
from gym.wrappers import GrayScaleObservation
import gym

from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from matplotlib import pyplot as plt

## Agent training
import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
# from stable_baselines.common import set_global_seeds
from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv, WarpFrame
# from stable_baselines.common.policies import MlpPolicy
# callback
import numpy as np
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor

import time 

In [2]:
# Bare expression: it is not the last statement of the cell, so nothing is displayed.
SIMPLE_MOVEMENT
# Checkpoint folder; in the visible cells it is reassigned by the "load the agent"
# cell before anything reads it.
CHECKPOINT_PATH = "./checkpoints_tutorialcpy2"
# Shared log location: handed to Monitor, to the reward callback and to PPO's tensorboard_log.
LOG_PATH = "./logs"

In [3]:
# Create the raw 'SuperMarioBros-1-1-v3' environment (no wrappers yet).
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
# Last expression of the cell: shows the un-wrapped action space (Discrete(256) in the recorded output).
env.action_space

Discrete(256)

In [4]:
# Make sure the log folder exists before wrapping: Monitor only places its
# monitor.csv *inside* LOG_PATH when that folder is already present, and the
# reward callback later reads the results back with load_results(LOG_PATH).
os.makedirs(LOG_PATH, exist_ok=True)
# Record per-episode reward/length so training progress can be read back later.
env = Monitor(env, LOG_PATH)

In [5]:
# Wrap the environment so the agent chooses from the SIMPLE_MOVEMENT action list
env = JoypadSpace(env, SIMPLE_MOVEMENT)
# Show the reduced action space (Discrete(7) in the recorded output, down from Discrete(256))
env.action_space


Discrete(7)

In [6]:
# frame skip
class SkipFrame(gym.Wrapper):
    """Wrapper that repeats every action for ``skip`` consecutive frames.

    Only the observation of the last executed frame is returned; the rewards
    of all executed frames are summed.
    """

    def __init__(self, env, skip):
        """Return only every `skip`-th frame

        :param env: environment to wrap.
        :param skip: number of frames each action is repeated for (must be >= 1).
        :raises ValueError: if ``skip`` is smaller than 1.
        """
        # With skip < 1 the loop in step() would never run and step() would
        # fail on unbound locals, so reject it up front with a clear message.
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip!r}")
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward

        :param action: action forwarded unchanged to the wrapped environment.
        :return: ``(obs, total_reward, done, info)`` where ``obs``, ``done`` and
            ``info`` come from the last executed frame.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                # Episode ended early: do not step a finished environment.
                break
        return obs, total_reward, done, info

In [7]:
# Repeat each chosen action for 4 frames and sum their rewards (see SkipFrame.step).
env = SkipFrame(env, skip=4)

In [8]:
# Grayscale + resize; the shape is printed before and after to confirm the change
print("Input shape before grayscale: ", env.observation_space.shape)
env = WarpFrame(env, width=120, height=128) # grayscale and resize to 120x128 (width x height)
print("Input shape after grayscale: ", env.observation_space.shape)


Input shape before grayscale:  (240, 256, 3)
Input shape after grayscale:  (128, 120, 1)


In [9]:
# Wrap the single environment in a vectorised env, as expected by VecFrameStack and PPO.
# NOTE(review): the lambda captures the *global name* `env`, which this very line
# rebinds. This relies on DummyVecEnv calling the factory during construction
# (confirm), and re-running the cell would wrap the already-vectorised env again.
env = DummyVecEnv([lambda: env])


In [10]:
# FrameStack: stack the 4 most recent observations along the last (channel) axis
env = VecFrameStack(env, 4, channels_order='last')

# Testing the agent

#

In [11]:
#TODO - add early stopping callback
class TrainLoggingCallback(BaseCallback):
    """
    Periodic checkpoint callback: stores the model every ``freq`` callback calls.

    :param freq: (int) Number of callback calls between two consecutive saves.
    :param path: (str) Folder that receives the ``model_<n_calls>`` checkpoints.
    :param verbose: (int) Verbosity level 0: no output 1: info 2: debug
    """
    def __init__(self, freq, path, verbose=1):
        super().__init__(verbose)
        self.path = path
        self.freq = freq

    def _init_callback(self) -> None:
        """Create the checkpoint folder when a path was supplied."""
        if self.path is None:
            return
        os.makedirs(self.path, exist_ok=True)

    def _on_step(self) -> bool:
        """Save a checkpoint on every ``freq``-th call; always lets training continue."""
        due = self.n_calls % self.freq == 0
        if due:
            checkpoint_file = os.path.join(self.path, f"model_{self.n_calls}")
            self.model.save(checkpoint_file)
        return True
    
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    From stable-baselines3 example

    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).
    In addition, a numbered checkpoint ``model_<n_calls>`` is written into
    ``<log_dir>/best_model/`` every ``save_freq`` steps.

    :param check_freq: Number of callback calls between two reward checks.
    :param log_dir: Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    :param save_freq: Number of callback calls between two unconditional
      checkpoints. ``0`` or ``None`` disables them. Defaults to the previously
      hard-coded value of 500000.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1, save_freq: int = 500000):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_freq = save_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        """Create the folder that receives the periodic checkpoints."""
        # save_path is always a string (built with os.path.join in __init__).
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Write periodic checkpoints and track the best mean training reward.

        :return: always ``True`` (this callback never requests a stop).
        """
        # Unconditional periodic checkpoint, independent of the reward.
        if self.save_freq and self.n_calls % self.save_freq == 0:
            self.model.save(os.path.join(self.save_path, f"model_{self.n_calls}"))

        if self.n_calls % self.check_freq == 0:
            # Retrieve training reward
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                # Mean training reward over the last 100 episodes
                mean_reward = np.mean(y[-100:])
                if self.verbose >= 1:
                    print(f"Num timesteps: {self.num_timesteps}")
                    print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

                # New best model: remember the score and overwrite the saved best model
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose >= 1:
                        print(f"Saving new best model to {self.save_path}")
                    # NOTE(review): save() receives the same path as the checkpoint
                    # folder; presumably SB3 appends ".zip", so the archive lands
                    # next to that folder — confirm.
                    self.model.save(self.save_path)

        return True

In [12]:
# Check the mean training reward every 1000 callback calls, reading Monitor results from LOG_PATH.
# NOTE(review): no model.learn(...) call is visible in this notebook, so this callback is
# never handed to training here. It also saves under LOG_PATH/best_model, while the
# loading cell reads from CHECKPOINT_PATH/best_model — confirm this is intended.
callback = SaveOnBestTrainingRewardCallback(1000, LOG_PATH)

In [13]:
# Fresh (untrained) PPO agent with a CNN policy on the stacked frames; tensorboard logs go to LOG_PATH.
# The "load the agent" cell replaces this object with a saved model when it is run.
model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=LOG_PATH, n_steps=512, learning_rate=1e-6, ent_coef=0.01) #TODO change to CnnPolicylstm policy


Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
# load the agent
CHECKPOINT_PATH = "checkpoints_tutorialcpy1e-5-1024-skip2"
LOG_PATH = "./logs"
# Attach the environment while loading: without `env=` the loaded model has no
# environment (model.env is None), which breaks later cells that call
# model.env.step(...) / model.env.render(...).
model = PPO.load(os.path.join(CHECKPOINT_PATH, "best_model"), env=env)

In [15]:
# Watch the agent play until the kernel is interrupted.
# The loop has no exit condition by design; catching KeyboardInterrupt lets
# "Interrupt kernel" end playback cleanly instead of leaving a traceback
# (which would also abort the remaining cells of a Run All).
state = env.reset()
try:
    while True:
        time.sleep(1/26)  # throttle playback to roughly 26 rendered steps per second
        action, _ = model.predict(state)
        # The vectorised env resets itself when an episode ends, so `done` needs no handling here.
        state, reward, done, info = env.step(action)
        env.render()
except KeyboardInterrupt:
    print("Playback stopped.")

## Make a GIF

In [43]:

import imageio as iio
import numpy as np

# Folder for the individual frames; it must exist before imwrite is called,
# otherwise the first write fails with FileNotFoundError.
os.makedirs("jpg", exist_ok=True)

# Use the same `env` handle for reset/step/render (as the playback cell does)
# so the cell also works when the model was loaded without an environment.
state = env.reset()
# Keep the initial frame too, so the GIF starts at the reset state.
images = [env.render(mode="rgb_array")]
for _step in range(350):
    action, _ = model.predict(state)
    state, _, _, _ = env.step(action)
    images.append(env.render(mode="rgb_array"))

# save all images as jpg
for index, frame in enumerate(images):
    iio.imwrite("jpg/mario" + str(index) + ".jpg", frame)

# Assemble the animation from the collected frames.
# NOTE(review): the unit of `duration` (seconds vs. milliseconds) depends on the
# installed imageio version — confirm the playback speed of the result.
iio.mimsave("mario.gif", images, duration=1/26)
