In [1]:
#Install necessary files for the game
!pip install gym_super_mario_bros==7.3.0 nes_py
!pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install stable-baselines3[extra]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym_super_mario_bros==7.3.0
  Downloading gym_super_mario_bros-7.3.0-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 5.3 MB/s 
[?25hCollecting nes_py
  Downloading nes_py-8.2.1.tar.gz (77 kB)
[K     |████████████████████████████████| 77 kB 6.0 MB/s 
Collecting pyglet<=1.5.21,>=1.4.0
  Downloading pyglet-1.5.21-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 45.9 MB/s 
Building wheels for collected packages: nes-py
  Building wheel for nes-py (setup.py) ... [?25l[?25hdone
  Created wheel for nes-py: filename=nes_py-8.2.1-cp37-cp37m-linux_x86_64.whl size=432690 sha256=f1448e4cf056b761297259234b10d473c335df90d5f49ac180ad328e4aa32fd5
  Stored in directory: /root/.cache/pip/wheels/17/96/0e/22a8c7dbdf412d8e988286f223b223baf0f4ad90c9e699c56d
Successfully built nes-py
Installing collected packages: pyglet, nes-py, gym-supe

In [2]:
#Import libraries
import io
import os 
import base64
import gym_super_mario_bros

from IPython.display import HTML
from stable_baselines3 import PPO
from matplotlib import pyplot as plt
from nes_py.wrappers import JoypadSpace
from gym.wrappers import GrayScaleObservation
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

In [3]:
#Build the training environment: restrict the controller to SIMPLE_MOVEMENT,
#convert frames to grayscale (keeping the channel axis), then vectorize and
#stack 4 consecutive frames along the last axis.
envi = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v3'), SIMPLE_MOVEMENT)
envi = GrayScaleObservation(envi, keep_dim=True)
#The lambda is invoked while DummyVecEnv is being constructed, i.e. before
#`envi` is rebound on this line, so it still returns the grayscale env.
envi = VecFrameStack(DummyVecEnv([lambda: envi]), n_stack=4, channels_order='last')

#Smoke test: reset, then take one step with action index 5 of the action set
state = envi.reset()
state, reward, done, info = envi.step([5])

In [4]:
#Callback that saves the model every `check_freq` callback steps
class TrainAndLoggingCallback(BaseCallback):
    """Periodically write the model being trained to ``save_path``.

    Args:
        check_freq: Save whenever the callback's call counter (``n_calls``)
            is a multiple of this value.
        save_path: Directory that receives the saved models. ``None``
            disables saving entirely.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the output directory (if one was given) before training."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``, so this callback never interrupts training.
        """
        # Mirror the None check in _init_callback: without it,
        # os.path.join(None, ...) would raise once a checkpoint is due.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
        return True

In [9]:
#Output locations used by the cells below
CHECKPOINT_DIR = "checkpoint"  # passed to the checkpoint callback as save_path
LOG_DIR = "logs"  # passed to PPO as tensorboard_log

In [10]:
#Checkpoint callback: saves into CHECKPOINT_DIR on every 10000th callback step
cb = TrainAndLoggingCallback(save_path=CHECKPOINT_DIR, check_freq=10000)

In [11]:
#Create the PPO agent with a CNN policy on the stacked-frame environment
model = PPO(
    policy='CnnPolicy',
    env=envi,
    learning_rate=1e-6,
    n_steps=512,
    tensorboard_log=LOG_DIR,
    verbose=1,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [12]:
#Train the agent; `cb` saves checkpoints along the way
model.learn(callback=cb, total_timesteps=50_000)

Logging to logs/PPO_1


  return (self.ram[0x86] - self.ram[0x071c]) % 256


----------------------------
| time/              |     |
|    fps             | 20  |
|    iterations      | 1   |
|    time_elapsed    | 25  |
|    total_timesteps | 512 |
----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 6             |
|    iterations           | 2             |
|    time_elapsed         | 150           |
|    total_timesteps      | 1024          |
| train/                  |               |
|    approx_kl            | 1.3627927e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.95         |
|    explained_variance   | 0.000619      |
|    learning_rate        | 1e-06         |
|    loss                 | 111           |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.000111     |
|    value_loss           | 304           |
-------------------------------------------
-----

<stable_baselines3.ppo.ppo.PPO at 0x7fed11973710>

In [13]:
#Persist the trained model under a fixed file name
model.save(path='Cuadra_Mario_Model')

In [14]:
#Reset the environment and keep the value returned by reset() in `state`
state = envi.reset()

In [15]:
#Run up to 4000 randomly sampled actions in a Monitor-wrapped environment that
#writes its results to ./gym-results (the next cell reads the video from there).
#NOTE(review): this rebinds `envi` to a fresh 'SuperMarioBros2-v0' environment and
#samples actions at random; the trained `model` is not used here. Confirm this is intended.
from gym import wrappers
envi = gym_super_mario_bros.make('SuperMarioBros2-v0')
envi = wrappers.Monitor(envi, "./gym-results", force=True)
try:
    envi.reset()
    for _ in range(4000):
        action = envi.action_space.sample()
        state, reward, done, info = envi.step(action)
        if done:
            break
finally:
    #Always close the environment, even if reset()/step() raised
    envi.close()

  return (self.ram[0x86] - self.ram[0x071c]) % 256


In [18]:
#Embed the recorded episode in the notebook as a base64-encoded HTML5 video
video_path = './gym-results/openaigym.video.%s.video000000.mp4' % envi.file_infix
#Read-only binary mode inside a context manager, so the file handle is closed promptly
with io.open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
#Keep the HTML object as the last expression so the notebook renders it
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))