In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3 import DQN
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import VecFrameStack
import os
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_util import make_vec_env
import gym
from stable_baselines3.common.atari_wrappers import AtariWrapper
from gym import Wrapper
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage

In [2]:
# Create custom environment
class Mario(Wrapper):
    def __init__(self, env):
        super(Mario, self).__init__(env)
        self._current_score = 0

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        reward += (info['score'] - self._current_score) / 40.0
        self._current_score = info['score']
        if done:
            if info['flag_get']:
                print('We got it!!!!!')
                reward += 350.0
            else:
                reward -= 50.0
        return state, reward / 10.0, done, info

    def reset(self):
        """Reset the environment and return the initial observation."""
        return self.env.reset()

    def render(self, *args, **kwargs):
        self.env.render()

    def close(self):
        self.env.close()

In [3]:
monitor_dir = './DQN_Monitor_Logs/'
os.makedirs(monitor_dir,exist_ok=True)

In [3]:
def mario_wrapper(env):
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = AtariWrapper(env, terminal_on_life_loss=False, clip_reward=False)
    env = Mario(env)
    return env

In [5]:
env = make_vec_env('SuperMarioBros-v0', seed=3994448089, wrapper_class=mario_wrapper)

env = VecFrameStack(env,4,channels_order='last')
env = VecTransposeImage(env)

In [6]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, save_model_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = os.path.join(save_model_dir, 'best_model/')
        self.best_mean_reward = -np.inf

    # def _init_callback(self) -> None:
    def _init_callback(self):
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    # def _on_step(self) -> bool:
    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            print('self.n_calls: ',self.n_calls)
            model_path1 = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path1)

        return True

In [7]:
save_model_dir = './DQN_train_1/'
callback1 = SaveOnBestTrainingRewardCallback(10000, save_model_dir)

In [8]:
model_param_1={ 'gamma': 0.8692871366327747,'learning_rate': 6.442559213980066e-05 }

In [9]:
tensorboard_log = r'./DQN_tensorboard_log/'

model = DQN("CnnPolicy", env, verbose=1,
            tensorboard_log = tensorboard_log,**model_param_1)
model.learn(total_timesteps=8000000,callback=callback1)
model.save("mario_model_DQN")

Using cuda device
Logging to ./DQN_tensorboard_log/DQN_1


  return (self.ram[0x86] - self.ram[0x071c]) % 256


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.22e+03 |
|    ep_rew_mean      | 1.56e+03 |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 108      |
|    time_elapsed     | 29       |
|    total_timesteps  | 3206     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.97e+03 |
|    ep_rew_mean      | 1.62e+03 |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 115      |
|    time_elapsed     | 68       |
|    total_timesteps  | 7913     |
----------------------------------
self.n_calls:  10000
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.84e+03 |
|    ep_rew_mean      | 1.6e+03  |
|    exploration_rate | 0.986    |
| time/               |          |

KeyboardInterrupt: 

In [4]:
env = make_vec_env('SuperMarioBros-1-1-v0', n_envs=1, seed=3994448089, wrapper_class=mario_wrapper)
# env = make_vec_env('SuperMarioBros-1-4-v0', n_envs=1, seed=3994448089, wrapper_class=mario_wrapper)
env = VecFrameStack(env,4,channels_order='last')
env = VecTransposeImage(env)

In [5]:
model = PPO.load(r'./1_NewExperiment/1_NewExperiment/PPO/PPO_model_DF_0.99/best_model/model_1000000')


obs = env.reset()
obs=obs.copy()
done = True
while True:
    if done:
        state = env.reset()
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    obs=obs.copy()
    env.render()
    time.sleep(0.01)

  return (self.ram[0x86] - self.ram[0x071c]) % 256


We got it!!!!!


KeyboardInterrupt: 

In [6]:
env.close()