### 1 - SETUP
Install dependencies and import all necessary modules

In [None]:
#install dependencies
#specify version because latest version returns extra unwanted output
!pip install gym-super-mario-bros==7.4.0 gym==0.25.2 nes_py
!pip install tensordict torchrl
!apt-get install freeglut3 freeglut3-dev mesa-common-dev
!pip install gym pyvirtualdisplay
!apt-get install -y xvfb x11-utils
!pip install stable-baselines3
!pip install 'shimmy>=0.2.1'

#ref https://github.com/yfeng997/MadMario

In [59]:
'''Import all modules'''

#Import modules and methods to view and interact with the game
import gym
import gym_super_mario_bros # Gym is an OpenAI toolkit for RL
from gym.spaces import Box
from gym.wrappers import FrameStack, RecordEpisodeStatistics, RecordVideo, GrayScaleObservation, ResizeObservation
from nes_py.wrappers import JoypadSpace # NES Emulator for OpenAI Gym
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT # simplified controls
from pyvirtualdisplay import Display #To display the game

#Import stablebaselines, a module with superior performance to openai
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3 import PPO
# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback

#Import machine learning and neural network modules
import torch
from torch import nn
from torchvision import transforms as T
import numpy as np
from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage

#Import other miscellaneous modules
from PIL import Image
import random, datetime, os, copy
from pathlib import Path
from collections import deque
import os
import time, datetime
import matplotlib.pyplot as plt

#Eliminate warning messages
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

#### 1.1 - DEFINITIONS:

**Agent:** The entity that observes the environment, takes actions in it and learns from the resulting rewards.

**Environment:** The world that an agent interacts with and learns from.

**Action** $a$: How the Agent responds within the Environment. The
set of all possible Actions is called *action-space*.

**State** $s$ : The agent's current position within Environment. The
set of all possible States the Environment can be in is called
*state-space (S)* where: $$s \in S$$

**Reward** $r$ : Reward is the feedback that the Agent receives from the Environment resulting from a particular action. An aggregation of rewards over multiple time steps is called
**Return**, which is often discounted and denoted by: $$G_t = R_{t+1} + \gamma R_{t+2} + \gamma ^ 2 R_{t+3} + ... $$

**Policy ($\pi$):** The 'ruleset' followed by the Agent in its quest to find the optimal reward.

**Optimal Action-Value function** $Q^*(s,a)$ : Gives the expected
return if you start in state $s$, take an arbitrary action
$a$, and then for each future time step take the action that
maximizes returns under the optimal policy ($\pi^*$). $Q$ can be said to stand for the “quality” of
the action in a state. We try to approximate this function.

#### 1.2 - SETUP THE ENVIRONMENT
In Mario, the environment consists of interactable tubes, mushrooms, coins and enemies.

When the agent (Mario) takes an action, the environment returns the changed (next) state, the reward, a boolean game status (done or not done), a truncation flag and an info dictionary.


In [53]:
# Setup game, using downsampled standard (v1)
# from https://pypi.org/project/gym-super-mario-bros/

# Initialize Super Mario environment (in v0.26 change render mode to 'human' to see results on the screen).
# The version is compared numerically: a plain string comparison is lexicographic,
# so e.g. '0.9' < '0.26' would evaluate to False.
gym_major_minor = tuple(int(part) for part in gym.__version__.split(".")[:2])
if gym_major_minor < (0, 26):
    env = gym_super_mario_bros.make("SuperMarioBros-v1", new_step_api=True)
else:
    env = gym_super_mario_bros.make("SuperMarioBros-v1", render_mode='rgb', apply_api_compatibility=True)

# Limit the action-space to
#   0. walk right
#   1. jump right
#   legal moves: https://github.com/Kautenja/gym-super-mario-bros/blob/master/gym_super_mario_bros/actions.py

LIMITED_ACTIONS = [["right"], ["right", "A"]]
env = JoypadSpace(env, LIMITED_ACTIONS)

# Take a single "walk right" step and inspect what the environment reports back
env.reset()
next_state, reward, done, trunc, info = env.step(action=0)
print(f"Current reward: {reward}\n Is the process done? {done}, because Time: {info['time']}\nand Flag reached? ... is {info['flag_get']} \n Coins: {info['coins']}\n Score: {info['score']}\n")
print(f"Current position in (x,y) coordinates: ({info['x_pos']}, {info['y_pos']})")
print(f"Currently in game stage {info['stage']}")

Current reward: 0.0
 Is the process done? False, because Time: 400
and Flag reached? ... is False 
 Coins: 0
 Score: 0

Current position in (x,y) coordinates: (40, 79)
Currently in game stage 1


In [54]:
# Report the dimensions of the raw observation, read from env.observation_space.shape
print(f'The game is played on a {env.observation_space.shape[0]} by {env.observation_space.shape[1]} pixel space, with {env.observation_space.shape[2]} color maps - RGB')

The game is played on a 240 by 256 pixel space, with 3 color maps - RGB


#### 1.3 - SHOW ACTION & OBSERVATION SPACE:

In [55]:
# Show the restricted action set (LIMITED_ACTIONS) that the agent can choose from
print(f'There are {len(LIMITED_ACTIONS)} possible actions, they are:')
print(*LIMITED_ACTIONS)

There are 2 possible actions, they are:
['right'] ['right', 'A']


### 2 - PREPROCESS THE ENVIRONMENT

Environment data is returned to the agent in ``next_state``. As you saw
above, each state is represented by a ``[240, 256, 3]`` size array (height, width, RGB channels).
That is more information than our agent needs; for instance,
Mario’s actions do not depend on the color of the pipes or the sky!

We use **Wrappers** to preprocess environment data before sending it to
the agent.

``GrayScaleObservation`` is a common wrapper to transform an RGB image
to grayscale; doing so reduces the size of the state representation
without losing useful information. Now the size of each state:
``[240, 256, 1]``

``ResizeObservation`` downsamples each observation into a square image.
New size: ``[84, 84, 1]``

``SkipFrame`` is a custom wrapper that inherits from ``gym.Wrapper`` and
implements the ``step()`` function. Because consecutive frames don’t
vary much, we can skip n-intermediate frames without losing much
information. The n-th frame aggregates rewards accumulated over each
skipped frame.

``FrameStack`` is a wrapper that allows us to condense consecutive frames
of the environment into a single observation point to feed to our
learning model. This way, we can identify if Mario was landing or
jumping based on the direction of his movement given several of the previous
frames.




In [56]:
# Collapse each RGB frame to grayscale (keep_dim=True retains the channel axis)
env = GrayScaleObservation(env, keep_dim=True)

# Downsample each frame to 84x84 to reduce computation
env = ResizeObservation(env, shape=(84, 84))

# Skip frames to reduce the number of observations the agent has to process
class SkipFrame(gym.Wrapper):
    """Repeat every action for `skip` consecutive frames and return only the last frame."""

    def __init__(self, env, skip):
        """
        Inputs:
        env: the environment to wrap
        skip (``int``): number of frames each action is repeated for, must be >= 1

        Raises:
        ValueError: if ``skip`` is smaller than 1 (``step`` would have nothing to return)
        """
        if skip < 1:
            raise ValueError(f"skip must be at least 1, got {skip}")
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """
        Repeat ``action`` for up to ``skip`` frames and sum the rewards.

        Outputs:
        The observation, done flag, truncation flag and info of the last frame that was
        stepped, together with the reward accumulated over all stepped frames.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, trunc, info = self.env.step(action)
            total_reward += reward
            # Stop as soon as the episode is over, whether it terminated or was truncated
            if done or trunc:
                break
        return obs, total_reward, done, trunc, info

# Repeat every chosen action for 4 frames
env = SkipFrame(env, skip=4)

# Wrap the single environment in a DummyVecEnv (the vectorised interface used for training below)
env = DummyVecEnv([lambda: env])

# Stack the 4 most recent frames along the last (channel) axis
env = VecFrameStack(env, 4, channels_order='last')

After applying the above wrappers to the environment, the final wrapped
state consists of 4 gray-scaled consecutive frames stacked together, as
plotted by the cell below. Each time Mario makes an action,
the environment responds with a state of this structure. The structure
is represented by an array of size ``[1, 84, 84, 4]``: one environment, 84 x 84 pixels, and 4 stacked frames along the last axis.

In [None]:
# Take one step and plot each of the stacked frames of the resulting observation
state = env.reset()
# Action 0 is ["right"]; LIMITED_ACTIONS only defines the indices 0 and 1
state, reward, done, info = env.step([0])
plt.figure(figsize=(20,16))
num_frames = state.shape[3]  # frames are stacked along the last axis
for idx in range(num_frames):
    plt.subplot(1, num_frames, idx+1)
    plt.imshow(state[0][:,:,idx])
plt.show()

## 3 - CONFIGURE THE AGENT

We create a class ``Mario`` to represent our agent in the game who:

(a) **Acts** according to the optimal action policy based on the current
   state (of the environment).

(b) **Remembers** experiences. Experience = (current state, current
   action, reward, next state). Mario *caches* and later *recalls* his
   experiences to update his action policy.

(c)  **Learns** a better action policy over time

#### 3.1 - CREATE THE MARIO AGENT CLASS

##### (a) Act Function

For any given state, an agent can choose the most optimal action
(**exploit**) or a random action (**explore**).

Mario randomly explores with a chance of ``self.exploration_rate``; when
he chooses to exploit, he relies on ``MarioNet`` (implemented in
``Learn`` section) to provide the most optimal action.

##### (b) Cache & Recall Function

``cache()``: Each time Mario performs an action, he stores the
``experience`` to his memory. His experience includes the current
*state*, *action* performed, *reward* from the action, the *next state*,
and whether the game is *done*.

``recall()``: Mario randomly samples a batch of experiences from his
memory, and uses that to learn the game.

In [62]:
class Mario:
    """
    Double-DQN agent for Super Mario Bros.

    The agent chooses epsilon-greedy actions (``act``), stores transitions in a replay
    buffer (``cache`` / ``recall``) and trains the online network of ``MarioNet`` against
    TD targets computed with the target network (``learn``).
    """

    def __init__(self, state_dim, action_dim, save_dir, checkpoint=None, epsilon = 1, e_rate_decay = 9.9995e-1,\
                 e_rate_min = 0.1, batch_size = 32, gamma = 0.9, learning_rate = 0.00025, optimiser = 'Adam', loss = 'L1'):
        """
        Inputs:
        state_dim: shape of a single state, passed on to ``MarioNet``
        action_dim (``int``): number of discrete actions
        save_dir (``Path``): directory that checkpoints are written to
        checkpoint (``Path``, optional): checkpoint to restore the network and exploration rate from
        epsilon (``float``): initial exploration rate
        e_rate_decay (``float``): factor the exploration rate is multiplied by on every ``act`` call
        e_rate_min (``float``): lower bound of the exploration rate
        batch_size (``int``): number of experiences sampled from memory per update
        gamma (``float``): discount factor used in the TD target
        learning_rate (``float``): learning rate of the optimiser
        optimiser (``str``): only 'Adam' is supported
        loss (``str``): only 'L1' (``torch.nn.SmoothL1Loss``) is supported

        Raises:
        ValueError: if ``optimiser`` or ``loss`` is not supported, or the checkpoint does not exist
        """
        self.curr_step = 0

        self.burnin = 1e4  # min. experiences before training
        self.learn_every = 3  # no. of experiences between updates to Q_online
        self.sync_every = 1e4  # no. of experiences between Q_target & Q_online sync
        self.save_every = 5e5  # no. of experiences between saving MarioNet

        # STATE SPACE
        self.state_dim = state_dim

        # ACTION SPACE
        self.action_dim = action_dim

        self.save_dir = save_dir

        # REFERENCE TO THE CNN ARCHITECTURE
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.net = MarioNet(self.state_dim, self.action_dim).float()
        self.net = self.net.to(device=self.device)

        # TO STORE PREVIOUS STATES IN A REPLAY BUFFER DICTIONARY
        self.memory = TensorDictReplayBuffer(storage=LazyMemmapStorage(100000, device=torch.device("cpu")))

        # HYPERPARAMETERS TO TUNE
        self.batch_size = batch_size  # NUMBER OF PREVIOUS STATES TO RECALL FROM MEMORY
        self.gamma = gamma  # FOR TD PREDICTION

        self.exploration_rate = epsilon
        self.exploration_rate_decay = e_rate_decay
        self.exploration_rate_min = e_rate_min

        # Fail early on unsupported options instead of leaving the attribute undefined
        if optimiser == 'Adam':
            self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
        else:
            raise ValueError(f"Unsupported optimiser: {optimiser!r} (expected 'Adam')")
        if loss == 'L1':
            self.loss_fn = torch.nn.SmoothL1Loss()
        else:
            raise ValueError(f"Unsupported loss: {loss!r} (expected 'L1')")

        # Restore last: load() needs self.net and self.device, and the restored
        # exploration rate must not be overwritten by `epsilon`
        if checkpoint:
            self.load(checkpoint)

    def act(self, state):
        """
        Given a state, choose an epsilon-greedy action and update value of step.

        Inputs:
        state(``LazyFrame``): A single observation of the current state, dimension is (state_dim)
        Outputs:
        ``action_idx`` (``int``): An integer representing which action Mario will perform
        """
        # EXPLORE
        if np.random.rand() < self.exploration_rate:
            # SELECT RANDOM ACTION
            action_idx = np.random.randint(self.action_dim)

        # EXPLOIT
        else:
            state = state[0].__array__() if isinstance(state, tuple) else state.__array__()
            # The network holds float32 weights, so cast the frames before the forward pass
            state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
            # No gradients are needed to pick an action
            with torch.no_grad():
                action_values = self.net(state, model="online")
            action_idx = torch.argmax(action_values, axis=1).item()

        # decrease exploration_rate
        self.exploration_rate *= self.exploration_rate_decay
        self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)

        # increment step
        self.curr_step += 1
        return action_idx

    def cache(self, state, next_state, action, reward, done):
        """
        Store the experience to self.memory (replay buffer)

        Inputs:
        state (``LazyFrame``),
        next_state (``LazyFrame``),
        action (``int``),
        reward (``float``),
        done(``bool``))
        """
        def first_if_tuple(x):
            return x[0] if isinstance(x, tuple) else x
        state = first_if_tuple(state).__array__()
        next_state = first_if_tuple(next_state).__array__()

        state = torch.tensor(state)
        next_state = torch.tensor(next_state)
        action = torch.tensor([action])
        reward = torch.tensor([reward])
        done = torch.tensor([done])

        self.memory.add(TensorDict({"state": state, "next_state": next_state, "action": action, "reward": reward, "done": done}, batch_size=[]))

    def recall(self):
        """
        Retrieve a batch of experiences from memory

        Outputs:
        ``state``, ``next_state``, ``action``, ``reward``, ``done`` tensors on ``self.device``;
        the states are returned as float32 so they can be fed to the network directly
        """
        batch = self.memory.sample(self.batch_size).to(self.device)
        state, next_state, action, reward, done = (batch.get(key) for key in ("state", "next_state", "action", "reward", "done"))
        return state.float(), next_state.float(), action.squeeze(), reward.squeeze(), done.squeeze()

    def td_estimate(self, state, action):
        """Return Q_online(s, a) for every (state, action) pair of the batch."""
        current_Q = self.net(state, model="online")[
            np.arange(0, self.batch_size), action
        ]  # Q_online(s,a)
        return current_Q

    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        """
        Return the Double-DQN target: the next action is chosen by the online network
        and evaluated by the target network; finished episodes contribute the reward only.
        """
        next_state_Q = self.net(next_state, model="online")
        best_action = torch.argmax(next_state_Q, axis=1)
        next_Q = self.net(next_state, model="target")[
            np.arange(0, self.batch_size), best_action
        ]
        return (reward + (1 - done.float()) * self.gamma * next_Q).float()

    def update_Q_online(self, td_estimate, td_target):
        """Take one optimiser step on the loss between estimate and target and return the loss value."""
        loss = self.loss_fn(td_estimate, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def sync_Q_target(self):
        """Copy the weights of the online network into the target network."""
        self.net.target.load_state_dict(self.net.online.state_dict())

    # SAVE AND LOAD CHECKPOINTS
    def save(self):
        """Write the network weights and the current exploration rate to ``self.save_dir``."""
        save_path = (
            self.save_dir / f"mario_net_{int(self.curr_step // self.save_every)}.chkpt"
        )
        torch.save(
            dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate),
            save_path,
        )
        print(f"MarioNet saved to {save_path} at step {self.curr_step}")

    def load(self, load_path):
        """
        Restore the network weights and the exploration rate from a checkpoint.

        Raises:
        ValueError: if ``load_path`` does not exist
        """
        if not load_path.exists():
            raise ValueError(f"{load_path} does not exist")

        ckp = torch.load(load_path, map_location=(self.device))
        exploration_rate = ckp.get('exploration_rate')
        state_dict = ckp.get('model')

        print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
        self.net.load_state_dict(state_dict)
        self.exploration_rate = exploration_rate

    def learn(self):
        """
        Sync / save on schedule and, once past burn-in, run one update every ``learn_every`` steps.

        Outputs:
        ``(mean TD estimate, loss)`` when an update was performed, otherwise ``(None, None)``
        """
        if self.curr_step % self.sync_every == 0:
            self.sync_Q_target()

        if self.curr_step % self.save_every == 0:
            self.save()

        if self.curr_step < self.burnin:
            return None, None

        if self.curr_step % self.learn_every != 0:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.recall()

        # Get TD Estimate
        td_est = self.td_estimate(state, action)

        # Get TD Target
        td_tgt = self.td_target(reward, next_state, done)

        # Backpropagate loss through Q_online
        loss = self.update_Q_online(td_est, td_tgt)

        return (td_est.mean().item(), loss)

#### 3.2 - INITIALISE THE MARIO AGENT

In [None]:
# The agent is not built here: Mario needs state_dim, action_dim and save_dir, and the
# MarioNet class it instantiates is only defined in section 3.3 below.
# The agent is created in section 4.2, right before the DDQN training loop.

### 3.3 - CREATE MARIONET, THE CNN ARCHITECTURE

Mario uses the [DDQN algorithm](https://arxiv.org/pdf/1509.06461)
under the hood. DDQN uses two ConvNets - $Q_{online}$ and
$Q_{target}$ - that independently approximate the optimal
action-value function.

In our implementation, $Q_{target}$ starts out as a deep copy of the whole
$Q_{online}$ network (convolutional layers and FC head alike), so the two
networks do not share any layers. $\theta_{target}$ (the parameters of
$Q_{target}$) is frozen to prevent updating by backprop. Instead,
it is periodically synced with $\theta_{online}$ (more on this
later).

In [51]:
class MarioNet(nn.Module):
    """mini CNN structure
    input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output

    Holds two networks with identical architecture: ``online`` (trained) and
    ``target`` (a frozen deep copy of ``online``).
    """

    def __init__(self, input_dim, output_dim):
        """
        Inputs:
        input_dim: ``(channels, height, width)`` of a single state; height and width must be 84
        output_dim (``int``): number of actions, i.e. size of the output layer

        Raises:
        ValueError: if the height or the width is not 84
        """
        super().__init__()
        c, h, w = input_dim

        if h != 84:
            raise ValueError(f"Expecting input height: 84, got: {h}")
        if w != 84:
            raise ValueError(f"Expecting input width: 84, got: {w}")

        self.online = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            # 3136 = 64 channels x 7 x 7, the conv output for an 84 x 84 input
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
        )

        self.target = copy.deepcopy(self.online)

        # Q_target parameters are frozen.
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, input, model):
        """
        Run ``input`` through the selected network.

        Inputs:
        input: batch of states
        model (``str``): "online" or "target"

        Raises:
        ValueError: if ``model`` is neither "online" nor "target"
        """
        if model == "online":
            return self.online(input)
        elif model == "target":
            return self.target(input)
        # Fail loudly instead of silently returning None for a mistyped network name
        raise ValueError(f"Unknown model: {model!r} (expected 'online' or 'target')")

### 3.4 - REVIEW OF VALUE METHODS OF REINFORCEMENT LEARNING

#### TD Estimate & TD Target

Two values are involved in learning:

**TD Estimate** - the predicted optimal $Q^*$ for a given state
$s$

\begin{align}{TD}_e = Q_{online}^*(s,a)\end{align}

**TD Target** - aggregation of current reward and the estimated
$Q^*$ in the next state $s'$

\begin{align}a' = argmax_{a} Q_{online}(s', a)\end{align}

\begin{align}{TD}_t = r + \gamma Q_{target}^*(s',a')\end{align}

Because we don’t know what the next action $a'$ will be, we use the
action $a'$ that maximizes $Q_{online}$ in the next state
$s'$.



#### Updating the model

As Mario samples inputs from his replay buffer, we compute $TD_t$
and $TD_e$ and backpropagate this loss down $Q_{online}$ to
update its parameters $\theta_{online}$ ($\alpha$ is the
learning rate ``lr`` passed to the ``optimizer``)

\begin{align}\theta_{online} \leftarrow \theta_{online} + \alpha \nabla(TD_e - TD_t)\end{align}

$\theta_{target}$ does not update through backpropagation.
Instead, we periodically copy $\theta_{online}$ to
$\theta_{target}$

\begin{align}\theta_{target} \leftarrow \theta_{online}\end{align}


### 3.5 - LOG PROGRESS

In [None]:
class ModelSaver(BaseCallback):
    """Callback that saves the model being trained every ``check_freq`` calls."""

    def __init__(self, check_freq, save_path, verbose=1):
        """
        Inputs:
        check_freq (``int``): number of callback calls between two saves
        save_path (``str``): directory the models are saved into
        verbose (``int``): verbosity level passed on to ``BaseCallback``
        """
        # Zero-argument super() does not depend on the class name
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the save directory if one was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save the model every ``check_freq`` calls; always returns True."""
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True


class MetricLogger:
    """
    Collect per-step and per-episode training metrics, append them to a log file
    and plot their moving averages.
    """

    def __init__(self, save_dir):
        """
        Inputs:
        save_dir (``Path``): directory that receives the log file and the plot images
        """
        self.save_log = save_dir / "log"
        # Start a fresh log with the column header
        with open(self.save_log, "w") as f:
            f.write(
                f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}"
                f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}"
                f"{'TimeDelta':>15}{'Time':>20}\n"
            )
        self.ep_rewards_plot = save_dir / "reward_plot.jpg"
        self.ep_lengths_plot = save_dir / "length_plot.jpg"
        self.ep_avg_losses_plot = save_dir / "loss_plot.jpg"
        self.ep_avg_qs_plot = save_dir / "q_plot.jpg"

        # History metrics
        self.ep_rewards = []
        self.ep_lengths = []
        self.ep_avg_losses = []
        self.ep_avg_qs = []

        # Moving averages, added for every call to record()
        self.moving_avg_ep_rewards = []
        self.moving_avg_ep_lengths = []
        self.moving_avg_ep_avg_losses = []
        self.moving_avg_ep_avg_qs = []

        # Current episode metric
        self.init_episode()

        # Timing
        self.record_time = time.time()

    def log_step(self, reward, loss, q):
        """
        Add one step to the running episode totals.

        Inputs:
        reward (``float``): reward of the step
        loss (``float`` or ``None``): loss of the update, ``None`` when no update ran on this step
        q (``float`` or ``None``): mean Q value of the update, ``None`` when no update ran
        """
        self.curr_ep_reward += reward
        self.curr_ep_length += 1
        # Test against None: a loss of exactly 0.0 is still a real update and must be counted
        if loss is not None:
            self.curr_ep_loss += loss
            self.curr_ep_q += q
            self.curr_ep_loss_length += 1

    def log_episode(self):
        "Mark end of episode"
        self.ep_rewards.append(self.curr_ep_reward)
        self.ep_lengths.append(self.curr_ep_length)
        if self.curr_ep_loss_length == 0:
            # No update ran during this episode
            ep_avg_loss = 0
            ep_avg_q = 0
        else:
            ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
            ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
        self.ep_avg_losses.append(ep_avg_loss)
        self.ep_avg_qs.append(ep_avg_q)

        self.init_episode()

    def init_episode(self):
        """Reset the running totals of the current episode."""
        self.curr_ep_reward = 0.0
        self.curr_ep_length = 0
        self.curr_ep_loss = 0.0
        self.curr_ep_q = 0.0
        self.curr_ep_loss_length = 0

    def record(self, episode, epsilon, step):
        """
        Average the last 100 episodes, print and log the result, and redraw the plots.

        Inputs:
        episode (``int``): index of the current episode
        epsilon (``float``): current exploration rate
        step (``int``): current step count of the agent
        """
        mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)
        mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)
        mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)
        mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)
        self.moving_avg_ep_rewards.append(mean_ep_reward)
        self.moving_avg_ep_lengths.append(mean_ep_length)
        self.moving_avg_ep_avg_losses.append(mean_ep_loss)
        self.moving_avg_ep_avg_qs.append(mean_ep_q)

        last_record_time = self.record_time
        self.record_time = time.time()
        time_since_last_record = np.round(self.record_time - last_record_time, 3)

        print(
            f"Episode {episode} - "
            f"Step {step} - "
            f"Epsilon {epsilon} - "
            f"Mean Reward {mean_ep_reward} - "
            f"Mean Length {mean_ep_length} - "
            f"Mean Loss {mean_ep_loss} - "
            f"Mean Q Value {mean_ep_q} - "
            f"Time Delta {time_since_last_record} - "
            f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
        )

        with open(self.save_log, "a") as f:
            f.write(
                f"{episode:8d}{step:8d}{epsilon:10.3f}"
                f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}"
                f"{time_since_last_record:15.3f}"
                f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n"
            )

        # One plot per metric; the attribute names are derived from the metric name
        for metric in ["ep_lengths", "ep_avg_losses", "ep_avg_qs", "ep_rewards"]:
            plt.clf()
            plt.plot(getattr(self, f"moving_avg_{metric}"), label=f"moving_avg_{metric}")
            plt.legend()
            plt.savefig(getattr(self, f"{metric}_plot"))

In [None]:
# Directories for the TensorBoard logs and the periodic model checkpoints
LOG_DIR = './logs/'
CHECKPOINT_DIR = './train/'
# Model-saving callback: writes a checkpoint every 10000 callback calls
callback = ModelSaver(save_path=CHECKPOINT_DIR, check_freq=10000)

## 4 - TRAIN THE MODEL



#### 4.1 - Using PPO: A Policy Gradient Method

In [None]:
ppo_learning_rate = 0.00001
ppo_num_steps = 512
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate= ppo_learning_rate,
            n_steps=ppo_num_steps)

# Train the AI model, this is where the AI model starts to learn
model.learn(total_timesteps=1000000, callback=callback)

model.save('thisisatestmodel')

# Load model
model = PPO.load('./train/best_model_1000000')

# Start the game
state = env.reset()
# Play until the first episode ends
while True:

    action, _ = model.predict(state)
    state, reward, done, info = env.step(action)
    env.render()
    # `done` holds one flag per sub-environment. The vectorised env resets itself after an
    # episode, so without this check the loop would never end and block the cells below.
    if done.any():
        break

#### 4.2 - Using Double Deep Q Learning (DDQL)

In [None]:


use_cuda = torch.cuda.is_available()
print(f"Using CUDA: {use_cuda}")
print()

save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True)

mario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir,\
              checkpoint = None, epsilon = 1, e_rate_decay = 9.9995e-1,\
                 e_rate_min = 0.1, batch_size = 32, gamma = 0.9,\
               learning_rate = 0.00025, optimiser = 'Adam', loss = 'L1')

logger = MetricLogger(save_dir)


def to_agent_state(vec_obs):
    """Convert a (1, H, W, C) observation batch of the vectorised env into a (C, H, W) array."""
    # Drop the batch axis, move the stacked frames to the front, and copy so the
    # cached state does not alias the environment's buffer
    return np.ascontiguousarray(np.transpose(vec_obs[0], (2, 0, 1)))


episodes = 40000
for e in range(episodes):

    state = to_agent_state(env.reset())

    while True:

        # Run agent on the state
        action = mario.act(state)

        # Agent performs action. `env` is the vectorised, frame-stacked environment:
        # it takes one action per sub-environment and returns four batched values.
        obs, rewards, dones, infos = env.step([action])
        next_state = to_agent_state(obs)
        reward, done, info = float(rewards[0]), bool(dones[0]), infos[0]

        # Remember
        mario.cache(state, next_state, action, reward, done)

        # Learn
        q, loss = mario.learn()

        # Logging
        logger.log_step(reward, loss, q)

        # Update state
        state = next_state

        # Check if end of game
        if done or info["flag_get"]:
            break

    logger.log_episode()

    if e % 20 == 0:
        logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step)

#EXTRA TO IMPLEMENT LATER???

In [None]:
# NOTE(review): this cell appears to be unfinished scratch work (see the heading above) — confirm before running.
# NOTE(review): `env` is presumably still the vectorised, frame-stacked environment from section 2;
# confirm that the gym wrappers used below accept it.
# Wrap the environment with RecordEpisodeStatistics
env = RecordEpisodeStatistics(env)

# Training loop
num_episodes = 1000
for episode in range(num_episodes):
    observation = env.reset()
    done = False
    total_reward = 0

    while not done:
        # NOTE(review): `agent` is not defined anywhere in the visible notebook — TODO supply the agent
        action = agent.select_action(observation)
        observation, reward, done, _ = env.step(action)
        total_reward += reward

    print(f"Episode {episode + 1}: Total Reward = {total_reward}")



# Wrap the environment with the RecordVideo wrapper
# NOTE(review): gym's RecordVideo appears to take `video_folder`, not `video_path` — confirm against the gym docs
env = RecordVideo(env, video_path='/path/to/output/video.mp4')

# Run the agent in the environment
for _ in range(1000):
    action = env.action_space.sample()  # Replace with your agent's action
    observation, reward, done, info = env.step(action)
    if done:
        break

# Close the environment, which finalizes and saves the video
env.close()


In [None]:
# if gym.__version__ < '0.26':
#     env = gym_super_mario_bros.make("SuperMarioBros-1-1-v1", new_step_api=True)
# else:
#     env = gym_super_mario_bros.make("SuperMarioBros-1-1-v1", render_mode='rgb', apply_api_compatibility=True)

# env = JoypadSpace(
#     env,
#     [['right'],
#     ['right', 'A']]
# )

# env = SkipFrame(env, skip=4)
# env = GrayScaleObservation(env)
# env = ResizeObservation(env, shape=84)
# env = FrameStack(env, num_stack=4)

# env.reset()

# save_dir = Path('checkpoints') / datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
# save_dir.mkdir(parents=True)

# checkpoint = Path('checkpoints/2023-11-01T14-58-44/mario_net_0.chkpt')
# mario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir, checkpoint=checkpoint)
# mario.exploration_rate = mario.exploration_rate_min

# logger = MetricLogger(save_dir)

# episodes = 100

# for e in range(episodes):

#     state = env.reset()

#     while True:

#         env.render()

#         action = mario.act(state)

#         next_state, reward, done, info = env.step(action)

#         mario.cache(state, next_state, action, reward, done)

#         logger.log_step(reward, None, None)

#         state = next_state

#         if done or info['flag_get']:
#             break

#     logger.log_episode()

#     if e % 20 == 0:
#         logger.record(
#             episode=e,
#             epsilon=mario.exploration_rate,
#             step=mario.curr_step
#         )

In [None]:
# manual preprocessing



# MANUAL APPLICATIONS
# class FrameStack:
#     def __init__(self, env, num_stack):
#         self.env = env
#         self.num_stack = num_stack
#         self.frames = deque(maxlen=num_stack)

#     def reset(self):
#         obs = self.env.reset()
#         for _ in range(self.num_stack):
#             self.frames.append(obs)
#         return np.stack(self.frames, axis=0)

#     def step(self, action):
#         obs, reward, done, info = self.env.step(action)
#         self.frames.append(obs)
#         return np.stack(self.frames, axis=0), reward, done, info

# class GrayScaleObservation(gym.ObservationWrapper):
#     def __init__(self, env):
#         super().__init__(env)
#         obs_shape = self.observation_space.shape[:2]
#         self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

#     def permute_orientation(self, observation):
#         # permute [H, W, C] array to [C, H, W] tensor
#         observation = np.transpose(observation, (2, 0, 1))
#         observation = torch.tensor(observation.copy(), dtype=torch.float)
#         return observation

#     def observation(self, observation):
#         observation = self.permute_orientation(observation)
#         transform = T.Grayscale()
#         observation = transform(observation)
#         return observation


# class ResizeObservation(gym.ObservationWrapper):
#     def __init__(self, env, shape):
#         super().__init__(env)
#         if isinstance(shape, int):
#             self.shape = (shape, shape)
#         else:
#             self.shape = tuple(shape)

#         obs_shape = self.shape + self.observation_space.shape[2:]
#         self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

#     def observation(self, observation):
#         transforms = T.Compose(
#             [T.Resize(self.shape), T.Normalize(0, 255)]
#         )
#         observation = transforms(observation).squeeze(0)
#         return observation