In [2]:
!pip install gym_super_mario_bros==7.3.0 nes_py

Collecting gym_super_mario_bros==7.3.0
  Downloading gym_super_mario_bros-7.3.0-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting nes_py
  Downloading nes_py-8.2.1.tar.gz (77 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.7/77.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyglet<=1.5.21,>=1.4.0 (from nes_py)
  Downloading pyglet-1.5.21-py3-none-any.whl.metadata (7.6 kB)
Downloading gym_super_mario_bros-7.3.0-py2.py3-none-any.whl (198 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyglet-1.5.21-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: nes_py


In [15]:
import gym
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from gym.wrappers import GrayScaleObservation,StepAPICompatibility
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from torch.distributions import Categorical

In [11]:
# Custom Frame Stack Wrapper
class CustomVecFrameStack:
    def __init__(self, env, n_stack):
        self.env = env
        self.n_stack = n_stack
        self.frames = deque(maxlen=n_stack)
        obs_shape = self.env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(obs_shape[0], obs_shape[1], n_stack),
            dtype=np.uint8,
        )
        self.action_space = env.action_space

    def reset(self):
        obs = self.env.reset()
        for _ in range(self.n_stack):
            self.frames.append(obs)
        return self._get_stacked_frames()

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.frames.append(obs)
        if done:
            obs = self.env.reset()
            for _ in range(self.n_stack):
                self.frames.append(obs)
        return self._get_stacked_frames(), reward, done, info

    def _get_stacked_frames(self):
        return np.stack(self.frames, axis=-1)

In [12]:
# Actor-Critic Network
class ActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ActorCritic, self).__init__()
        channels, height, width = input_dim
        self.shared = nn.Sequential(
            nn.Conv2d(channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU(),
        )
        self.actor = nn.Linear(512, output_dim)
        self.critic = nn.Linear(512, 1)

    def forward(self, x):
        shared_features = self.shared(x)
        return self.actor(shared_features), self.critic(shared_features)

    def get_action_and_value(self, x):
        logits, value = self.forward(x)
        action_probs = Categorical(logits=logits)
        action = action_probs.sample()
        return action, action_probs.log_prob(action), value

In [13]:
# PPO Algorithm
class PPO:
    def __init__(self, env, input_dim, output_dim, lr=3e-4, gamma=0.99, eps_clip=0.2, gae_lambda=0.95):
        self.env = env
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.gae_lambda = gae_lambda
        self.policy = ActorCritic(input_dim, output_dim).cuda()
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

    def compute_advantages(self, rewards, values, dones):
        advantages = []
        gae = 0
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + self.gamma * (1 - dones[step]) * values[step + 1] - values[step]
            gae = delta + self.gamma * self.gae_lambda * (1 - dones[step]) * gae
            advantages.insert(0, gae)
        return advantages

    def update(self, trajectories):
        obs, actions, log_probs, rewards, dones, values = trajectories
        advantages = self.compute_advantages(rewards, values, dones)
        advantages = torch.tensor(advantages).cuda()
        values = torch.tensor(values[:-1]).cuda()
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        for _ in range(10):  # PPO epochs
            _, new_log_probs, new_values = self.policy.get_action_and_value(obs)
            ratios = (new_log_probs - log_probs).exp()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = ((new_values - values) ** 2).mean()
            loss = actor_loss + 0.5 * critic_loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

In [14]:
# Training Loop
def train_ppo(env, ppo, num_steps=2048, n_stack=4):
    obs_stack = deque(maxlen=n_stack)
    obs = env.reset()
    for _ in range(n_stack):
        obs_stack.append(obs)

    trajectories = {"obs": [], "actions": [], "log_probs": [], "rewards": [], "dones": [], "values": []}
    for step in range(num_steps):
        stacked_obs = np.stack(list(obs_stack), axis=-1)  # Stack frames
        obs_tensor = torch.tensor(stacked_obs, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2).cuda()

        action, log_prob, value = ppo.policy.get_action_and_value(obs_tensor)
        new_obs, reward, done, _ = env.step(action.cpu().numpy()[0])

        trajectories["obs"].append(obs_tensor)
        trajectories["actions"].append(action)
        trajectories["log_probs"].append(log_prob)
        trajectories["rewards"].append(reward)
        trajectories["dones"].append(done)
        trajectories["values"].append(value)

        obs_stack.append(new_obs)
        if done:
            obs = env.reset()
            for _ in range(n_stack):
                obs_stack.append(obs)

    for key in trajectories:
        trajectories[key] = torch.cat(trajectories[key], dim=0).cuda()
    ppo.update(trajectories)


In [16]:
# Initialize the environment
env = gym_super_mario_bros.make("SuperMarioBros-v1")
env = StepAPICompatibility(env, new_step_api=True)
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = GrayScaleObservation(env, keep_dim=True)
env = CustomVecFrameStack(env, n_stack=4)

# Define PPO agent
ppo = PPO(env, input_dim=(4, 240, 256), output_dim=env.action_space.n)

# Train PPO
train_ppo(env, ppo, num_steps=10000)

  logger.warn(


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 6 is not equal to len(dims) = 4