In [20]:
import gym
import cv2

#This environment wrapper is used to stop a run if mario is stuck on a pipe
class DeadlockEnv(gym.Wrapper):
    def __init__(self, env, threshold=20):
        super().__init__(env)
        self.last_x_pos = 0
        self.count = 0
        self.threshold = threshold
        self.lifes = 3
        self.stage = 1
        self.world = 1

    def reset(self, **kwargs):
        self.last_x_pos = 0
        self.count = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        x_pos = info['x_pos']

        if x_pos <= self.last_x_pos:
            self.count += 1
        else:
            self.count = 0
            self.last_x_pos = x_pos

        if info['life'] != self.lifes or info["stage"] != self.stage or info["world"] != self.world:
            self.last_x_pos = x_pos
            self.count = 0
            self.lifes = info['life']
            self.stage = info["stage"]
            self.world = info["world"]

        if self.count >= self.threshold:
            reward = -15
            done = True

        return state, reward, done, info

#skipframe wrapper
class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        reward_out = 0
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            reward_out += reward
            if done:
                break
        reward_out /= max(1,i+1)

        return obs, reward_out, done, info

#downsample wrapper to reduce dimensionality
def Downsample(ratio,state):
  (oldh, oldw, oldc) = state.shape
  newshape = (oldh//ratio, oldw//ratio, oldc)
  frame = cv2.resize(state, (newshape[0], newshape[1]), interpolation=cv2.INTER_AREA)
  return frame

#small function to change rgb images to grayscale
def GrayScale(state):
  return cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)

In [21]:
import numpy as np
import torch.nn as nn
from torch.distributions import Categorical

class Actor_Critic:
    def __init__(self, env):
        self.env = env

        self.actor = nn.Sequential(
                            nn.Linear(3840, 64),
                            nn.Tanh(),
                            nn.Linear(64, 64),
                            nn.Tanh(),
                            nn.Linear(64, 7),
                            nn.Softmax(dim=-1)
                        )
        self.critic = nn.Sequential(
                        nn.Linear(3840, 64),
                        nn.Tanh(),
                        nn.Linear(64, 64),
                        nn.Tanh(),
                        nn.Linear(64, 1)
                    )

    def act(self, state):
        # if we have a continuous action space we sample from a multivariate normal distribution
        # otherwise we calculate a categorical action space
        action_probs = self.actor(state)
        dist = Categorical(action_probs)

        action = dist.sample()
        action_logprob = dist.log_prob(action)

        return action.detach(), action_logprob.detach()

    def evaluate(self, state, action):
        action_probs = self.actor(state)
        dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy

In [22]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
import numpy as np
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import torch.optim as optim
import collections
import keyboard
import torch

frameskip = 4

env = gym_super_mario_bros.make('SuperMarioBros-v1')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = SkipFrame(env, skip=frameskip)
env = DeadlockEnv(env,threshold=(60*2)//frameskip)

done = True

model = Actor_Critic(env)
actor_optimizer = optim.Adam(model.actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(model.critic.parameters(), lr=1e-3)
gamma = 0.99
n_steps = 5
state = env.reset()
step = 0

states, actions, rewards, logprobs = [], [], [], []

for step in range(5000):
    if done:
        state = env.reset()

    # Zustand korrekt vorverarbeiten
    processed_state = GrayScale(Downsample(4, state)).flatten()
    processed_tensor = torch.from_numpy(processed_state).float()

    action, logprob = model.act(processed_tensor)
    next_state, reward, done, _ = env.step(action.item())

    # Vorverarbeiteten nächsten Zustand vorbereiten
    next_helper = next_state.copy()

    # 3. Speichern — hier ist die Änderung: verarbeiteten Zustand speichern
    states.append(processed_tensor.unsqueeze(0))  # richtiges Format
    actions.append(action)
    logprobs.append(logprob)
    rewards.append(torch.tensor([reward], dtype=torch.float32))

    state = next_helper
    step += 1

    if step % n_steps == 0 or done:
        with torch.no_grad():
            next_processed = GrayScale(Downsample(4, state)).flatten()
            next_tensor = torch.from_numpy(next_processed).float()
            next_value = model.critic(next_tensor) if not done else torch.tensor([[0.0]])

        returns = []
        R = next_value
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)

        states_tensor = torch.cat(states)
        actions_tensor = torch.stack(actions)
        logprobs_tensor = torch.stack(logprobs)
        returns_tensor = torch.cat(returns).detach()

        new_logprobs, values, entropy = model.evaluate(states_tensor, actions_tensor)
        advantage = returns_tensor - values.squeeze()

        policy_loss = -(new_logprobs * advantage.detach()).mean()
        value_loss = advantage.pow(2).mean()
        entropy_bonus = entropy.mean()
        total_loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus

        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        total_loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()

        states, actions, rewards, logprobs = [], [], [], []

    if step % 200 == 0:
        print(np.shape(GrayScale(Downsample(12,state))))
        print('_____________________________')
    env.render()

env.close()

(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________
(21, 20)
_____________________________


In [23]:
"""

import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

# Setup
env = ...  # deine Mario-Umgebung
model = Actor_Critic(env)
actor_optimizer = optim.Adam(model.actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(model.critic.parameters(), lr=1e-3)
gamma = 0.99
n_steps = 5

# Initialisierung
state = env.reset()
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # (1, 420)

# Puffer für Trajektorie
states, actions, rewards, logprobs = [], [], [], []

done = False
step = 0

while not done:
    # 1. Aktion wählen
    action, logprob = model.act(state)

    # 2. Umgebungsschritt
    next_state, reward, done, _ = env.step(action.item())
    next_state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)

    # 3. Speichern
    states.append(state)
    actions.append(action)
    logprobs.append(logprob)
    rewards.append(torch.tensor([reward], dtype=torch.float32))



    state = next_state
    step += 1

    # Wenn n Schritte oder done → Update
    if step % n_steps == 0 or done:
        # 4. Bootstrapped Return
        with torch.no_grad():
            next_value = model.critic(next_state) if not done else torch.tensor([[0.0]])

        returns = []
        R = next_value
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)

        # 5. Stack tensors
        states_tensor = torch.cat(states)
        actions_tensor = torch.stack(actions)
        logprobs_tensor = torch.stack(logprobs)
        returns_tensor = torch.cat(returns).detach()

        # 6. Evaluate aktuelle Policy
        new_logprobs, values, entropy = model.evaluate(states_tensor, actions_tensor)
        advantage = returns_tensor - values.squeeze()


########

        # 7. Verluste berechnen
        policy_loss = -(new_logprobs * advantage.detach()).mean()
        value_loss = advantage.pow(2).mean()
        entropy_bonus = entropy.mean()

        total_loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus

        # 8. Update
        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        total_loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()

        # 9. Reset Puffer
        states, actions, rewards, logprobs = [], [], [], []
"""

'\n\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nfrom collections import deque\n\n# Setup\nenv = ...  # deine Mario-Umgebung\nmodel = Actor_Critic(env)\nactor_optimizer = optim.Adam(model.actor.parameters(), lr=1e-4)\ncritic_optimizer = optim.Adam(model.critic.parameters(), lr=1e-3)\ngamma = 0.99\nn_steps = 5\n\n# Initialisierung\nstate = env.reset()\nstate = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # (1, 420)\n\n# Puffer für Trajektorie\nstates, actions, rewards, logprobs = [], [], [], []\n\ndone = False\nstep = 0\n\nwhile not done:\n    # 1. Aktion wählen\n    action, logprob = model.act(state)\n\n    # 2. Umgebungsschritt\n    next_state, reward, done, _ = env.step(action.item())\n    next_state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)\n\n    # 3. Speichern\n    states.append(state)\n    actions.append(action)\n    logprobs.append(logprob)\n    rewards.append(torch.tensor([reward], dtype=torch.float32))\n\n\n\n    state = next_