In [12]:
# Alle notwendigen Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from collections import deque
import numpy as np
import random
import os

import gymnasium as gym

# Reproducibility: one shared seed for every random number generator in use
seed = 1

# Python-level generators
random.seed(seed)
np.random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# PyTorch generators (CPU, current CUDA device, all CUDA devices)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

print("✅ Imports und Konfiguration geladen.")

✅ Imports und Konfiguration geladen.


In [13]:
%load_ext tensorboard
%xmode Verbose

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
Exception reporting mode: Verbose


In [14]:
# Select the compute device: use the GPU when CUDA is available, else the CPU
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f'Using device: {device}')

Using device: cuda


In [15]:
# Hyperparameters (every value is defined exactly once)
env_name = "CartPole-v1"
gamma = 0.99             # discount factor
learning_rate = 0.00025  # optimizer step size

# Epsilon-greedy exploration
eps_start = 0.5   # initial probability of taking a random action
eps = eps_start   # current exploration rate; updated in place by getEpsilonLin()
eps_end = 0.05    # lower bound of the exploration rate
eps_decay = 1000  # not referenced by the visible cells; kept for compatibility

# Replay buffer and update schedule
buffer_size = 100_000       # maximum number of stored transitions
min_replay_size = 1_000     # transitions required before training starts
batch_size = 64             # transitions per gradient step
train_freq = 1              # gradient step every `train_freq` environment steps
target_update_freq = 1000   # environment steps between target-network syncs

# Training length
max_episodes = 500  # default
max_steps = 500     # default
train_episodes = max_episodes

# Amount subtracted from eps on every getEpsilonLin() call.
# NOTE(review): the training loop calls getEpsilonLin() once per environment
# step, so eps reaches eps_end after `train_episodes` steps, not episodes —
# confirm this is the intended schedule.
decay_factor = (eps_start - eps_end) / train_episodes

In [16]:
# Create the environment
env = gym.make(env_name)

# Seed the environment's own RNG and the action-space sampler. They are
# separate from the torch / numpy / random generators seeded earlier, so
# without this the initial states and random actions differ between runs.
# Later env.reset() calls without a seed continue from this seeded RNG.
env.reset(seed=seed)
env.action_space.seed(seed)

# Network input/output sizes derived from the environment's spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

In [17]:
class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per action.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of the last dimension of the input (state features).
        output_dim: Number of outputs, i.e. the number of possible actions.
        hidden_dim: Width of both hidden layers. Defaults to 128, the value
            that was previously hard-coded.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        # Attribute name and layer order are kept so state_dict keys
        # ("model.0.weight", ...) and seeded initialisation stay unchanged.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),  # one output per possible action
        )

    def forward(self, x):
        """Return the Q-values for `x` (last dimension must be `input_dim`)."""
        return self.model(x)

In [18]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Args:
        capacity: Maximum number of transitions kept; once full, the oldest
            transition is dropped for each new one (deque with `maxlen`).
        device: Optional torch device for sampled batches. When omitted, the
            notebook-level `device` variable is used at sampling time.
    """

    def __init__(self, capacity, device=None):
        self.buffer = deque(maxlen=capacity)  # double-ended queue
        self.device = device

    def push(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            Tuple `(states, actions, rewards, next_states, dones)` of tensors.
            `states` / `next_states` are float32; `actions` is int64 and
            `rewards` / `dones` are float32, each with a trailing dimension
            of size 1 added via `unsqueeze(1)`.

        Raises:
            ValueError: From `random.sample` if `batch_size` exceeds the
                number of stored transitions.
        """
        target_device = self.device if self.device is not None else device
        samples = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*samples)
        # Stack into one numpy array per field first: building a tensor
        # directly from a sequence of numpy arrays is very slow and makes
        # PyTorch emit a UserWarning.
        return (
            torch.as_tensor(np.asarray(states, dtype=np.float32), device=target_device),
            torch.as_tensor(np.asarray(actions, dtype=np.int64), device=target_device).unsqueeze(1),
            torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=target_device).unsqueeze(1),
            torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=target_device),
            torch.as_tensor(np.asarray(dones, dtype=np.float32), device=target_device).unsqueeze(1),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [19]:
def getEpsilonLin():
    """Lower the global exploration rate by one linear step and return it.

    Each call subtracts `decay_factor` from the global `eps`, never letting
    it drop below `eps_end`, stores the result back in `eps` and returns it.
    """
    global eps
    lowered = eps - decay_factor
    eps = lowered if lowered > eps_end else eps_end
    return eps

In [20]:
# Two networks with the same architecture: policy_net is the one handed to the
# optimizer; target_net is only ever refreshed by copying policy_net's weights.
policy_net = QNetwork(state_dim, action_dim).to(device)
target_net = QNetwork(state_dim, action_dim).to(device)
# Start the target network as an exact copy of the policy network
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()  # put the target network in evaluation mode
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
replay_buffer = ReplayBuffer(buffer_size)
writer = SummaryWriter()  # TensorBoard logger, created with default arguments

global_step = 0  # environment steps taken so far, counted across all episodes
episode_rewards = []  # total reward of each finished episode

In [21]:
# Double-DQN training loop.
# Per environment step: choose an epsilon-greedy action, store the transition,
# optionally run one gradient step on a sampled batch, and periodically copy
# the policy weights into the target network.

# Huber loss module, created once instead of on every optimisation step
huber_loss = nn.SmoothL1Loss()

for episode in range(max_episodes):
    state, _ = env.reset()
    state = np.array(state, dtype=np.float32)
    total_reward = 0

    for step in range(max_steps):
        epsilon = getEpsilonLin()  # linear decay, applied once per step
        global_step += 1

        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                state_tensor = torch.tensor(state, device=device).unsqueeze(0)
                q_values = policy_net(state_tensor)
                action = q_values.argmax(1).item()

        # Step the environment
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated  # either one ends the episode loop
        next_state = np.array(next_state, dtype=np.float32)

        # Reward clipping is currently disabled; not needed for CartPole
        #clipped_reward = max(min(reward, 1.0), -1.0)
        clipped_reward = reward

        # Store the transition. Only `terminated` is stored as the terminal
        # flag: a truncated episode (time limit) did not reach a terminal
        # state, so its target must still bootstrap from next_state. Storing
        # `done` would zero the bootstrap term whenever the step cap is hit.
        replay_buffer.push(state, action, clipped_reward, next_state, terminated)

        state = next_state
        total_reward += reward

        # Training step
        if len(replay_buffer) >= min_replay_size and global_step % train_freq == 0:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

            with torch.no_grad():
                # Double DQN: pick the action with the policy network,
                # evaluate it with the target network
                next_actions = policy_net(next_states).argmax(1, keepdim=True)
                next_q_values = target_net(next_states).gather(1, next_actions)
                target_q = rewards + (1 - dones) * gamma * next_q_values

            current_q = policy_net(states).gather(1, actions)
            loss = huber_loss(current_q, target_q)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Target network update
        if global_step % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if done:
            break

    # Logging
    writer.add_scalar("Reward", total_reward, episode)
    writer.add_scalar("Epsilon", epsilon, episode)
    episode_rewards.append(total_reward)

    print(f"Episode {episode} - Total Reward: {total_reward:.2f} - Epsilon: {epsilon:.3f}")

env.close()
writer.close()
#successful_episodes = sum(1 for r in episode_rewards if r > 450)
#perfect_episodes = sum(1 for r in episode_rewards if r == 500)
#print(f"Episoden mit Reward >450: {successful_episodes}")
#print(f"Perfekte Episoden: {perfect_episodes}")

Episode 0 - Total Reward: 13.00 - Epsilon: 0.488
Episode 1 - Total Reward: 15.00 - Epsilon: 0.475
Episode 2 - Total Reward: 14.00 - Epsilon: 0.462
Episode 3 - Total Reward: 14.00 - Epsilon: 0.450
Episode 4 - Total Reward: 13.00 - Epsilon: 0.438
Episode 5 - Total Reward: 11.00 - Epsilon: 0.428
Episode 6 - Total Reward: 11.00 - Epsilon: 0.418
Episode 7 - Total Reward: 9.00 - Epsilon: 0.410
Episode 8 - Total Reward: 9.00 - Epsilon: 0.402
Episode 9 - Total Reward: 11.00 - Epsilon: 0.392
Episode 10 - Total Reward: 19.00 - Epsilon: 0.375
Episode 11 - Total Reward: 12.00 - Epsilon: 0.364
Episode 12 - Total Reward: 14.00 - Epsilon: 0.351
Episode 13 - Total Reward: 14.00 - Epsilon: 0.339
Episode 14 - Total Reward: 10.00 - Epsilon: 0.330
Episode 15 - Total Reward: 12.00 - Epsilon: 0.319
Episode 16 - Total Reward: 10.00 - Epsilon: 0.310
Episode 17 - Total Reward: 10.00 - Epsilon: 0.301
Episode 18 - Total Reward: 10.00 - Epsilon: 0.292
Episode 19 - Total Reward: 9.00 - Epsilon: 0.284
Episode 20 - 