In [None]:
!pip install gymnasium

ERROR: Operation cancelled by user

In [None]:
import gymnasium as gym
import numpy as np
import random
from collections import deque, defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [None]:
# Build the Blackjack environment; both optional rule flags (natural, sab)
# are explicitly turned off.
env = gym.make(
    "Blackjack-v1",
    natural=False,
    sab=False,
)

# Size of the discrete action space; used as the Q-network's output width.
n_actions = env.action_space.n

***Model sieci DQN***

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: observation vector in, one Q-value per action out.

    The architecture is two hidden ReLU layers followed by a linear output layer.

    Args:
        n_observations: Number of input features. Defaults to 3, the length of
            the state tuples this notebook feeds to the network.
        n_outputs: Number of Q-values produced (one per action). When ``None``
            the module-level ``n_actions`` is used, which keeps the original
            zero-argument ``DQN()`` construction working unchanged.
        hidden_size: Width of both hidden layers. Defaults to 64.
    """

    def __init__(self, n_observations=3, n_outputs=None, hidden_size=64):
        super().__init__()
        if n_outputs is None:
            # Backward-compatible default: read the action count from the
            # notebook-level global defined next to the environment.
            n_outputs = n_actions
        self.net = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_outputs),
        )

    def forward(self, x):
        """Return Q-values for ``x``, whose last dimension must be ``n_observations``."""
        return self.net(x)

***Pamięć doświadczeń (replay buffer)***

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` transitions."""

    def __init__(self, size=50000):
        # A deque with maxlen discards the oldest transition once it is full.
        self.buffer = deque(maxlen=size)

    def push(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of tensors ``(states, actions, rewards, next_states,
            dones)``. ``actions`` is int64; all others are float32.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.tensor(states, dtype=torch.float32),
            # Explicit int64: the actions are used as a gather() index, which
            # requires that dtype. Relying on inference would yield e.g. a
            # bool tensor if the stored actions happened to be booleans.
            torch.tensor(actions, dtype=torch.int64),
            torch.tensor(rewards, dtype=torch.float32),
            torch.tensor(next_states, dtype=torch.float32),
            torch.tensor(dones, dtype=torch.float32),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


***Funkcja wyboru akcji***

In [None]:
def choose_action(state, epsilon):
    """Pick an action epsilon-greedily with respect to ``policy_net``.

    Args:
        state: Observation convertible to a float32 tensor.
        epsilon: Probability of taking a uniformly random action instead of
            the greedy one.

    Returns:
        The chosen action as a plain Python ``int``.
    """
    if random.random() < epsilon:
        # int(...) so both branches return the same type and the replay
        # buffer does not end up holding a mix of NumPy and Python integers.
        return int(env.action_space.sample())

    # Action selection is pure inference: skip building an autograd graph.
    with torch.no_grad():
        q_values = policy_net(torch.tensor(state, dtype=torch.float32))
    return torch.argmax(q_values).item()

***Trening agenta DQN***

In [None]:
# Seed every source of randomness before the networks are created so that a
# Restart & Run All repeats the same training run on the same software stack.
SEED = 0
random.seed(SEED)            # epsilon-greedy coin flips and replay sampling
np.random.seed(SEED)
torch.manual_seed(SEED)      # network weight initialisation
env.action_space.seed(SEED)  # random exploratory actions
env.reset(seed=SEED)         # seeds the environment's own RNG for later reset() calls

# Online network and its periodically synchronised copy used for the targets.
policy_net = DQN()
target_net = DQN()
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
memory = ReplayBuffer()

gamma = 1.0                # discount factor (1.0 = undiscounted return)
epsilon = 1.0              # initial exploration rate
epsilon_min = 0.05         # floor for the exploration rate
epsilon_decay = 0.9995     # multiplicative decay factor for epsilon
batch_size = 64            # transitions per optimisation step
target_update_freq = 2000  # target-network sync period
num_episodes = 100000      # number of training episodes


def train_step():
    """Run one gradient step of the policy network on a sampled minibatch.

    Uses the module-level ``memory``, ``policy_net``, ``target_net``,
    ``optimizer``, ``gamma`` and ``batch_size``.

    Returns:
        The loss of this step as a Python float, or ``None`` when the replay
        memory holds fewer than ``batch_size`` transitions and no update runs.
    """
    if len(memory) < batch_size:
        return None

    states, actions, rewards, next_states, dones = memory.sample(batch_size)

    # Q(s, a): pick the Q-value of the action that was actually taken.
    # squeeze(1) removes only the column added for gather(), so a batch of
    # size one stays 1-D and keeps the same shape as the targets.
    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The target is a constant with respect to the optimised parameters, so
    # it is computed without tracking gradients.
    with torch.no_grad():
        # max_a' Q_target(s', a')
        next_q_values = target_net(next_states).max(1)[0]
        # Bellman target; (1 - dones) removes the bootstrap term at episode end.
        targets = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.MSELoss()(q_values, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()


# Total reward of every finished training episode, in order.
rewards_history = []

# Global count of environment steps across all episodes; drives the
# target-network synchronisation below.
steps = 0

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        steps += 1

        # epsilon decay (applied once per environment step, not per episode)
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # action selection
        action = choose_action(state, epsilon)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        total_reward += reward

        # Store the transition, then advance to the next state.
        memory.push(state, action, reward, next_state, done)
        state = next_state

        # One optimisation step per environment step; the returned loss is
        # not used anywhere else in the visible cells.
        loss = train_step()

        # target network update
        if steps % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())

    rewards_history.append(total_reward)

    # Progress report: mean over the most recent (up to) 5000 episodes.
    # At episode 0 this is the reward of a single episode.
    if episode % 5000 == 0:
        print(f"Episode {episode}, mean reward: {np.mean(rewards_history[-5000:]):.3f}, epsilon={epsilon:.3f}")

Episode 0, mean reward: -1.000, epsilon=0.999
Episode 5000, mean reward: -0.168, epsilon=0.050
Episode 10000, mean reward: -0.068, epsilon=0.050
Episode 15000, mean reward: -0.066, epsilon=0.050
Episode 20000, mean reward: -0.061, epsilon=0.050
Episode 25000, mean reward: -0.088, epsilon=0.050
Episode 30000, mean reward: -0.081, epsilon=0.050
Episode 35000, mean reward: -0.074, epsilon=0.050
Episode 40000, mean reward: -0.065, epsilon=0.050
Episode 45000, mean reward: -0.110, epsilon=0.050
Episode 50000, mean reward: -0.089, epsilon=0.050
Episode 55000, mean reward: -0.060, epsilon=0.050
Episode 60000, mean reward: -0.067, epsilon=0.050
Episode 65000, mean reward: -0.080, epsilon=0.050
Episode 70000, mean reward: -0.060, epsilon=0.050
Episode 75000, mean reward: -0.054, epsilon=0.050
Episode 80000, mean reward: -0.081, epsilon=0.050
Episode 85000, mean reward: -0.072, epsilon=0.050
Episode 90000, mean reward: -0.075, epsilon=0.050
Episode 95000, mean reward: -0.051, epsilon=0.050


***Ewaluacja***

In [None]:
def evaluate(policy_fn, n_games=50000, eval_env=None):
    """Play ``n_games`` episodes with ``policy_fn`` and tally the outcomes.

    Args:
        policy_fn: Callable mapping an observation to an action.
        n_games: Number of episodes to play.
        eval_env: Environment to play in. It must provide ``reset()`` returning
            ``(observation, info)`` and ``step(action)`` returning
            ``(observation, reward, terminated, truncated, info)``. When
            ``None`` the module-level ``env`` is used, as before.

    Returns:
        A tuple ``(wins, losses, draws)``. An episode is classified by the sign
        of the reward of its final step: positive is a win, negative a loss,
        zero a draw.
    """
    if eval_env is None:
        eval_env = env

    wins = losses = draws = 0

    for _ in range(n_games):
        state, _info = eval_env.reset()
        done = False

        while not done:
            action = policy_fn(state)

            state, reward, terminated, truncated, _info = eval_env.step(action)
            done = terminated or truncated

        # Only the last step's reward decides the outcome of the episode.
        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1

    return wins, losses, draws


In [None]:
def dqn_policy(state):
    """Return the greedy action of ``policy_net`` for ``state`` as a Python int."""
    # Evaluation never backpropagates, so skip building the autograd graph.
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float32)
        return torch.argmax(policy_net(state_tensor)).item()

def basic_strategy(state):
    """Fixed-threshold baseline policy that looks only at the player's sum.

    ``state`` must unpack into exactly three values; the second and third
    (dealer card, usable-ace flag) are ignored. Returns 0 when the player's
    sum is 17 or more and 1 otherwise (in Gymnasium's Blackjack-v1 encoding,
    0 = stick and 1 = hit).
    """
    player_total, _dealer_showing, _has_usable_ace = state
    if player_total >= 17:
        return 0
    return 1

# Evaluate the trained network and the threshold baseline with the default
# number of games each, then report the raw counts and the win rates.
wins_dqn, losses_dqn, draws_dqn = evaluate(dqn_policy)
wins_bs, losses_bs, draws_bs = evaluate(basic_strategy)

print("DQN:            Wins:", wins_dqn, "Losses:", losses_dqn, "Draws:", draws_dqn)
print("Basic Strategy: Wins:", wins_bs, "Losses:", losses_bs, "Draws:", draws_bs)

# Win rate = wins divided by all games played, draws included.
games_dqn = wins_dqn + losses_dqn + draws_dqn
games_bs = wins_bs + losses_bs + draws_bs
print("Winrates:", wins_dqn / games_dqn, wins_bs / games_bs)

DQN:            Wins: 21352 Losses: 23951 Draws: 4697
Basic Strategy: Wins: 20307 Losses: 24417 Draws: 5276
Winrates: 0.42704 0.40614
