In [1]:
import gymnasium as gym
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim

# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cpu


In [2]:
# --- Reproducibility -------------------------------------------------------
# Every source of randomness used by this notebook is seeded here so that
# "Restart Kernel -> Run All" yields the same training run each time.
seed = 42
random.seed(seed)        # replay-buffer sampling and the epsilon coin flip
np.random.seed(seed)     # NumPy global RNG
torch.manual_seed(seed)  # network weight initialisation

# --- Environment -----------------------------------------------------------
# See the Gymnasium Blackjack-v1 documentation for the exact meaning of the
# `natural` and `sab` rule switches.
env = gym.make("Blackjack-v1", natural=True, sab=False)
# Seeding once on the first reset is enough: later `env.reset()` calls keep
# drawing from the same seeded generator.
env.reset(seed=seed)
# The action space owns a separate RNG used by `env.action_space.sample()`.
env.action_space.seed(seed)

# --- Training length -------------------------------------------------------
num_episodes = 200_000

# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99

# --- Experience replay -----------------------------------------------------
batch_size = 64            # transitions drawn per optimisation step
buffer_capacity = 100_000  # maximum number of stored transitions

# Learning rate for the Adam optimiser.
lr = 1e-3

# --- Exploration schedule --------------------------------------------------
# epsilon decays exponentially from `epsilon_start` towards `epsilon_end`;
# `epsilon_decay` is the time constant measured in environment steps.
epsilon_start = 1.0
epsilon_end = 0.05
epsilon_decay = 100_000

# Number of environment steps between target-network synchronisations.
target_update_freq = 1_000


In [3]:
def state_to_tensor(state):
    """Encode one Blackjack observation as a float32 feature tensor.

    Args:
        state: A 3-item observation ``(player_sum, dealer_card, usable_ace)``.

    Returns:
        A 1-D ``torch.float32`` tensor with three entries placed on the
        notebook-level ``device``: the player sum divided by 32, the dealer
        card divided by 10, and the usable-ace flag converted to a float.
    """
    player_sum, dealer_card, usable_ace = state

    # Scale the two integer features down so all inputs have similar size.
    # NOTE(review): 32 and 10 presumably mirror the observation-space bounds
    # of Blackjack-v1 -- confirm against the environment documentation.
    features = np.array(
        [player_sum / 32.0, dealer_card / 10.0, float(usable_ace)],
        dtype=np.float32,
    )
    return torch.tensor(features, dtype=torch.float32, device=device)

In [4]:
class DQN(nn.Module):
    """Fully connected Q-network with two hidden ReLU layers.

    Args:
        input_dim: Number of input features per state.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of Q-values produced (one per action).
    """

    def __init__(self, input_dim=3, hidden_dim=128, output_dim=2):
        super().__init__()
        # Layers are created in the same order as they are applied, so the
        # parameter names stay ``net.0``, ``net.2`` and ``net.4``.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return Q-values of shape ``(batch, output_dim)``.

        A 1-D input is treated as a single state and gets a leading batch
        dimension of size one before being passed through the network.
        """
        batched = x.unsqueeze(0) if x.dim() == 1 else x
        return self.net(batched)

In [5]:
from collections import deque

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Once ``capacity`` transitions are held, each new one evicts the oldest.
    """

    def __init__(self, capacity=100_000):
        # A bounded deque drops the oldest entry automatically when full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five tuples ``(states, actions, rewards, next_states, dones)``,
            each of length ``batch_size`` and aligned by position.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        states, actions, rewards, next_states, dones = zip(*drawn)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


In [6]:
# Online network: the one trained by the optimiser and used to pick actions.
policy_net = DQN()
policy_net.to(device)

# Target network: starts as an exact copy of the online network and is only
# used for inference, so it is switched to eval mode.
target_net = DQN()
target_net.to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Experience-replay storage and the optimiser for the online network only.
replay_buffer = ReplayBuffer(capacity=buffer_capacity)
optimizer = optim.Adam(policy_net.parameters(), lr=lr)

In [7]:
def select_action_dqn(state, step):
    """Pick an action with an epsilon-greedy rule on ``policy_net``.

    Epsilon decays exponentially with ``step`` from ``epsilon_start`` towards
    ``epsilon_end``, with ``epsilon_decay`` as the time constant.

    Args:
        state: Raw environment observation accepted by ``state_to_tensor``.
        step: Step counter that drives the epsilon schedule.

    Returns:
        A random action sampled from ``env.action_space`` with probability
        epsilon, otherwise the index of the largest Q-value for ``state``.
    """
    decay_factor = np.exp(-1.0 * step / epsilon_decay)
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * decay_factor

    # Explore: uniform random action.
    if random.random() < epsilon:
        return env.action_space.sample()

    # Exploit: greedy action; no gradients are needed for action selection.
    with torch.no_grad():
        q_values = policy_net(state_to_tensor(state))
    return torch.argmax(q_values, dim=1).item()


In [8]:
def basic_strategy(state):
    """Fixed-threshold baseline policy driven only by the player's sum.

    Despite the name, this ignores the dealer card and the usable-ace flag.

    Args:
        state: A 3-item observation ``(player_sum, dealer_card, usable_ace)``.

    Returns:
        ``0`` when the player sum is 17 or more, otherwise ``1``.
        NOTE(review): presumably 0 = stick and 1 = hit in Blackjack-v1 --
        confirm against the environment documentation.
    """
    # Unpack all three fields so a malformed state still fails loudly.
    player_sum, dealer_card, usable_ace = state
    return 0 if player_sum >= 17 else 1

In [9]:
def evaluate_policy(policy_fn, n_games=100_000, eval_env=None):
    """Play ``n_games`` full episodes with ``policy_fn`` and tally outcomes.

    Args:
        policy_fn: Callable mapping an observation to an action.
        n_games: Number of episodes to play.
        eval_env: Environment to evaluate on. It must provide ``reset()``
            returning ``(observation, info)`` and ``step(action)`` returning
            ``(observation, reward, terminated, truncated, info)``. When
            omitted, the notebook-level ``env`` is used, which keeps existing
            calls working unchanged.

    Returns:
        A tuple ``(wins, losses, draws)``. An episode is a win when the reward
        of its final step is positive, a loss when it is negative, and a draw
        when it is exactly zero.
    """
    # Resolve the global only when no environment was supplied, so callers
    # can evaluate on a separate (or stub) environment.
    active_env = env if eval_env is None else eval_env

    wins = 0
    losses = 0
    draws = 0

    for _ in range(n_games):
        state, _ = active_env.reset()
        done = False

        while not done:
            action = policy_fn(state)
            state, reward, terminated, truncated, _ = active_env.step(action)
            done = terminated or truncated

        # Only the reward of the last step decides the outcome of the game.
        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1

    return wins, losses, draws

In [10]:
def dqn_greedy_policy(state):
    """Return the action with the highest Q-value under ``policy_net``.

    Args:
        state: Raw environment observation accepted by ``state_to_tensor``.

    Returns:
        The greedy action index as a Python ``int`` (no exploration).
    """
    # Pure inference: gradient tracking is switched off.
    with torch.no_grad():
        q_values = policy_net(state_to_tensor(state))
    best_action = q_values.argmax(dim=1)
    return best_action.item()


In [11]:
# DQN training loop: act epsilon-greedily, store every transition in the
# replay buffer, and run one gradient step per environment step once the
# buffer holds at least one full batch.
episode_rewards_history = []  # undiscounted return of every finished episode
global_step = 0               # environment steps taken across all episodes

for episode in range(1, num_episodes + 1):
    state, _ = env.reset()
    done = False
    ep_reward = 0.0

    while not done:
        global_step += 1
        action = select_action_dqn(state, global_step)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        ep_reward += reward
        # Store `terminated`, not `done`, as the bootstrap mask: a truncated
        # episode did not reach a terminal state, so the value of `next_state`
        # must still be bootstrapped in the TD target. `done` only controls
        # when this rollout loop stops.
        replay_buffer.push(state, action, reward, next_state, terminated)

        state = next_state

        if len(replay_buffer) >= batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
            # (batch, 3) feature matrices for current and next states.
            state_batch = torch.stack([state_to_tensor(s) for s in states])
            next_state_batch = torch.stack([state_to_tensor(s) for s in next_states])

            action_batch = torch.tensor(actions, dtype=torch.long, device=device)
            reward_batch = torch.tensor(rewards, dtype=torch.float32, device=device)
            done_batch = torch.tensor(dones, dtype=torch.float32, device=device)

            # Q(s, a) of the actions that were actually taken.
            q_values = policy_net(state_batch)
            state_action_values = q_values.gather(1, action_batch.unsqueeze(1)).squeeze(1)

            # TD target r + gamma * max_a' Q_target(s', a'); the bootstrap
            # term is zeroed for terminal transitions. No gradients flow
            # through the target network.
            with torch.no_grad():
                next_q_values = target_net(next_state_batch)
                max_next_q_values, _ = torch.max(next_q_values, dim=1)
                target_values = reward_batch + gamma * max_next_q_values * (1.0 - done_batch)

            loss = nn.functional.mse_loss(state_action_values, target_values)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodically copy the online weights into the target network.
            if global_step % target_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

    episode_rewards_history.append(ep_reward)

    # Progress report: mean return over the most recent 10k episodes.
    if episode % 10_000 == 0:
        avg_reward = np.mean(episode_rewards_history[-10_000:])
        print(f"Episode {episode}, średnia nagroda z ostatnich 10k epizodów: {avg_reward:.3f}")


Episode 10000, średnia nagroda z ostatnich 10k epizodów: -0.379
Episode 20000, średnia nagroda z ostatnich 10k epizodów: -0.332
Episode 30000, średnia nagroda z ostatnich 10k epizodów: -0.291
Episode 40000, średnia nagroda z ostatnich 10k epizodów: -0.269
Episode 50000, średnia nagroda z ostatnich 10k epizodów: -0.258
Episode 60000, średnia nagroda z ostatnich 10k epizodów: -0.204
Episode 70000, średnia nagroda z ostatnich 10k epizodów: -0.175
Episode 80000, średnia nagroda z ostatnich 10k epizodów: -0.176
Episode 90000, średnia nagroda z ostatnich 10k epizodów: -0.144
Episode 100000, średnia nagroda z ostatnich 10k epizodów: -0.116
Episode 110000, średnia nagroda z ostatnich 10k epizodów: -0.135
Episode 120000, średnia nagroda z ostatnich 10k epizodów: -0.112
Episode 130000, średnia nagroda z ostatnich 10k epizodów: -0.110
Episode 140000, średnia nagroda z ostatnich 10k epizodów: -0.085
Episode 150000, średnia nagroda z ostatnich 10k epizodów: -0.070
Episode 160000, średnia nagroda z 

In [12]:
# Compare the trained greedy DQN policy with the fixed-threshold baseline,
# each over the default number of evaluation games.
dqn_outcome = evaluate_policy(dqn_greedy_policy)
baseline_outcome = evaluate_policy(basic_strategy)

wins_dqn, losses_dqn, draws_dqn = dqn_outcome
wins_bs, losses_bs, draws_bs = baseline_outcome

print("DQN:           Wins:", wins_dqn, "Losses:", losses_dqn, "Draws:", draws_dqn)
print("BasicStrategy: Wins:", wins_bs, "Losses:", losses_bs, "Draws:", draws_bs)


DQN:           Wins: 43002 Losses: 48301 Draws: 8697
BasicStrategy: Wins: 40752 Losses: 48719 Draws: 10529
