In [29]:
import gymnasium as gym
import numpy as np
import random
from collections import defaultdict

# Definicje

In [30]:
env = gym.make("Blackjack-v1", natural = True, sab=False)

#Hiperparametry 
alpha = 1            # Zaczynamy od wyższej wartości, żeby szybko łapał podstawy
alpha_min = 0.001    # Nie schodzimy poniżej tego poziomu
alpha_decay = 0.99995 # Współczynnik wygaszania
gamma = 1.0       # discount factor 
epsilon = 0.1     # eksploracja
num_episodes = 200_000

In [31]:
Q = defaultdict(lambda: np.zeros(env.action_space.n))  # Q[state][action]

# Pomocnicze funkcje

In [32]:
def basic_strategy(state):
    player_sum, dealer_card, usable_ace = state
    if player_sum >= 17:
        return 0  # stick
    else:
        return 1  # hit

def choose_action(state):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(Q[state])
    
def q_policy(state):
    return np.argmax(Q[state])


In [33]:
def evaluate_policy(policy_fn, n_games=100_000):
    wins = 0
    losses = 0
    draws = 0

    for _ in range(n_games):
        state, _ = env.reset()
        done = False
        while not done:
            action = policy_fn(state)
            state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
        if reward > 0:
            wins += 1
        elif reward < 0:
            losses += 1
        else:
            draws += 1
    return wins, losses, draws

# Trenowanie agenta

In [34]:
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        action = choose_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)

        done = terminated or truncated

        Q[state][action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])

        state = next_state
    alpha = max(alpha_min, alpha * alpha_decay)

# Porównanie z klasyczną

In [35]:
wins_q, losses_q, draws_q = evaluate_policy(q_policy)
wins_bs, losses_bs, draws_bs = evaluate_policy(basic_strategy)

print("Q-learning: Wins:", wins_q, "Losses:", losses_q, "Draws:", draws_q)
print("Basic Strategy: Wins:", wins_bs, "Losses:", losses_bs, "Draws:", draws_bs)

Q-learning: Wins: 38161 Losses: 53337 Draws: 8502
Basic Strategy: Wins: 41220 Losses: 48362 Draws: 10418
