In [22]:
import gymnasium as gym
import numpy as np
import random
from collections import defaultdict

# Definicje

In [23]:
# Reproducibility: a single seed drives every random source this notebook uses,
# so a fresh "Restart & Run All" produces the same training run and evaluation.
SEED = 42
random.seed(SEED)     # coin flips for the epsilon-greedy choice in choose_action
np.random.seed(SEED)  # NumPy's global RNG; not used directly here, seeded defensively

env = gym.make("Blackjack-v1", natural = True, sab=False)
env.action_space.seed(SEED)  # exploratory actions drawn via env.action_space.sample()
env.reset(seed=SEED)         # seeds the environment's own RNG; later reset() calls keep using it

# Hyperparameters
# Learning rate. Starts well below 1 so a single hand cannot overwrite
# everything learned so far; decayed in place by the training loop.
alpha = 0.5
# Floor for the learning rate: a small amount of learning is kept for the whole run.
alpha_min = 0.01
# Per-episode multiplicative decay, slow enough to last through the 200k episodes.
alpha_decay = 0.99999

# Discount factor applied to the bootstrapped value of the next state.
gamma = 0.95

# Exploration rate. Starts at 1.0: at the beginning the agent plays fully at random.
epsilon = 1.0
# Floor for the exploration rate.
epsilon_min = 0.05
# Per-episode multiplicative decay so the agent gradually starts using what it
# has learned; with these values the floor is reached after roughly 150k episodes.
epsilon_decay = 0.99998

# Number of training episodes (hands).
num_episodes = 200_000

In [24]:
def _zero_action_values():
    """Return a fresh all-zero value vector with one entry per action of `env`."""
    return np.zeros(env.action_space.n)


# Tabular action-value function, indexed as Q[state][action].
# Note: looking up a state that has not been seen yet inserts a zero row for it.
Q = defaultdict(_zero_action_values)

# Pomocnicze funkcje

In [25]:
def basic_strategy(state):
    """Fixed threshold baseline: stick on 17 or more, otherwise hit.

    Args:
        state: 3-tuple (player_sum, dealer_card, usable_ace). Only
            player_sum influences the decision.

    Returns:
        0 to stick, 1 to hit.
    """
    player_sum, _dealer_card, _usable_ace = state
    return 0 if player_sum >= 17 else 1

def choose_action(state):
    """Pick an action epsilon-greedily for the given state.

    With probability `epsilon` (module-level, decayed by the training loop) a
    random action is sampled from the environment's action space; otherwise
    the action with the highest current Q-value is returned.
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()
    # np.argmax returns the first index on ties
    return np.argmax(Q[state])
    
def q_policy(state):
    """Greedy policy: return the action with the highest Q-value for `state`.

    Reads the module-level table `Q`; ties resolve to the lowest action index.
    """
    action_values = Q[state]
    return np.argmax(action_values)


In [26]:
def evaluate_policy(policy_fn, n_games=100_000):
    """Play `n_games` episodes on the module-level `env` and count outcomes.

    Args:
        policy_fn: callable mapping a state to an action.
        n_games: number of episodes to play.

    Returns:
        Tuple (wins, losses, draws), classified by the sign of the reward
        returned by the last step of each episode: positive is a win,
        negative a loss, zero a draw.
    """
    tally = {"win": 0, "loss": 0, "draw": 0}

    for _game in range(n_games):
        obs, _info = env.reset()
        # Step until the environment reports the episode is over.
        while True:
            obs, reward, terminated, truncated, _info = env.step(policy_fn(obs))
            if terminated or truncated:
                break

        if reward > 0:
            outcome = "win"
        elif reward < 0:
            outcome = "loss"
        else:
            outcome = "draw"
        tally[outcome] += 1

    return tally["win"], tally["loss"], tally["draw"]

# Trenowanie agenta

In [27]:
# Tabular Q-learning: play num_episodes hands, updating Q after every step.
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        action = choose_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # When the hand has terminated the target is just the reward; otherwise
        # it is the reward plus the discounted best estimate for the next state.
        target = reward if terminated else reward + gamma * np.max(Q[next_state])

        # Move the current estimate a fraction alpha toward the target.
        q_row = Q[state]
        q_row[action] = q_row[action] + alpha * (target - q_row[action])

        state = next_state

    # Decay learning rate and exploration rate after every episode,
    # never dropping below their configured floors.
    alpha = max(alpha_min, alpha * alpha_decay)
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Porównanie z klasyczną strategią (dobieranie kart poniżej 17)

In [28]:
# Evaluate the learned greedy policy and the fixed threshold baseline
# with the same evaluation routine.
wins_q, losses_q, draws_q = evaluate_policy(q_policy)
wins_bs, losses_bs, draws_bs = evaluate_policy(basic_strategy)

# One summary line per policy: wins, losses and draws as raw game counts.
for label, (wins, losses, draws) in (
    ("Q-learning: Wins:", (wins_q, losses_q, draws_q)),
    ("Basic Strategy: Wins:", (wins_bs, losses_bs, draws_bs)),
):
    print(label, wins, "Losses:", losses, "Draws:", draws)

Q-learning: Wins: 42314 Losses: 48394 Draws: 9292
Basic Strategy: Wins: 40818 Losses: 48720 Draws: 10462
