In [1]:
import gymnasium as gym
import numpy as np
import random
from collections import defaultdict

# Definicje

In [2]:
# Blackjack environment. Per the Gymnasium docs, `natural=True` enables the extra
# payout for a natural blackjack and `sab=False` disables the strict Sutton & Barto
# rule set (verify against the installed gymnasium version).
env = gym.make("Blackjack-v1", natural=True, sab=False)

# Reproducibility: every source of randomness used by this notebook is seeded once
# here so that Restart Kernel -> Run All yields the same numbers each time.
SEED = 42
random.seed(SEED)            # drives the epsilon test in choose_action
env.action_space.seed(SEED)  # drives env.action_space.sample() during exploration
env.reset(seed=SEED)         # seeds the environment's own RNG; later reset() calls keep using it

# Hyperparameters
num_episodes = 200_000
gamma = 1.0  # discount factor
epsilon = 0.1  # exploration rate for the epsilon-greedy behaviour policy

In [3]:
def _zero_action_values():
    """Return a fresh zero vector with one entry per action of the global `env`."""
    return np.zeros(env.action_space.n)


# Q holds the action-value estimates and returns_count the number of updates applied
# to each entry; both are keyed by state and lazily filled with zeros on first access.
Q = defaultdict(_zero_action_values)
returns_count = defaultdict(_zero_action_values)

# Pomocnicze funkcje

In [4]:
def basic_strategy(state):
    """Fixed threshold baseline: stick on 17 or more, otherwise hit.

    Args:
        state: a 3-tuple whose first element is the player's current sum; the
            other two elements (dealer card, usable-ace flag) are ignored.

    Returns:
        0 (stick) when the player's sum is at least 17, else 1 (hit).
    """
    hand_total, _dealer_card, _usable_ace = state
    return 0 if hand_total >= 17 else 1

def choose_action(state):
    """Epsilon-greedy behaviour policy over the global Q table.

    With probability `epsilon` a random action is sampled from
    `env.action_space`; otherwise the action with the largest current
    estimate in `Q[state]` is returned (np.argmax picks the lowest index
    on ties).
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q[state])
    

def mc_policy(state):
    """Greedy policy: return the index of the highest-valued action in Q[state]."""
    action_values = Q[state]
    return np.argmax(action_values)


In [5]:
def evaluate_policy(policy_fn, n_games=100_000):
    """Play `n_games` episodes on the global `env` and tally the outcomes.

    Args:
        policy_fn: callable mapping an observation to an action.
        n_games: number of episodes to play.

    Returns:
        A tuple (wins, losses, draws), classified by the sign of the reward
        returned by the final step of each episode.
    """
    # Slot 0 counts wins, slot 1 losses, slot 2 draws.
    outcome_counts = [0, 0, 0]

    for _ in range(n_games):
        observation, _ = env.reset()
        while True:
            chosen = policy_fn(observation)
            observation, reward, terminated, truncated, info = env.step(chosen)
            if terminated or truncated:
                break

        # Only the last reward of the episode decides how the game is counted.
        if reward > 0:
            slot = 0
        elif reward < 0:
            slot = 1
        else:
            slot = 2
        outcome_counts[slot] += 1

    return tuple(outcome_counts)

# Trenowanie agenta

In [6]:
# On-policy first-visit Monte Carlo control with an epsilon-greedy behaviour policy.
for episode in range(num_episodes):
    # --- 1. Generate one episode, recording (state, action, reward) per step ---
    episode_memory = []
    state, _ = env.reset()
    done = False

    while not done:
        action = choose_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        episode_memory.append((state, action, reward))
        state = next_state

    # --- 2. Locate the FIRST time step at which each (state, action) pair occurs ---
    # Filtering with a "seen" set while walking the episode backwards would select
    # the last occurrence instead, which is not first-visit Monte Carlo.
    first_visit_step = {}
    for step, (state, action, _) in enumerate(episode_memory):
        first_visit_step.setdefault((state, action), step)

    # --- 3. Walk backwards accumulating the discounted return G ---
    G = 0
    for step in range(len(episode_memory) - 1, -1, -1):
        state, action, reward = episode_memory[step]
        G = reward + gamma * G

        # Update only at the first visit of the pair within this episode.
        if first_visit_step[(state, action)] == step:
            returns_count[state][action] += 1
            # Incremental mean: Q <- Q + (G - Q) / N
            Q[state][action] += (G - Q[state][action]) / returns_count[state][action]
            
    
    

# Porównanie ze strategią klasyczną

In [7]:
# Evaluate the learned greedy policy first, then the fixed threshold baseline.
wins_q, losses_q, draws_q = evaluate_policy(mc_policy)
wins_bs, losses_bs, draws_bs = evaluate_policy(basic_strategy)

# One report line per policy, in the same order as the evaluations above.
report_rows = (
    ("Monte carlo: Wins:", wins_q, losses_q, draws_q),
    ("Basic Strategy: Wins:", wins_bs, losses_bs, draws_bs),
)
for header, n_wins, n_losses, n_draws in report_rows:
    print(header, n_wins, "Losses:", n_losses, "Draws:", n_draws)

Monte carlo: Wins: 42617 Losses: 48319 Draws: 9064
Basic Strategy: Wins: 40548 Losses: 48957 Draws: 10495
