In [182]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
from typing import Any

env = gym.make("Blackjack-v1")

In [183]:
# Definition of a random starting policy
example_policy = defaultdict(lambda: env.action_space.sample())

In [184]:
# Example execution loop
done = False
total_reward = 0

observation, _ = env.reset()

while not done:
    action = example_policy[observation]

    observation, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

    done = terminated or truncated

total_reward

1.0

In [185]:
# Some utils
def generate_episode(env: gym.Env, policy: dict[Any,Any]):
    done = False
    observation, _ = env.reset()
    tuples = []
    while not done:
        action = policy[observation]
        observation_, reward, terminated, truncated, info = env.step(action)

        tuples.append((observation, int(action), reward))

        observation = observation_
        done = terminated or truncated
    
    return tuples

def generate_episode_exploring_starts(env: gym.Env, policy):
    env.reset()
    env.unwrapped.s = env.observation_space.sample()

    initial_state = env.unwrapped.s
    initial_action = env.action_space.sample()

    tuples = []
    state = initial_state
    action = initial_action

    while True:
        next_state, reward, terminated, truncated, _ = env.step(action)
        tuples.append((state,action,reward))

        if terminated or truncated:
            break

        state = next_state
        action = policy[state]

    return tuples

In [186]:
def first_visit_MC_prediction(env: gym.Env, policy, episodes:int=10_000, gamma: float = 0.9):
    values = defaultdict(float)
    counts = defaultdict(int)

    for _ in range(episodes):
        episode = generate_episode(env, policy)
        g = 0

        # Pre-calculate first indices
        first_visit_idx = {}
        for idx, (state,_,_) in enumerate(episode):
            if state not in first_visit_idx:
                first_visit_idx[state] = idx

        for i in range(len(episode) -1, -1, -1):
            state, action, reward = episode[i]
            g = gamma * g + reward

            # Checks if this is truly first visit
            if i == first_visit_idx.get(state):
                counts[state] += 1

                # Avg mean update, like in multi armed bandits
                values[state] += (g - values[state]) / counts[state] 

    return values

first_visit_MC_prediction(env, example_policy)

defaultdict(float,
            {(15, 10, 0): -0.7855927051671727,
             (19, 9, 0): 0.3098591549295776,
             (21, 9, 1): -0.6455172413793104,
             (13, 2, 0): -0.41538461538461535,
             (6, 2, 0): -0.14294117647058827,
             (21, 8, 0): -1.0,
             (19, 8, 0): -0.9873015873015875,
             (15, 8, 0): -0.9130379746835443,
             (19, 2, 0): -0.9016666666666664,
             (14, 2, 0): -0.6563013698630137,
             (20, 9, 0): -0.9938053097345133,
             (16, 2, 0): -0.1578947368421053,
             (12, 2, 0): -0.4028985507246377,
             (17, 1, 0): -0.6582278481012654,
             (10, 1, 0): -0.2337,
             (7, 1, 0): -0.26505,
             (8, 10, 0): -0.5698924731182793,
             (19, 10, 0): -0.06415094339622632,
             (20, 6, 0): 0.7304347826086958,
             (20, 6, 1): -0.13153846153846152,
             (15, 1, 0): -0.5574444444444444,
             (17, 2, 0): -0.14117647058823532,
    

In [None]:
# Exploring Starts and following GPI: Evaluation -> Improvement
def monte_carlo_ES(env: gym.Env, episodes:int=10_000, gamma: float = 0.9):
    policy = defaultdict(lambda: env.action_space.sample())
    q_values = defaultdict(float)
    counts = defaultdict(int)

    for _ in range(episodes):
        episode = generate_episode_exploring_starts(env, policy)
        g = 0

        # Pre-calculate first indices
        first_visit_idx = {}
        for idx, (state,action,_) in enumerate(episode):
            if (state,action) not in first_visit_idx:
                first_visit_idx[(state,action)] = idx

        for i in range(len(episode) -1, -1, -1):
            state, action, reward = episode[i]
            g = gamma * g + reward

            # Checks if this is truly first visit
            if i == first_visit_idx.get((state,action)):
                counts[(state,action)] += 1

                # Avg mean update, like in multi armed bandits
                q_values[(state,action)] += (g - q_values[(state,action)]) / counts[(state,action)] 

                # Policy improvement step
                max_a = env.action_space.sample()
                max_q_val = q_values[(state,max_a)]
                for a in range(env.action_space.n):
                    if q_values[(state,a)] > max_q_val:
                        max_q_val = q_values[(state,a)]
                        max_a = a

                policy[state] = max_a

    return policy

monte_carlo_ES_optimized_policy = monte_carlo_ES(env, episodes=500_000, gamma=1.0)

In [188]:
# Visualization of the policy
for player_sum in range(21, 11, -1):
    for dealer_card in range(1, 11):
        state = (player_sum, dealer_card, False)
        print(monte_carlo_ES_optimized_policy[state], end=" ")
    print(f" <- {player_sum}")

0 0 0 0 0 0 0 0 0 0  <- 21
0 0 0 0 0 0 0 0 0 0  <- 20
0 0 0 0 0 0 0 0 0 0  <- 19
0 0 0 0 0 0 0 0 0 0  <- 18
0 0 0 0 0 0 0 0 0 0  <- 17
0 0 0 0 0 0 0 0 1 0  <- 16
1 0 0 0 0 0 1 1 1 0  <- 15
1 0 0 0 0 0 1 1 1 1  <- 14
0 0 0 0 0 0 0 1 0 0  <- 13
1 0 0 0 0 0 1 1 1 1  <- 12


In [None]:
# We benchmark policies
def get_avg_reward(env: gym.Env, policy, iterations=10_000):
    total_reward = 0
    for _ in range(iterations):
        done = False
        observation, _ = env.reset()
        while not done:
            action = policy[observation]
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            done = terminated or truncated
    return total_reward / iterations

random_policy_avg_reward = get_avg_reward(env, example_policy) 
print(f"Got {random_policy_avg_reward:.3} from random policy")
, gamma=1.0
MC_ES_policy_avg_reward = get_avg_reward(env, monte_carlo_ES_optimized_policy) 
print(f"Got {MC_ES_policy_avg_reward:.3} from monte carlo exploring starts optimized policy")


Got -0.44 from random policy
Got -0.136 from monte carlo exploring starts optimized policy
