In [56]:
# Some imports, utils and boilerplate

import gymnasium as gym
import numpy as np
from collections import defaultdict
import math

env = gym.make("Blackjack-v1")

random_policy = defaultdict(lambda: {env.action_space.sample(): 1.0})

def get_action_from_policy(policy: dict[Any, dict[Any, float]], state: Any) -> float:
    return np.random.choice(a=list(policy[state].keys()),p=list(policy[state].values()))

actions = [0, 1]
def generate_epsilon_soft_policy_on_state(epsilon: float):
    policy_on_state = {
        action: epsilon / len(actions) for action in actions
    }
    policy_on_state[np.random.choice(actions)] += 1 - epsilon

    return policy_on_state

def q_argmax_e_soft(q_values: dict[Any, float], state: Any, epsilon: float):
    policy_on_state = {
        action: epsilon / len(actions) for action in actions
    }

    max_a = actions[0]
    max_q = -math.inf
    for i in actions: 
        if q_values[i] > max_q:
            max_q = q_values[i]
            max_a = i

    policy_on_state[max_a] += 1 - epsilon

    return policy_on_state

def q_max(q_values: dict[Any, float], state: Any):
    return max(q_values[state, a] for a in actions)

def get_avg_reward(env: gym.Env, policy, iterations=10_000):
    total_reward = 0
    for _ in range(iterations):
        done = False
        observation, _ = env.reset()
        while not done:
            action = get_action_from_policy(policy, observation)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            done = terminated or truncated
    return total_reward / iterations

In [57]:
# Solving the prediction problem
def prediction_td(policy, gamma=0.9, step_size=0.1, episodes = 1_000):
    V = defaultdict(float)

    for episode in range(episodes):
        done = False
        state, _ = env.reset()
        while not done:
            action = get_action_from_policy(policy, state)
            state_, reward, terminated, truncated, info = env.step(action)

            # Update rule
            V[state] += step_size * (reward + gamma * V[state_] - V[state])

            state = state_
            done = terminated or truncated

    return V

prediction_td(random_policy)

defaultdict(float,
            {(16, 5, 1): 0.0,
             (19, 5, 1): -0.1,
             (8, 1, 0): -0.490099501,
             (16, 1, 0): -0.8648275251635912,
             (12, 7, 1): 0.0,
             (21, 7, 1): 0.490099501,
             (16, 10, 0): -0.7247294447581705,
             (10, 6, 0): 0.0011914199999999986,
             (17, 6, 0): -0.3378509311481712,
             (21, 6, 0): -0.271,
             (31, 6, 0): 0.0,
             (8, 2, 0): -0.29701,
             (18, 8, 0): -0.09892158605989998,
             (16, 6, 0): -0.25860435264090004,
             (22, 6, 0): 0.0,
             (21, 3, 1): 0.6793465209301001,
             (20, 6, 0): 0.7687679241635911,
             (12, 9, 0): -0.343496163524419,
             (14, 9, 0): -0.199406843409609,
             (20, 9, 0): 0.7725530557207991,
             (16, 3, 0): -0.21683562660289077,
             (14, 10, 0): -1.6181685459851018,
             (18, 3, 0): -0.34636590000000006,
             (19, 3, 0): 0.6784050407806

In [58]:
# On policy TD Control
def sarsa(gamma =0.9, step_size = 0.1, epsilon=0.1, episodes=1_000):
    Q = defaultdict(float)
    policy = defaultdict(lambda: generate_epsilon_soft_policy_on_state(epsilon))

    for episode in range(episodes):
        done = False
        state, _ = env.reset()
        while not done:
            action = get_action_from_policy(policy, state)
            state_, reward, terminated, truncated, info = env.step(action)
            action_ = get_action_from_policy(policy, state_)

            # Update rule
            Q[(state, action)] += step_size * (reward + gamma * Q[(state_, action_)] - Q[(state, action)])
            policy[state] = q_argmax_e_soft(Q, state, epsilon)

            state = state_
            done = terminated or truncated

    return policy

sarsa_optimized_policy = sarsa(gamma=1, step_size=0.01, episodes=1_000_000)

In [59]:
# Off policy TD Control
def q_learning(gamma =0.9, step_size = 0.1, epsilon=0.1, episodes=1_000):
    Q = defaultdict(float)
    policy = defaultdict(lambda: generate_epsilon_soft_policy_on_state(epsilon))

    for episode in range(episodes):
        done = False
        state, _ = env.reset()
        while not done:
            action = get_action_from_policy(policy, state)
            state_, reward, terminated, truncated, info = env.step(action)
            action_ = get_action_from_policy(policy, state_)

            # Update rule
            Q[(state, action)] += step_size * (reward + gamma * q_max(Q, state_) - Q[(state, action)])
            policy[state] = q_argmax_e_soft(Q, state, epsilon)

            state = state_
            done = terminated or truncated

    return policy

q_learning_optimized_policy = q_learning(gamma=1, step_size=0.01, episodes=1_000_000)


In [60]:
random_policy_avg_reward = get_avg_reward(env, random_policy) 
print(f"Got {random_policy_avg_reward:.3} from random policy")

sarsa_avg_reward = get_avg_reward(env, sarsa_optimized_policy) 
print(f"Got {sarsa_avg_reward:.3} from SARSA optimized policy")

q_learning_avg_reward = get_avg_reward(env, q_learning_optimized_policy) 
print(f"Got {q_learning_avg_reward:.3} from Q-learning optimized policy")

Got -0.278 from random policy
Got -0.203 from SARSA optimized policy
Got -0.194 from Q-learning optimized policy
