In [1]:
import gymnasium as gym
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict


# Create the Blackjack environment used by this notebook.
# NOTE(review): sab=True presumably selects the Sutton & Barto rule variant --
# confirm against the Gymnasium Blackjack-v1 documentation.
env = gym.make('Blackjack-v1', sab=True)

# Reset the environment to generate the first observation.
# seed=42 is passed to this first reset; later env.reset() calls in this file
# pass no seed.
observation, info = env.reset(seed=42)

# Action-selection policies: two hand-coded baselines and an epsilon-greedy rule

def better_hc_policy(obs):
    """Hand-coded policy driven by the player's total and the dealer's value.

    Args:
        obs: Three-item observation, unpacked as (player total, dealer value,
            usable-ace flag). The usable-ace flag is not consulted.

    Returns:
        1 or 0. NOTE(review): presumably 1 = hit and 0 = stick under
        Blackjack-v1 -- confirm against the environment's action encoding.
    """
    hand_total, dealer_value, _ace_flag = obs

    # Any total below 12 always yields action 1.
    if hand_total < 12:
        return 1

    # For the remaining totals, pick the dealer values that trigger action 1.
    if hand_total == 12:
        trigger_values = (2, 3, 7, 8, 9, 10)
    elif hand_total in (13, 14, 15, 16):
        trigger_values = (7, 8, 9, 10)
    else:
        # Totals of 17 and above never trigger action 1.
        trigger_values = ()

    if dealer_value in trigger_values:
        return 1
    return 0

def anotha_one(obs):
    """Threshold policy: return 0 once the player's total reaches 20, else 1.

    Args:
        obs: Three-item observation, unpacked as (player total, dealer value,
            usable-ace flag). Only the player total is used.

    Returns:
        0 when the player total is at least 20, otherwise 1.
    """
    player_total, _dealer_value, _ace_flag = obs
    if player_total >= 20:
        return 0
    return 1

def epsilon_greedy(Q_func, S, epsilon):
    """Choose an action epsilon-greedily from the action values of state S.

    Args:
        Q_func: Mapping from state to a sequence of action values.
        S: State key to look up in Q_func.
        epsilon: Exploration threshold compared against a uniform draw.

    Returns:
        The argmax index of Q_func[S] when the uniform draw exceeds epsilon,
        otherwise a uniformly random choice between 0 and 1.
    """
    draw = np.random.random()

    # Exploit: the draw landed above the exploration threshold.
    if draw > epsilon:
        greedy_action = np.argmax(Q_func[S])
        return greedy_action

    # Explore: pick one of the two actions at random.
    return np.random.choice([0, 1])

In [1]:
def generate_episode(policy, env, Q_func, N_s, _gamma, _lambda, N_0=100):
    """Roll out one episode, choosing actions with a visit-count-decayed epsilon.

    At every step the visit count of the current state is incremented and the
    exploration rate is set to ``N_0 / (N_0 + N_s[state])`` before the policy
    is queried.

    The previous version also tried to compute a TD error and an eligibility
    trace inside the loop. That code indexed ``states[-2]`` on the first step
    (IndexError) and read a local ``E_trace`` that was never assigned, and its
    results were never used or returned. It has been removed so the function
    actually produces a trajectory.

    Args:
        policy: Callable ``policy(Q_func, observation, epsilon)`` returning an
            action.
        env: Environment exposing ``reset()`` -> ``(observation, info)`` and
            ``step(action)`` -> ``(observation, reward, terminated, truncated,
            info)``.
        Q_func: Action-value table, passed through to ``policy`` untouched.
        N_s: Mutable mapping of state -> visit count; updated in place. It must
            supply 0 for unseen states (e.g. ``defaultdict(int)``).
        _gamma: Unused; kept so existing call sites keep working.
        _lambda: Unused; kept so existing call sites keep working.
        N_0: Constant controlling how quickly epsilon decays with visits.

    Returns:
        Tuple ``(states, actions, rewards)`` of equal-length lists, where
        ``rewards[t]`` is the reward received after taking ``actions[t]`` in
        ``states[t]``.
    """
    states, actions, rewards = [], [], []
    observation, info = env.reset()

    while True:
        # Epsilon shrinks as the current state is visited more often.
        N_s[observation] += 1
        epsilon = N_0 / (N_0 + N_s[observation])
        action = policy(Q_func, observation, epsilon)

        states.append(observation)
        actions.append(action)

        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)

        if terminated or truncated:
            break

    return states, actions, rewards

def Sarsa(policy, env, n_ep, _gamma=0.95, _lambda=0.9, N_0=100):
    """Learn action values with backward-view Sarsa(lambda), accumulating traces.

    For every transition (S, A, R, S', A') the TD error
    ``R + gamma * Q[S'][A'] - Q[S][A]`` is computed (with ``Q[S'][A'] = 0``
    when the episode terminated), the trace of (S, A) is incremented, and every
    traced pair is moved by ``(1 / N[s][a]) * td_error * trace`` before its
    trace decays by ``gamma * lambda``. Traces are reset at the start of each
    episode; visit counts and Q persist across episodes.

    This replaces a version that could not run: it indexed ``states[-2]`` on
    the first step, bootstrapped from the current pair instead of the next one,
    multiplied a list by a float when decaying traces, referenced an undefined
    index ``i``, and never applied the TD error to Q.

    Args:
        policy: Callable ``policy(Q_func, observation, epsilon)`` returning an
            action index (0 or 1).
        env: Environment exposing ``reset()`` -> ``(observation, info)`` and
            ``step(action)`` -> ``(observation, reward, terminated, truncated,
            info)``. Observations must be hashable.
        n_ep: Number of episodes to run.
        _gamma: Discount factor.
        _lambda: Trace-decay parameter.
        N_0: Constant in the exploration schedule
            ``epsilon = N_0 / (N_0 + N_s[state])``.

    Returns:
        ``defaultdict`` mapping each state to a two-element list of action
        values.
    """
    Q_func = defaultdict(lambda: [0.0] * 2)
    N = defaultdict(lambda: [0] * 2)   # per (state, action) visit counts -> step size
    N_s = defaultdict(int)             # per state visit counts -> epsilon schedule

    def _select_action(state):
        # Epsilon decays with the number of times this state has been visited.
        N_s[state] += 1
        epsilon = N_0 / (N_0 + N_s[state])
        return policy(Q_func, state, epsilon)

    for _ in range(n_ep):
        # Eligibility traces only carry credit within a single episode.
        E_trace = defaultdict(lambda: [0.0] * 2)

        state, info = env.reset()
        action = _select_action(state)

        while True:
            next_state, reward, terminated, truncated, info = env.step(action)

            if terminated:
                # No future value after a true terminal state.
                next_action = None
                next_q = 0.0
            else:
                # On-policy: bootstrap from the action that will actually be
                # taken next (also used when the episode was merely truncated).
                next_action = _select_action(next_state)
                next_q = Q_func[next_state][next_action]

            TD_delta = reward + _gamma * next_q - Q_func[state][action]

            N[state][action] += 1
            E_trace[state][action] += 1

            # Every pair with a non-zero trace was visited this episode, so its
            # N entry is at least 1 and the step size is well defined.
            for traced_state, traces in E_trace.items():
                for a, trace in enumerate(traces):
                    if trace:
                        step_size = 1 / N[traced_state][a]
                        Q_func[traced_state][a] += step_size * TD_delta * trace
                        traces[a] = _gamma * _lambda * trace

            if terminated or truncated:
                break

            state, action = next_state, next_action

    return Q_func