# In [2]:
import gym
import numpy as np
from collections import defaultdict
from tqdm import tqdm

# In [6]:

class BlackjackAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy over two actions.

    Q-values live in a ``defaultdict`` keyed by ``obs[0]`` (the first element
    of each observation). Every key maps to a length-2 array of action values
    that starts at zero the first time the key is looked up.

    NOTE(review): only ``obs[0]`` is used as the state key, so any remaining
    elements of the observation are ignored -- confirm this is intended.
    """

    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Store the hyperparameters and create an empty Q-table.

        Args:
            learning_rate: Step size applied to each temporal-difference error.
            initial_epsilon: Starting probability of picking a random action.
            epsilon_decay: Amount subtracted from epsilon by ``decay_epsilon``.
            final_epsilon: Lower bound that epsilon never drops below.
            discount_factor: Weight given to the best next-state value.
        """
        self.q_values = defaultdict(lambda: np.zeros(2))
        self.lr = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

    def get_action(self, obs) -> int:
        """Choose an action (0 or 1) for ``obs`` using epsilon-greedy selection.

        With probability ``self.epsilon`` a uniformly random action is drawn;
        otherwise the action with the highest stored Q-value for ``obs[0]`` is
        returned. The result is always a plain Python ``int``.
        """
        state = obs[0]
        if np.random.random() < self.epsilon:
            # np.random.choice yields a NumPy integer; cast it so both
            # branches honour the annotated ``int`` return type.
            return int(np.random.choice([0, 1]))
        return int(np.argmax(self.q_values[state]))

    def update(self, obs, action: int, reward: float, terminated: bool, next_obs):
        """Apply one Q-learning update for the observed transition.

        Args:
            obs: Observation before the action; ``obs[0]`` is the state key.
            action: Index (0 or 1) of the action that was taken.
            reward: Reward received for the transition.
            terminated: Whether the transition ended the episode.
            next_obs: Observation after the action; ``next_obs[0]`` is only
                read when ``terminated`` is false.
        """
        state = obs[0]
        if terminated:
            # No bootstrapping from a terminal state. Skipping the lookup also
            # keeps the defaultdict from gaining an all-zero row for it.
            future_q_value = 0.0
        else:
            future_q_value = np.max(self.q_values[next_obs[0]])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[state][action]
        )
        self.q_values[state][action] += self.lr * temporal_difference

    def decay_epsilon(self):
        """Lower epsilon by ``epsilon_decay``, clamped at ``final_epsilon``."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)