# Blackjack using Q-learning

Now I want to use the original Q-learning algorithm to learn the optimal policy for playing blackjack.

In [7]:
import gymnasium as gym
import random
import collections
from tqdm import tqdm

In [8]:
class Agent:
    """Tabular epsilon-greedy Q-learning agent.

    Q-values are stored in ``qtable`` keyed by ``(state, action)`` tuples;
    pairs that have never been seen default to zero. The agent does not own
    an environment: ``step`` and ``play_game`` use the module-level name
    ``env``, which must be bound to an object exposing Gymnasium-style
    ``reset()`` and ``step(action)`` before either method is called.

    Note that freezing the agent only stops Q-table updates; ``select_action``
    still explores with probability ``eps`` while frozen.
    """

    def __init__(self, lr, discount_factor, eps):
        """Create an agent with an empty Q-table.

        Args:
            lr: Learning rate (alpha) applied to each temporal-difference update.
            discount_factor: Discount (gamma) applied to the next state's value.
            eps: Probability of picking a uniformly random action.
        """
        self.state = None
        # Q-values are floats, so default missing entries to 0.0.
        self.qtable = collections.defaultdict(float)
        self.alpha = lr
        self.gamma = discount_factor
        self.eps = eps

        # Two discrete actions; presumably stick/hit for Blackjack-v1 -- confirm
        # against the environment's documentation.
        self.action_space = [0, 1]
        self.frozen = False

    def freeze(self, b=True):
        """Turn Q-table updates off (``b=True``) or back on (``b=False``)."""
        self.frozen = b

    def reset(self, state):
        """Set the agent's current state, e.g. at the start of an episode."""
        self.state = state

    def select_action(self):
        """Return an epsilon-greedy action for the current state."""
        if random.random() < self.eps:
            # Explore: pick a random action.
            return random.choice(self.action_space)
        # Exploit: pick the best known action in the current state.
        return self.get_best_action(self.state)

    def get_best_action(self, state):
        """Return the action with the highest Q-value in ``state``.

        Ties go to the action listed first in ``action_space``. Looking up a
        pair that is not yet in the table inserts it with the default value.
        """
        # Return the action itself, not its position in the list, so the
        # result stays correct if action_space is not 0..n-1.
        return max(self.action_space, key=lambda action: self.qtable[(state, action)])

    def update_q_table(self, reward, new_state, action, terminal=False):
        """Apply one Q-learning update for ``(self.state, action)``.

        Args:
            reward: Reward received for taking ``action`` in ``self.state``.
            new_state: State observed after the action.
            action: The action that was taken.
            terminal: True when the transition ended the episode. The target
                is then just ``reward``, with no bootstrapped future value.

        Does nothing while the agent is frozen.
        """
        if self.frozen:
            return

        if terminal:
            # No future return after a terminal transition. This matters when
            # the final observation coincides with a live state (presumably
            # the case in Blackjack after the last action -- confirm against
            # the environment docs): bootstrapping would feed the state's own
            # estimate back into its target.
            future_value = 0.0
        else:
            future_value = self.qtable[(new_state, self.get_best_action(new_state))]

        key = (self.state, action)
        td_target = reward + self.gamma * future_value
        self.qtable[key] += self.alpha * (td_target - self.qtable[key])

    def step(self):
        """Take one action in the module-level ``env`` and learn from it.

        Returns:
            Tuple ``(terminated, truncated, reward)`` from ``env.step``.
        """
        action = self.select_action()
        observation, reward, terminated, truncated, _ = env.step(action)
        # Only `terminated` suppresses bootstrapping; a truncated episode is
        # cut short rather than finished, so its next state still has value.
        self.update_q_table(reward, observation, action, terminal=terminated)
        self.state = observation
        return terminated, truncated, reward

    def play_game(self):
        """Play one full episode in the module-level ``env``.

        Returns:
            ``'won'`` if the reward of the final step is positive, otherwise
            ``'lost'`` (a final reward of zero is therefore reported as
            ``'lost'`` as well).
        """
        state, _ = env.reset()
        self.reset(state)
        terminated = truncated = False
        # Both flags start False, so the loop body always runs at least once
        # and `reward` is bound before it is read.
        while not (terminated or truncated):
            # Drive this instance, not whatever the global `agent` points to.
            terminated, truncated, reward = self.step()
        return 'won' if reward > 0 else 'lost'

# Shared agent instance used by the training and evaluation cells below.
agent = Agent(lr=0.1, discount_factor=0.1, eps=0.05)

In [10]:
# Training run. Agent.step / Agent.play_game read the module-level `env`,
# so it has to be bound before the loop starts.
env = gym.make('Blackjack-v1')
# Re-enable learning in case the evaluation cell (which freezes the agent)
# was executed before this one; otherwise re-running this cell trains nothing.
agent.freeze(False)
try:
    for _ in tqdm(range(1_000_000)):
        agent.play_game()
finally:
    # Release the environment even if the long loop is interrupted.
    env.close()

100%|██████████| 1000000/1000000 [00:32<00:00, 30412.65it/s]


In [11]:
# Evaluation run on a rendered environment. The agent is frozen so that
# these games do not modify the Q-table.
env = gym.make('Blackjack-v1', render_mode='human')
agent.freeze()
try:
    for _ in range(10):
        print(agent.play_game())
finally:
    # Always close the environment so the render window is released, even if
    # a game raises or the cell is interrupted.
    env.close()

won
won
won
lost
won
won
lost
lost
lost
lost
