# Blackjack using Q-learning

Now I want to use the original Q-learning algorithm to learn the optimal policy to play blackjack

In [1]:
import gymnasium as gym
import random
import collections
from tqdm import tqdm
env = gym.make('Blackjack-v1')

In [2]:
class QLearnAgent:
    def __init__(self, lr, discount_factor, eps):
        self.state = None
        self.qtable = collections.defaultdict(float)
        self.alpha = lr
        self.gamma = discount_factor
        self.eps = eps

        self.action_space = [0, 1]
        self.frozen = False
    

    def freeze(self, b = True):
        self.frozen = b

    def reset(self, state):
        self.state = state
    
    def select_action(self):
        if random.uniform(0, 1) < self.eps:
            # pick random action
            return random.choice(self.action_space)
        else:
            #pick best action in current state
            return self.get_best_action(self.state)
    
    def get_best_action(self, state):
        qs = [self.qtable[(state, action)] for action in self.action_space]
        action = qs.index(max(qs))
        return action

    
    def update_q_table(self, reward, new_state, action):
        if not self.frozen:
            best_action = self.get_best_action(new_state)
            self.qtable[(self.state, action)] += self.alpha * (reward + self.gamma * self.qtable[(new_state, best_action)] - self.qtable[self.state, action])

    def step(self):
        action = self.select_action()
        observation, reward, terminated, truncated, _ = env.step(action)
        if not self.frozen:
            self.update_q_table(reward, observation, action)
        self.state = observation
        return terminated, truncated, reward
    
    def play_game(self):
        state, _ = env.reset()
        self.reset(state)
        terminated, truncated = False, False
        while not terminated and not truncated:
            terminated, truncated, reward = self.step()
        if terminated or truncated:
            if reward > 0:
                return 'won'
            elif reward < 0:
                return 'lost'
            else:
                return 'draw'

q_learn_agent = QLearnAgent(lr = 0.1, discount_factor=0.1, eps=0.05)

In [3]:
def get_win_rate(agent):
    agent.freeze()
    outcomes = []
    for _ in range(100000):
        outcomes.append(agent.play_game())
    return 100 * sum([1 if outcomes[i] == 'won' else 0 for i in range(len(outcomes))]) / len(outcomes)

In [4]:
print(f'The agent BEFORE training has a winrate of {get_win_rate(q_learn_agent)}%')

The agent BEFORE training has a winrate of 38.485%


In [5]:
q_learn_agent.freeze(False)
for _ in tqdm(range(2000000)):
    q_learn_agent.play_game()

100%|██████████| 2000000/2000000 [01:05<00:00, 30750.16it/s]


In [6]:
print(f'The agent AFTER training has a winrate of {get_win_rate(q_learn_agent)}%')

The agent AFTER training has a winrate of 41.498%


# Simple Agent

I am going to implement a very simple rule-based strategy to see how it performs in comparison

In [7]:
class RuleBasedAgent:
    def __init__(self):
        self.state = None
    
    def freeze(self):
        return 0

    def reset(self, state):
        self.state = state
    
    def select_action(self):
        player_sum, dealer_card, usable_ace = self.state
        # Always hit if under 12
        if player_sum <= 11:
            return 1  # hit
        # Stand on 17 or higher
        if player_sum >= 17:
            return 0  # stick
        # Between 12 and 16
        if 12 <= player_sum <= 16:
            if dealer_card >= 7:
                return 1  # hit
            else:
                return 0  # stick
        return 0
    
    def step(self):
        action = self.select_action()
        observation, reward, terminated, truncated, _ = env.step(action)
        self.state = observation
        return terminated, truncated, reward
    
    def play_game(self):
        state, _ = env.reset()
        self.reset(state)
        terminated, truncated = False, False
        while not terminated and not truncated:
            terminated, truncated, reward = self.step()
        if terminated or truncated:
            if reward > 0:
                return 'won'
            elif reward < 0:
                return 'lost'
            else:
                return 'draw'

rule_based_agent = RuleBasedAgent()

In [8]:
print(f'The simple agent has a winrate of {get_win_rate(rule_based_agent)}%')

The simple agent has a winrate of 42.51%


In [9]:
env.close()