# Blackjack using Q-learning

In this notebook I use the original (tabular) Q-learning algorithm to learn a policy for playing blackjack, and compare the agent's win rate before and after training.

In [11]:
import gymnasium as gym
import random
import collections
from tqdm import tqdm
# Shared Blackjack environment: Agent.step / Agent.play_game read this global `env`,
# and the last cell closes it.
env = gym.make('Blackjack-v1')

In [None]:
class Agent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Q-values are stored in a ``defaultdict(float)`` keyed by ``(state, action)``,
    so every unseen pair starts at 0.0.

    Args:
        lr: Learning rate (alpha) applied to each temporal-difference update.
        discount_factor: Discount (gamma) applied to the bootstrapped value of
            the next state.
        eps: Probability of picking a uniformly random action instead of the
            greedy one.
        env: Optional environment exposing Gymnasium-style ``reset()`` and
            ``step(action)``. When omitted, the module-level ``env`` is looked
            up at call time, which is what the rest of this notebook relies on.
    """

    def __init__(self, lr, discount_factor, eps, env=None):
        self.state = None
        self.qtable = collections.defaultdict(float)
        self.alpha = lr
        self.gamma = discount_factor
        self.eps = eps
        self.env = env

        # Blackjack-v1 has two discrete actions (0 = stick, 1 = hit per the
        # Gymnasium documentation).
        self.action_space = [0, 1]
        self.frozen = False

    def _environment(self):
        """Return the injected environment, or the notebook-global ``env``."""
        return self.env if self.env is not None else env

    def freeze(self, b = True):
        """Enable (``b=True``) or disable (``b=False``) learning freeze.

        While frozen, ``update_q_table`` leaves the Q-table values untouched.
        """
        self.frozen = b

    def reset(self, state):
        """Set the agent's current state, typically at the start of an episode."""
        self.state = state

    def select_action(self):
        """Pick an action for ``self.state`` with an epsilon-greedy rule.

        NOTE: exploration is applied even while the agent is frozen, so win
        rates measured through ``play_game`` include ``eps``-random moves.
        """
        if random.uniform(0, 1) < self.eps:
            # Explore: uniformly random action.
            return random.choice(self.action_space)
        # Exploit: greedy action for the current state.
        return self.get_best_action(self.state)

    def get_best_action(self, state):
        """Return the action with the highest Q-value in ``state``.

        Ties are resolved in favour of the first action in ``action_space``
        (``list.index`` returns the first maximum).
        """
        qs = [self.qtable[(state, action)] for action in self.action_space]
        return qs.index(max(qs))

    def update_q_table(self, reward, new_state, action, terminal=False):
        """Apply one Q-learning update for the transition just taken.

        Args:
            reward: Reward received for taking ``action`` in ``self.state``.
            new_state: Observation returned by the environment.
            action: Action that was taken in ``self.state``.
            terminal: True when the transition ended the episode. In that case
                there is no future return, so the target is just ``reward``.
                This matters for blackjack: after "stick" the terminal
                observation equals the current state, so bootstrapping from it
                would feed the state's own Q-values back into its target.
        """
        if self.frozen:
            return

        if terminal:
            future_value = 0.0
        else:
            best_action = self.get_best_action(new_state)
            future_value = self.qtable[(new_state, best_action)]

        key = (self.state, action)
        td_target = reward + self.gamma * future_value
        self.qtable[key] += self.alpha * (td_target - self.qtable[key])

    def step(self):
        """Take one environment step, learn from it and advance ``self.state``.

        Returns:
            Tuple ``(terminated, truncated, reward)`` from the environment.
        """
        action = self.select_action()
        observation, reward, terminated, truncated, _ = self._environment().step(action)
        # Only a true termination zeroes the bootstrap term; a truncated
        # episode could have continued, so its next-state value is still used.
        # update_q_table is a no-op while the agent is frozen.
        self.update_q_table(reward, observation, action, terminal=terminated)
        self.state = observation
        return terminated, truncated, reward

    def play_game(self):
        """Play one full episode and classify its outcome.

        Returns:
            ``'won'`` if the final reward is positive, ``'lost'`` if it is
            negative, and ``'draw'`` otherwise.
        """
        state, _ = self._environment().reset()
        self.reset(state)
        terminated, truncated = False, False
        reward = 0.0
        while not (terminated or truncated):
            # Drive *this* instance, not whatever the global `agent` happens to be.
            terminated, truncated, reward = self.step()

        if reward > 0:
            return 'won'
        if reward < 0:
            return 'lost'
        return 'draw'

# Global agent instance used by get_win_rate() and the training cell.
# NOTE(review): discount_factor=0.1 heavily shrinks the value propagated back through a
# non-terminal step; blackjack episodes are short, so a value near 1.0 may be
# more appropriate — confirm before changing, since the recorded outputs depend on it.
agent = Agent(lr = 0.1, discount_factor=0.1, eps=0.05)

In [13]:
def get_win_rate(n_games=100000, player=None):
    """Estimate the agent's win rate, in percent, over ``n_games`` episodes.

    The agent is frozen before playing so evaluation does not modify its
    Q-values. It is left frozen afterwards; call ``freeze(False)`` to resume
    training.

    Args:
        n_games: Number of evaluation episodes to play (default 100000).
        player: Object exposing ``freeze()`` and ``play_game()``. Defaults to
            the notebook-global ``agent``.

    Returns:
        Percentage (0-100) of episodes whose outcome was ``'won'``.

    Raises:
        ValueError: If ``n_games`` is not positive.
    """
    if n_games <= 0:
        raise ValueError("n_games must be a positive integer")
    if player is None:
        player = agent

    player.freeze()
    wins = sum(1 for _ in range(n_games) if player.play_game() == 'won')
    return 100 * wins / n_games

# Baseline: evaluate the untrained agent (all Q-values still at their 0.0 default).
win_rate_before = get_win_rate()
print(f'The agent BEFORE training has a winrate of {win_rate_before}%')

The agent BEFORE training has a winrate of 38.052%


In [14]:
# get_win_rate() leaves the agent frozen, so re-enable learning before training.
agent.freeze(b=False)
training_games = range(2000000)
for _game in tqdm(training_games):
    agent.play_game()

100%|██████████| 2000000/2000000 [01:06<00:00, 30284.06it/s]


In [15]:
# Re-evaluate with the trained Q-table, then release the environment.
win_rate_after = get_win_rate()
print(f'The agent AFTER training has a winrate of {win_rate_after}%')
env.close()

The agent AFTER training has a winrate of 41.775%
