This was written using Python 3.11.3 as kernel. 
Installation of the Open AI Gym package is necessary (run "pip install gym; pip install gym[atari]").

In [1]:
import numpy as np
import gym
from tqdm import tqdm, trange

In [2]:
env = gym.make('Blackjack-v1')

The next two cells can be used to manually play the game. env.reset() resets the game and gives a tuple containing the player hand, the dealers card up, and if the player has a usable ace. env.step Lets you take an action. Using it with argument '1' makes you hit a new card, argument '0' makes you stick. It's output is the same tuple as env.reset(), followed by the reward for last action, i.e. -1 for an action that made you lose (busting or sticking but having less than the dealer), 0 if you tie after sticking or not busting after hitting, and +1 for winning (sticking without busting but still having more than the dealer).

In [8]:
env.reset()

((10, 8, False), {})

In [9]:
env.step(1)

((18, 8, False), 0.0, False, False, {})

From here on, the Reinforcement Learning (RL) agent is implemented.

In [3]:
# Define the state space
player_sum_space = range(4, 22)  # possible player hand values (4, 5, ..., 20, 21)
dealer_card_space = range(1, 11)  # possible dealer up card values (2, 3, ..., 10, 11)
usable_ace_space = [False, True]  # whether the player has a usable ace

state_space = []
for player_sum in player_sum_space:
    for dealer_card in dealer_card_space:
        for usable_ace in usable_ace_space:
            state_space.append((player_sum, dealer_card, usable_ace))

# This creates an array of length 306, which is the number of possible states in the game of blackjack. 
# Each state is a tuple of the player’s sum, the dealer’s card, and whether the player has a usable ace.

# Define the action space
action_space = [0, 1]  # hit or stick

# Get the size of the state and action spaces
num_states = len(state_space)
num_actions = len(action_space)

In [5]:
# Initialize the Q-table with initial values. 
Q_table_init = np.zeros((num_states, num_actions))

for i in trange(10000):
    # Initialize the state
    state = env.reset()[0]
    done = False

    while not done:
        # Choose an action
        action = np.random.choice(action_space)

        # Take the action
        next_state, reward, done, terminal, dic = env.step(action)

        # Update Q-table
        Q_table_init[state_space.index(state)][action] += reward

        # Update state
        state = next_state

100%|██████████| 10000/10000 [00:01<00:00, 7292.19it/s]


In [6]:
### Retrain the agent. Takes about 15 seconds to run on my machine. ###
# Set Q_table
Q_table = Q_table_init

# Hyperparameters
num_episodes = 100000  # Total number of episodes
alpha = 0.075  # Learning rate
gamma = 0.90  # Discount factor
epsilon = 0.1  # Epsilon-greedy parameter

# Training loop
for episode in trange(num_episodes):
    state = env.reset()[0]
    done = False
    while not done:
        # Epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = np.random.choice(action_space)
        else:
            action = np.argmax(state_space.index(state))

        # Take the action
        next_state, reward, done, terminal, dic = env.step(action)

        # Update Q-value if you busted, necessary because Q_table(next_state) does not exist if you bust
        if next_state not in state_space:
            Q_table[state_space.index(state)][action] += alpha * (reward - Q_table[state_space.index(state)][action])
            break

        # Update Q-value for current state-action pair
        Q_table[state_space.index(state)][action] += alpha * (reward + gamma * np.max(Q_table[state_space.index(next_state)]) - Q_table[state_space.index(state)][action])

        state = next_state

100%|██████████| 100000/100000 [00:13<00:00, 7175.34it/s]


In [18]:
### Make the agent play many games and check winnning rate. Takes about 2 minutes and 5 seconds to run on my machine ###

num_games = 1000000
num_wins = 0
num_draws = 0
num_losses = 0

for i in trange(num_games):
    state = env.reset()[0]
    done = False
    while not done:
        action = np.argmax(Q_table[state_space.index(state)])
        next_state, reward, done, terminal, dic = env.step(action)
        state = next_state
        if done and reward == 1:
            num_wins += 1
        elif done and reward == 0:
            num_draws += 1
        elif done and reward == -1:
            num_losses += 1

num_wins/(num_games-num_draws) # Winrate excluding draws

100%|██████████| 1000000/1000000 [02:06<00:00, 7890.38it/s]


0.4309124974614634

In [15]:
# Check winnning rate of random action policy
# This has been run on a large amount of games so it is a good estimate of the win rate. Don't run it again, it takes a long time.

num_games = 1000000
num_wins = 0
num_draws = 0
num_losses = 0

for i in trange(num_games):
    state = env.reset()[0]
    done = False
    while not done:
        action = np.random.choice(action_space)
        next_state, reward, done, terminal, dic = env.step(action)
        state = next_state
        if done and reward == 1:
            num_wins += 1
        elif done and reward == 0:
            num_draws += 1
        elif done and reward == -1:
            num_losses += 1

num_wins/(num_games-num_draws) # Winrate excluding draws

100%|██████████| 1000000/1000000 [02:05<00:00, 7940.03it/s]


0.29292192864980326

In [12]:
# Look for winrates with different hyperparameters, should run in about 60 minutes on my machine

# Hyperparameter ranges
num_episodes = 100000  # Total number of episodes
num_games = 100000 # Number of games to play to check winrate
alphas = np.arange(0.02,0.12,0.02)  # Learning rate
gammas = np.arange(0.89,1.01,0.02)  # Discount factor
epsilons = np.arange(0.05,0.35,0.05)  # Epsilon-greedy parameter

winrates=np.zeros((len(alphas),len(gammas),len(epsilons)))

# Training loop
for ii in trange(len(alphas)):
    alpha=alphas[ii]
    for jj in range(len(gammas)):
        gamma=gammas[jj]
        for kk in range(len(epsilons)):
            epsilon=epsilons[kk]
            # Set Q_table
            Q_table = Q_table_init

            for episode in range(num_episodes):
                state = env.reset()[0]
                done = False
                while not done:
                    # Epsilon-greedy action selection
                    if np.random.random() < epsilon:
                        action = np.random.choice(action_space)
                    else:
                        action = np.argmax(state_space.index(state))

                    # Take the action
                    next_state, reward, done, terminal, dic = env.step(action)

                    # Update Q-value if you busted, necessary because Q_table(next_state) does not exist if you bust
                    if next_state not in state_space:
                        Q_table[state_space.index(state)][action] += alpha * (reward - Q_table[state_space.index(state)][action])
                        break

                    # Update Q-value for current state-action pair
                    Q_table[state_space.index(state)][action] += alpha * (reward + gamma * np.max(Q_table[state_space.index(next_state)]) - Q_table[state_space.index(state)][action])

                    state = next_state
            num_games = 100000
            num_wins = 0
            num_draws = 0
            num_losses = 0

            for i in range(num_games):
                state = env.reset()[0]
                done = False
                while not done:
                    action = np.argmax(Q_table[state_space.index(state)])
                    next_state, reward, done, terminal, dic = env.step(action)
                    state = next_state
                    if done and reward == 1:
                        num_wins += 1
                    elif done and reward == 0:
                        num_draws += 1
                    elif done and reward == -1:
                        num_losses += 1

            winrates[ii,jj,kk]=num_wins/(num_games-num_draws) # Winrate excluding draws

100%|██████████| 5/5 [1:19:20<00:00, 952.10s/it]


In [13]:
winrates # Open in a text editor to see all the data

array([[[0.3791973 , 0.3800415 , 0.40833786, 0.40786464, 0.40666139,
         0.42847636],
        [0.41961583, 0.40904543, 0.42064424, 0.42810593, 0.42096744,
         0.42712573],
        [0.42123593, 0.41707953, 0.42605964, 0.40270625, 0.42272266,
         0.42665969],
        [0.42035105, 0.42533449, 0.42635075, 0.41500554, 0.42425466,
         0.42863056],
        [0.42488498, 0.40868698, 0.42779173, 0.40948341, 0.42038529,
         0.43365791],
        [0.41835036, 0.41183618, 0.42962012, 0.43863763, 0.43040101,
         0.43872549]],

       [[0.37888426, 0.38675466, 0.38728473, 0.41420336, 0.4292439 ,
         0.4249016 ],
        [0.42667788, 0.4244682 , 0.42601699, 0.43682477, 0.43508541,
         0.42229424],
        [0.42340555, 0.40809392, 0.40164484, 0.43399416, 0.41530001,
         0.40576694],
        [0.40627516, 0.42878945, 0.43295112, 0.41021724, 0.44644604,
         0.40308846],
        [0.4088886 , 0.43767108, 0.44060232, 0.40983695, 0.41708341,
         0.43266454

In [9]:
alphas

array([0.02, 0.04, 0.06, 0.08])