In [6]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import pickle #allows to serialize and deserialize Python objects.
from mpl_toolkits.mplot3d import Axes3D
from tqdm import trange,tqdm #allows to create progress bars
import os

# Directory that holds the pickled Q-tables and win-rate arrays read and
# written by the cells below.
# Set the BLACKJACK_PARAMS_DIR environment variable to point at your own copy
# of the data; when it is unset, the original author's path is used as-is.
directory_path_params = os.environ.get(
    "BLACKJACK_PARAMS_DIR",
    r"C:\Users\Callens.R\Documents\Kms\1MA sem 2\2. DS425 Intelligent decision support methods\Project weemaes_callens\DS425-Blackjack\data\params",
)

This notebook calculates the win rate for a range of reward functions.

In [2]:
# Build the Blackjack-v1 environment; the training and evaluation cells below
# all reset and step this single `env` object.
env = gym.make('Blackjack-v1')

In [15]:
# --- State space -------------------------------------------------------------
# A state is the tuple (player_sum, dealer_card, usable_ace).
player_sum_space = range(4, 22)   # player hand totals 4, 5, ..., 21
dealer_card_space = range(1, 11)  # dealer's visible card values 1, 2, ..., 10
usable_ace_space = [False, True]  # whether the player holds a usable ace

# Enumerate every combination: player sum varies slowest, usable ace fastest.
# 18 sums * 10 dealer cards * 2 ace flags = 360 states; a state's position in
# this list is the row used for it in the Q-table.
state_space = [
    (total, upcard, has_ace)
    for total in player_sum_space
    for upcard in dealer_card_space
    for has_ace in usable_ace_space
]

# --- Action space ------------------------------------------------------------
# 0 = stick, 1 = hit (the threshold policy later in this notebook plays 1 on
# low hand totals and 0 otherwise).
action_space = [0, 1]

# Sizes of both spaces, used to shape the Q-table.
num_states, num_actions = len(state_space), len(action_space)

This cell computes the Q-table for the +1/-1 rewards with its optimal hyperparameters. The Q-table is also provided as a .pkl file, so this cell does not need to be run again.

In [None]:
# Tabular Q-learning on the Blackjack environment.
# Requires `env`, `state_space`, `action_space`, `num_states`, `num_actions`
# and `directory_path_params` from the cells above.

# Hyperparameters
num_episodes = 5000000  # Total number of episodes
alpha = 0.01  # Learning rate
gamma = 0.2  # Discount factor
epsilon = 0.41  # Epsilon-greedy parameter: probability of a random action

# One row per state, one column per action, all values start at zero.
Q_table = np.zeros((num_states, num_actions))

# state -> Q-table row, built once. `state_space.index(state)` would rescan
# the whole list on every lookup, several times per step.
state_to_row = {s: row for row, s in enumerate(state_space)}

# Training loop
for episode in trange(num_episodes):
    state = env.reset()[0]  # reset() returns a tuple; element 0 is the state
    done = False
    while not done:
        row = state_to_row[state]

        # Epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = np.random.choice(action_space)
        else:
            # Greedy choice: the best-valued action of the current state.
            # (Taking argmax of the bare row index always yields 0.)
            action = np.argmax(Q_table[row])

        # Take the action (the 4th and 5th return values are not used here)
        next_state, reward, done, terminal, dic = env.step(action)

        # A busted hand is not part of state_space, so it has no Q-table row.
        next_row = state_to_row.get(next_state)

        # The episode is over when `done` is set or the hand left the state
        # space; in both cases there is no future value to bootstrap from.
        if done or next_row is None:
            target = reward
        else:
            target = reward + gamma * np.max(Q_table[next_row])

        # Update Q-value for current state-action pair
        Q_table[row][action] += alpha * (target - Q_table[row][action])

        if next_row is None:
            break

        state = next_state

# Persist the learned table so the evaluation cell can load it.
filename = f"{directory_path_params}/Q_table_10_10_ownhyperparams.pkl"
with open(filename, 'wb') as f:
    pickle.dump(Q_table,f)

This section computes the average win rate (over 100 000 games) of 4 policies, 1023 times each (takes a LONG time!).

In [5]:
# Win-rate evaluation of four policies.
# Requires `env`, `state_space`, `action_space` and `directory_path_params`
# from the cells above; run those first, otherwise this cell fails with a
# NameError.

# Hyperparameters (values kept unchanged; nothing in this cell reads them)
alpha = 0.05  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.25  # Epsilon-greedy parameter

num_games = 100000  # games played for one win-rate estimate
num_runs = 1023     # independent win-rate estimates per policy

# state -> Q-table row, built once instead of calling state_space.index(...)
# (a linear scan) on every step of every game.
state_to_row = {s: row for row, s in enumerate(state_space)}


def load_q_table(path):
    """Return the Q-table unpickled from `path`.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def greedy_policy(q_table):
    """Return a policy that plays the highest-valued action of `q_table`."""
    def policy(state):
        return np.argmax(q_table[state_to_row[state]])
    return policy


def random_policy(state):
    """Pick one of the actions in `action_space` at random."""
    return np.random.choice(action_space)


def threshold_policy(state):
    """Play action 1 while the player sum is below 16, otherwise action 0.

    NOTE(review): the results are saved as "winrate_dealer_policy.pkl";
    confirm that 16 is the intended threshold for a dealer-style policy.
    """
    return 1 if state[0] < 16 else 0


def play_games(policy, n_games):
    """Play `n_games` episodes with `policy`; return (wins, losses, draws).

    A reward of 1 counts as a win, -1 as a loss, and a reward of 0 on the
    final step of an episode as a draw.
    """
    wins = losses = draws = 0
    for _ in range(n_games):
        state = env.reset()[0]
        done = False
        while not done:
            state, reward, done, terminal, dic = env.step(policy(state))
            if reward == 1:
                wins += 1
            elif reward == -1:
                losses += 1
            elif done and reward == 0:
                draws += 1
    return wins, losses, draws


def evaluate_policy(policy, output_name):
    """Estimate the win rate of `policy` `num_runs` times and pickle the result.

    Each estimate is wins / (games - draws) over `num_games` games. The
    returned array has shape (1, num_runs) so that every column is filled;
    sizing it larger than the number of runs would leave trailing zeros that
    look like real (0 %) win rates.
    The array is written to `output_name` inside `directory_path_params`.
    """
    rates = np.zeros((1, num_runs))
    for j in trange(num_runs):
        wins, losses, draws = play_games(policy, num_games)
        decided = num_games - draws
        # Undefined when every game was a draw; store NaN instead of crashing.
        rates[0, j] = wins / decided if decided else float('nan')

    filename = f"{directory_path_params}/{output_name}"
    with open(filename, 'wb') as f:
        pickle.dump(rates, f)
    return rates


# +1.2 -0.8 Q-table agent
Q_2 = load_q_table(f"{directory_path_params}/episodes4999999_q_table_alpha0.01_gamma0.5_epsilon0.25.pkl")
Q_table = Q_2
win_rates = evaluate_policy(greedy_policy(Q_table), "winrate_12_08.pkl")

# +1.0 -1.0 Q-table agent
Q_2 = load_q_table(f"{directory_path_params}/Q_table_10_10_ownhyperparams.pkl")
Q_table = Q_2
win_rates = evaluate_policy(greedy_policy(Q_table), "winrate_10_10.pkl")

# Random policy
win_rates = evaluate_policy(random_policy, "winrate_random.pkl")

# Fixed policy
win_rates = evaluate_policy(threshold_policy, "winrate_dealer_policy.pkl")

  0%|          | 0/1023 [00:00<?, ?it/s]


NameError: name 'state_space' is not defined