In [None]:
# Import all required dependencies

import gymnasium as gym
import ale_py
import numpy as np
#blackjack = gym.make
gym.register_envs(ale_py)
#for key in gym.envs.registry.keys():
#  print(key)

In [2]:
blackjack_env = gym.make("Blackjack-v1", render_mode="human")
observation, info = blackjack_env.reset()

episode_over = False
while not episode_over:
    # randomly sample an action
    action = blackjack_env.action_space.sample()

    # obtain partially observed state, reward, if the state is done, if we've exceeded time, and other info
    # by stepping through the environment using our action
    observation, reward, terminated, truncated, info = blackjack_env.step(action)
    print(episode_over)

    # if our state is terminal, or we have reached the time limit, end
    episode_over = terminated or truncated

# close our environment.
blackjack_env.close()

False


The above cell contains code to render a blackjack environment, and take random actions on it. It doesn't lead to very good performance. Let's change that using a simple RL algorithm.

Let's analyze the environment characteristics and see what we can do.

In [11]:
# Create your environment and needed hyperparameters

blackjack_q_learning_env = gym.make("Blackjack-v1")


print(blackjack_q_learning_env.observation_space[0])
print(blackjack_q_learning_env.observation_space[1])
print(blackjack_q_learning_env.observation_space[2])
print(blackjack_q_learning_env.action_space)
flattened_number_of_states = 32 * 11 * 2
number_of_actions = 2
print(f"Flattened number of states: {flattened_number_of_states}")
print(f"Flattened number of actions: {number_of_actions}")

# initialize Q function
# we changed the way the Q function is initialized so it makes it easier 
# for us to access states
Q_function = np.zeros((32,11,2,number_of_actions))

# set number of epochs
num_epochs = 100000
# set number of timesteps per epoch
num_timesteps_per_epoch = 22


Discrete(32)
Discrete(11)
Discrete(2)
Discrete(2)
Flattened number of states: 704
Flattened number of actions: 2


In [31]:
# load Q-function
Q_function = np.load("model_weights/q_function_blackjack/q_function_blackjack.npy")

In [29]:
# zero the Q-function
Q_function = np.zeros((32,11,2,number_of_actions))

In [36]:
# count Q_function zeros
print(f"Number of zeros in our Q-function: {np.count_nonzero(Q_function == 0)}")

Number of zeros in our Q-function: 916


In [33]:
## main Q-function training loop on the blackjack environment

E = num_epochs
T = num_timesteps_per_epoch
for epoch in range(E):
  timestep = 0
  obs, info = blackjack_q_learning_env.reset()
  # get every state we visited in this epoch:
  list_of_states = []
  while timestep < T:
    list_of_states.append(obs)
    
    #print(f"observation: {obs}")
    random_action = blackjack_q_learning_env.action_space.sample()
    #print(f"random action: {random_action}")
    next_obs, reward, terminated, truncated, info = blackjack_q_learning_env.step(random_action)
    #print(f"reward: {reward}")
    sc0,sc1,sc2 = obs
    # add the reward obtained from each state action pair we encounter to our Q_function
    Q_function[sc0,sc1,sc2,random_action] = reward 
    if terminated or truncated:
      break
    # do some work here to process next_obs
    obs = next_obs
    timestep += 1 # update the timestep to match the changed environment
  # we set the index to be the index before the last element of the list, because we handle
  # the case involving the last element of the list before we get into the while loop.
  reverse_state_list_index = len(list_of_states) - 2
  
  while reverse_state_list_index > -1:
    # update the Q function for all the other, non-terminal states
    sc0,sc1,sc2 = list_of_states[reverse_state_list_index]
    sc0_next,sc1_next,sc2_next = list_of_states[reverse_state_list_index + 1]
    # direct Q function update.
    Q_function[sc0,sc1,sc2] += np.max(Q_function[sc0_next,sc1_next,sc2_next])
    reverse_state_list_index -= 1

np.set_printoptions(threshold=np.inf)
print(f"Done training! Here is the Q-function:\n\n{Q_function}")
print(f"Max value in the Q-function: {np.max(Q_function)}")
print(f"Number of zeros left in our Q-function: {np.count_nonzero(Q_function == 0)}")
  

Done training! Here is the Q-function:

[[[[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]]


 [[[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]]


 [[[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]

  [[ 0.  0.]
   [ 0.  0.]]]


 [[[ 0.  0.]
   [ 0.  0.]]

  

In [35]:
# save Q-function
np.save("model_weights/q_function_blackjack/q_function_blackjack.npy", Q_function)

In [51]:
# We have our learned Q-function. 
# We then instruct our agent to use it on an environment, and see the performance.
blackjack_q_learning_env_eval = gym.make("Blackjack-v1")
learned_q_function = Q_function

episode_over = False
number_of_testing_episodes = 100000
cumulative_reward = 0
negative = 0
positive = 0
zero = 0
for _ in range(number_of_testing_episodes):
    obs, info = blackjack_q_learning_env_eval.reset()
    while not episode_over:
        # randomly sample an action
        action = Q_function[obs[0],obs[1],obs[2]].argmax()

        # obtain partially observed state, reward, if the state is done, if we've exceeded time, and other info
        # by stepping through the environment using our action
        next_obs, reward, terminated, truncated, info = blackjack_q_learning_env_eval.step(action)
        cumulative_reward += reward
        #print(episode_over)

        obs = next_obs

        # if our state is terminal, or we have reached the time limit, end
        episode_over = terminated or truncated
    if reward < 0:
      negative += 1
    elif reward > 0:
      positive += 1
    else:
      zero += 1
# close our environment.
print(f"Cumulative reward: {cumulative_reward}")
print(f"Average cumulative reward: {cumulative_reward/number_of_testing_episodes}")
print(f"Negative: {negative}")
print(f"Positive: {positive}")
print(f"Zero: {zero}")
blackjack_q_learning_env_eval.close()


Cumulative reward: 1.0
Average cumulative reward: 1e-05
Negative: 0
Positive: 100000
Zero: 0
