<a href="https://colab.research.google.com/github/DManiscalco/Blackjack_RL/blob/main/Blackjack_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install gymnasium
!pip install stable_baselines3

In [2]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random
from stable_baselines3 import DQN
from stable_baselines3 import PPO

# First check using a single rule of stopping at a certain level

In [None]:
# Random blackjack game win percentage
def RandomBlackjack(hold_level=16, min_cards_remaining=20, games=1):
  def deal_card():
    dealt_card = shoe.pop()  # this is the card that is dealt; shoe now has one less card in it
    return dealt_card

  def calculate_hand_value(hand):
    non_ace_sum = sum([x for x in hand if x > 1])  # sum of all non-aces
    num_aces = hand.count(1)
    hand_value = non_ace_sum + num_aces

    # See if we are over 21 already
    if hand_value > 21 or num_aces == 0:  # already gone over 21 or if we have no aces to calculate
      return hand_value
    else:  # have not yet gone over 21
      for i in range(num_aces):  # check how many aces can be used to get highest value
        if hand_value + 10 <= 21:
          hand_value += 10
        else:
          return hand_value
      return hand_value

  def initialize_shoe(num_decks=4):
    single_deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] * 4  # face cards are 10 and ace is 1 or 11
    shoe = single_deck * num_decks  # add specified num of decks
    random.shuffle(shoe)  # randomize the order of the cards for when they are pulled
    return shoe

  # Initialize the game
  decks_used = 4
  shoe = initialize_shoe(decks_used)
  player_wins = 0
  player_draws = 0

  # Start playing the games
  for game in range(games):

    # Check if the shoe needs to be reset when a new game starts
    if len(shoe) <= min_cards_remaining:
      # print('using new shoe')
      shoe = initialize_shoe(decks_used)  # reset the shoe if there are too few cards left

    player_hand = []
    dealer_hand = []
    player_sum = 0
    dealer_sum = 0
    game_result = 0

    # Deal original 2 cards each to player and to dealer
    for i in range(4):
      if i == 0 or i == 2:  # deal to player
        player_hand.append(deal_card())
      else:  # deal to dealer
        dealer_hand.append(deal_card())

    # Calculate the value of the hands
    player_sum = calculate_hand_value(player_hand)
    dealer_sum = calculate_hand_value(dealer_hand)

    while True:

      # Start with player actions
      if player_sum < hold_level:  # hit
        player_hand.append(deal_card())
        player_sum = calculate_hand_value(player_hand)

        if player_sum > 21:  # player busts (goes over 21)
          player_bust = True
          reward = -1
          break

      else:  # stay
        # Move into the dealer actions
        while dealer_sum < player_sum and dealer_sum < 21:
          dealer_hand.append(deal_card())
          dealer_sum = calculate_hand_value(dealer_hand)

        # Game result
        if dealer_sum > 21 or player_sum > dealer_sum:
          reward = 1  # player win
          player_wins += 1
        elif player_sum < dealer_sum:
          reward = -1  # player loss (dealer win)
        else:
          reward = 0  # draw
          player_draws += 1
        break

    # print('\n')
    # print(reward, f'player hand: {player_hand}', f'dealer hand: {dealer_hand}', f'player: {player_sum}', f'dealer: {dealer_sum}')

  # End stats
  print('\n')
  print(f'Games played: {games}')
  print(f'Player win percentage: {player_wins / games}')
  print(f'Player draw percentage: {player_draws / games}')

In [None]:
# Numbers for 500k runs at 16 min level:
# Player win percentage: 0.36778
# Player draw percentage: 0.132958

RandomBlackjack(games=500_000)

  and should_run_async(code)




Games played: 500000
Player win percentage: 0.36778
Player draw percentage: 0.132958


# Check using reinforcement learning model

In [3]:
# Create a class for a player playing blackjack and their cash
class BlackjackPlayer:
  def __init__(self, cash):
    self.cash = cash
    self.ante = 5

  def ante_up(self):
    self.cash -= self.ante
    return self.cash

In [4]:
# Creat a class for the entire environment of the game for an agent to learn from
class BlackjackGame(gym.Env):
  def __init__(self, num_decks=4, min_cards_remaining=20):
    super().__init__()  # reference to the inheritence
    self.num_decks = num_decks  # number of decks in the shoe
    self.min_cards_remaining = min_cards_remaining  # number of cards remaining at which we implement a new shoe
    self.shoe = self.initialize_shoe()  # initialize the shoe of specified number of decks
    self.player_hand = []
    self.dealer_hand = []
    self.player_sum = 0
    self.dealer_sum = 0
    self.player_bust = False  # turns to True if player goes above 21
    self.game_result = 0  # 0 for dealer win; 1 for player win

    self.dealt_cards = []  # all of the cards that have been dealt from the current shoe

    # Define action and observation spaces
    self.action_space = spaces.Discrete(2)  # 0: Stand, 1: Hit (set of actions the agent can take in the game)
    self.observation_space = spaces.Box(low=0, high=52,
                                        shape=(4,),  # [player_sum, dealer_visible_card, has_usable_ace, cards remaining]
                                        dtype=np.int32)

  # create a shoe of a specific number of decks
  def initialize_shoe(self):
    self.single_deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] * 4  # face cards are 10 and ace is 1 or 11
    self.shoe = self.single_deck * self.num_decks  # add specified num of decks
    random.shuffle(self.shoe)  # randomize the order of the cards for when they are pulled
    return self.shoe

  def deal_card(self):
    self.dealt_card = self.shoe.pop()  # this is the card that is dealt; shoe now has one less card in it
    self.dealt_cards.append(self.dealt_card)  # add the card to total list of dealt cards
    return self.dealt_card

  # For a new game we reset hands but keep the same shoe
  def reset(self, seed=None, options=None):
    super().reset(seed=seed)  # this reset method comes from the inheritence
    self.player_hand = []
    self.dealer_hand = []
    self.player_sum = 0
    self.dealer_sum = 0
    self.player_bust = False  # turns to True if player goes above 21
    self.game_result = 0  # 0 for dealer win; 1 for player win

    # Check if the shoe needs to be reset when a new game starts
    if len(self.shoe) <= self.min_cards_remaining:
      self.shoe = self.initialize_shoe()  # reset the shoe if there are too few cards left

    # Deal original 2 cards each to player and to dealer
    for i in range(4):
      if i == 0 or i == 2:  # deal to player
        self.player_hand.append(self.deal_card())
      else:  # deal to dealer
        self.dealer_hand.append(self.deal_card())

    # Calculate the value of the hands
    self.player_sum = self.calculate_hand_value(self.player_hand)
    self.dealer_sum = self.calculate_hand_value(self.dealer_hand)

    # This relates to the self.observation_space above, specifically for the "shape" section
    observation = np.array([self.player_sum, self.dealer_hand[0], self.has_usable_ace(self.player_hand), len(self.shoe)], dtype=np.int32)
    return observation, {}

  # Taking an action in the environment
  def step(self, action):
    # Start with player actions
    if action == 1:  # hit
      self.player_hand.append(self.deal_card())
      self.player_sum = self.calculate_hand_value(self.player_hand)

      if self.player_sum > 21:  # player busts (goes over 21)
        self.player_bust = True
        reward = -1
        done = True
        # Below is the return of the current observation (remember we can only see dealer's first card); True value indicates game ends
        return np.array([self.player_sum, self.dealer_hand[0], self.has_usable_ace(self.player_hand), len(self.shoe)], dtype=np.int32), reward, done, True, {}

    elif action == 0:  # stay
      # Move into the dealer actions
      while self.dealer_sum < self.player_sum and self.dealer_sum < 21:
        self.dealer_hand.append(self.deal_card())
        self.dealer_sum = self.calculate_hand_value(self.dealer_hand)

      # Game result
      if self.dealer_sum > 21 or self.player_sum > self.dealer_sum:
        reward = 1  # player win
      elif self.player_sum < self.dealer_sum:
        reward = -1  # player loss (dealer win)
      else:
        reward = 0  # draw
      done = True
      # Below is the return of the current observation (remember we can only see dealer's first card)
      return np.array([self.player_sum, self.dealer_hand[0], self.has_usable_ace(self.player_hand), len(self.shoe)], dtype=np.int32), reward, done, True, {}

    else:  # this should not be used
      pass

    # Game is not done, return current state (player has hit but not gone over 21)
    done = False
    reward = 0
    # Below is the return of the current observation (remember we can only see dealer's first card)
    return np.array([self.player_sum, self.dealer_hand[0], self.has_usable_ace(self.player_hand), len(self.shoe)], dtype=np.int32), reward, done, False, {}

  # Function to count the value of a hand, taking aces into account
  def calculate_hand_value(self, hand):
    non_ace_sum = sum([x for x in hand if x > 1])  # sum of all non-aces
    num_aces = hand.count(1)
    hand_value = non_ace_sum + num_aces

    # See if we are over 21 already
    if hand_value > 21 or num_aces == 0:  # already gone over 21 or if we have no aces to calculate
      return hand_value
    else:  # have not yet gone over 21
      for i in range(num_aces):  # check how many aces can be used to get highest value
        if hand_value + 10 <= 21:
          hand_value += 10
        else:
          return hand_value
      return hand_value

  # See if we have an ace that can be used as an 11
  def has_usable_ace(self, hand):
    num_aces = hand.count(1)
    hand_value = sum([x for x in hand if x > 1]) + num_aces

    usable_aces = 0
    for ace in range(num_aces):
      if hand_value + 10 <= 21:
        hand_value += 10
        usable_aces += 1
      else:
        break
    return usable_aces  # can only be 0 or 1

  # Print the current status of the game
  def game_status(self):
    print(f'Player Hand: {self.player_hand} | Player Sum: {self.player_sum}')
    print(f'Dealer Hand: {self.dealer_hand} | Dealer Sum: {self.dealer_sum}')

In [5]:
# Training the RL agent

# Create a game environment
game = BlackjackGame(num_decks=4)

# Create and then train the model based on the blackjack class we created
rl_model = PPO('MlpPolicy', game, verbose=0)
rl_model.learn(total_timesteps=50_000)

# Testing the model that was trained
obs, info = game.reset()
done = False
while not done:
  action, states = rl_model.predict(obs)
  obs, reward, done, truncated, info = game.step(action)
  game.game_status()

  and should_run_async(code)


Player Hand: [1, 5, 9] | Player Sum: 15
Dealer Hand: [10, 3] | Dealer Sum: 13
Player Hand: [1, 5, 9, 10] | Player Sum: 25
Dealer Hand: [10, 3] | Dealer Sum: 13


In [6]:
# Testing the model after training
num_games = 5_000  # Number of games to run for testing
total_reward = 0  # sum of the rewards from all of the games
wins = 0
losses = 0
draws = 0

for i in range(num_games):
    obs, info = game.reset()
    done = False
    while not done:
        action, states = rl_model.predict(obs)
        obs, reward, done, truncated, info = game.step(action)

        total_reward += reward

        # Track wins, losses, and draws for the final reward
        if reward == 1:
            wins += 1
        elif reward == -1:
            losses += 1
        elif reward == 0:
            draws += 1

# Summary of overall wins, losses, and draws after running the test games
print(f'Total Reward over {num_games} games: {total_reward}')
print(f'Win Rate: {wins / num_games}')
print(f'Loss Rate: {losses / num_games}')
print(f'Draw Rate: {draws / num_games}')

  and should_run_async(code)


Total Reward over 5000 games: -806
Win Rate: 0.3488
Loss Rate: 0.51
Draw Rate: 0.682
