# Set Environment

In [24]:
import gym
from gym import spaces
from gym.utils import seeding
import random

# Full deck with distinct face cards
CARDS = [1, 2, 3, 4, 5, 6, 7, 8, 9, '10', 'J', 'Q', 'K'] * 4

def card_value(card):
    return 10 if card in ['10', 'J', 'Q', 'K'] else card

def draw_card(deck):
    return deck.pop()

def draw_hand(deck):
    return [draw_card(deck), draw_card(deck)]

def usable_ace(hand):
    return 1 in hand and sum(card_value(c) for c in hand) + 10 <= 21

def sum_hand(hand):
    total = sum(card_value(c) for c in hand)
    return total + 10 if usable_ace(hand) else total

def is_bust(hand):
    return sum_hand(hand) > 21

def score(hand):
    return 0 if is_bust(hand) else sum_hand(hand)

def is_natural(hand):
    return set(hand) == {1, '10'} or set(hand) == {1, 'J'} or set(hand) == {1, 'Q'} or set(hand) == {1, 'K'}

def can_double_down(hand, actionstaken):
    return len(hand) == 2 and actionstaken == 0

class BlackjackEnv(gym.Env):
    metadata = {"render.modes": ["human"]}

    def __init__(self, numdecks=4, natural=True):
        super().__init__()
        self.action_space = spaces.Discrete(4)  # 0: Stick, 1: Hit, 2: Double Down, 3: Split
        self.observation_space = spaces.Tuple((
            spaces.Tuple((spaces.Discrete(32), spaces.Discrete(32))),  # Player hand (2 cards)
            spaces.Discrete(11),  # Dealer's showing card
            spaces.Discrete(2),   # Usable ace
            spaces.Discrete(2)    # Can double down
        ))

        self.natural = natural
        self.numdecks = numdecks
        self.decks = CARDS * self.numdecks
        random.shuffle(self.decks)
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        random.seed(seed)
        return [seed]

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            self.seed(seed)

        if self._deck_is_out():
            self.decks = CARDS * self.numdecks
            random.shuffle(self.decks)

        self.dealer = draw_hand(self.decks)
        first_hand = draw_hand(self.decks)
        self.hands = [first_hand]
        self.current_hand = 0
        self.actionstaken = 0
        self.hand_results = []
        return self._get_obs()

    def step(self, action):
        assert self.action_space.contains(action), f"Invalid action: {action}"
        if self._deck_is_out():
            self.decks = CARDS * self.numdecks
            random.shuffle(self.decks)

        done = False
        reward = 0
        hand = self.hands[self.current_hand]

        if action == 0:  # Stick
            self._finalize_current_hand()

        elif action == 1:  # Hit
            hand.append(draw_card(self.decks))
            if is_bust(hand):
                self.hand_results.append(-1)
                self._advance_hand()

        elif action == 2:  # Double Down
            if not can_double_down(hand, self.actionstaken):
                raise ValueError("Invalid double down attempt.")
            hand.append(draw_card(self.decks))
            if is_bust(hand):
                self.hand_results.append(-2)
            else:
                self._finalize_current_hand(double=True)

        elif action == 3:  # Split
            if len(hand) != 2 or hand[0] != hand[1]:
                raise ValueError("Invalid split attempt.")
            card = hand[0]
            self.hands[self.current_hand] = [card, draw_card(self.decks)]
            self.hands.insert(self.current_hand + 1, [card, draw_card(self.decks)])

        self.actionstaken += 1

        if self.current_hand >= len(self.hands):
            while sum_hand(self.dealer) < 17:
                self.dealer.append(draw_card(self.decks))

            if len(self.hand_results) < len(self.hands):
                self._finalize_current_hand()

            reward = sum(self.hand_results)
            done = True
        
        return self._get_obs(), reward, done, {}

    def _finalize_current_hand(self, double=False):
        hand = self.hands[self.current_hand]
        player_score = score(hand)
        dealer_score = score(self.dealer)
        result = float(player_score > dealer_score) - float(player_score < dealer_score)
        if is_natural(hand) and result == 1 and self.natural:
            result = 1.5
        self.hand_results.append(result * (2 if double else 1))
        self._advance_hand()

    def _advance_hand(self):
        self.current_hand += 1
        self.actionstaken = 0

    def _get_obs(self):
        if self.current_hand >= len(self.hands):
            return ((0, 0), self.dealer[0], 0, 0)

        hand = self.hands[self.current_hand]
        padded = hand[:2] + [0] * (2 - len(hand))
        return (
            tuple(card_value(c) if c != 0 else 0 for c in padded[:2]),
            card_value(self.dealer[0]),
            usable_ace(hand),
            can_double_down(hand, self.actionstaken)
        )

    def _deck_is_out(self):
        return len(self.decks) < self.numdecks * len(CARDS) * 0.1

# Set the Simple DQN Model

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import copy
import os

# Define the Q-network
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )
    
    def forward(self, x):
        return self.fc(x)

def preprocess_state(state):
    """
    Converts the BlackjackEnv state to a format usable by the neural network
    State format: ((card1, card2), dealer_card, usable_ace, can_double)
    """
    player_cards, dealer_card, usable_ace, can_double = state
    
    # Convert dealer_card using card_value function
    dealer_value = card_value(dealer_card)
    
    # For player cards tuple, calculate sum using card_value
    player_sum = 0
    for card in player_cards:
        if card != 0:  # Skip zero values (padding)
            player_sum += card_value(card)
    
    return np.array([player_sum, dealer_value, usable_ace, can_double], dtype=np.float32)

# Epsilon-greedy action selection with valid action masking
def select_action(state, q_network, epsilon, env):
    # Extract state components to check for valid actions
    player_cards, dealer_card, usable_ace, can_double = state
    
    # Default valid actions (stick and hit)
    valid_actions = [0, 1]  
    
    # Check if can double down
    if can_double:
        valid_actions.append(2)
    
    # Check if the current hand allows split
    if env.current_hand < len(env.hands):
        current_hand = env.hands[env.current_hand]
        # Check if can split (same card value and exactly 2 cards)
        if len(current_hand) == 2 and card_value(current_hand[0]) == card_value(current_hand[1]):
            valid_actions.append(3)
    
    # Epsilon-greedy action selection
    if random.random() < epsilon:
        return random.choice(valid_actions)  # Choose randomly from valid actions
    else:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(preprocess_state(state)).unsqueeze(0)
            q_values = q_network(state_tensor)
            
            # Mask invalid actions by setting their Q-values to -inf
            for action in range(env.action_space.n):
                if action not in valid_actions:
                    q_values[0, action] = float('-inf')
            
            return q_values.argmax().item()

def train_dqn(env, n_episodes=5000, gamma=0.99, lr=1e-3, batch_size=64,
              epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.995,
              model_save_path='best_blackjack_dqn.pth'):

    input_dim = 4  # [player_sum, dealer_card, usable_ace, can_double]
    output_dim = env.action_space.n

    q_network = QNetwork(input_dim, output_dim)
    optimizer = optim.Adam(q_network.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    replay_buffer = deque(maxlen=10000)
    epsilon = epsilon_start
    losses = []

    best_model = None
    best_avg_reward = float('-inf')
    reward_window = deque(maxlen=100)

    for episode in range(n_episodes):
        state = env.reset()
        done = False
        episode_reward = 0

        while not done:
            action = select_action(state, q_network, epsilon, env)
            
            try:
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward
                
                replay_buffer.append((state, action, reward, next_state, done))
                state = next_state
                
            except ValueError:
                # If invalid action is selected, try a different valid action
                valid_actions = [0, 1]  # Default to stick or hit
                action = random.choice(valid_actions)
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward
                
                replay_buffer.append((state, action, reward, next_state, done))
                state = next_state

            # Only train if we have enough samples
            if len(replay_buffer) >= batch_size:
                batch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                # Preprocess states and next_states
                processed_states = [preprocess_state(s) for s in states]
                processed_next_states = [preprocess_state(s) for s in next_states]

                states_tensor = torch.FloatTensor(np.array(processed_states))
                actions_tensor = torch.LongTensor(actions).unsqueeze(1)
                rewards_tensor = torch.FloatTensor(rewards).unsqueeze(1)
                next_states_tensor = torch.FloatTensor(np.array(processed_next_states))
                dones_tensor = torch.BoolTensor(dones).unsqueeze(1)

                with torch.no_grad():
                    next_q_values = q_network(next_states_tensor).max(1, keepdim=True)[0]
                    targets = rewards_tensor + gamma * next_q_values * (~dones_tensor)

                q_values = q_network(states_tensor).gather(1, actions_tensor)

                loss = loss_fn(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.item())

        # Track rewards for model saving
        reward_window.append(episode_reward)
        
        # Decay epsilon
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        # Save best model based on average reward
        if len(reward_window) == 100:
            avg_reward = np.mean(reward_window)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
                best_model = copy.deepcopy(q_network)
                torch.save({
                    'model_state_dict': best_model.state_dict(),
                    'avg_reward': best_avg_reward,
                    'episode': episode + 1
                }, model_save_path)
                # print(f"✅ Best model saved at episode {episode+1} | Avg Reward: {best_avg_reward:.4f}")

        if (episode + 1) % 10000 == 0:
            print(f"Episode {episode+1} | Epsilon: {epsilon:.4f} | Avg Reward: {np.mean(list(reward_window)):.4f}")

    return best_model if best_model else q_network, losses

In [15]:
# Train models for different deck counts
models = {}

for num_decks in range(1, 7):
    print(f"\n=== Training model for {num_decks} deck(s) ===")
    env = BlackjackEnv(numdecks=num_decks, natural=True)
    model_save_path = f"blackjack_dqn_decks_{num_decks}.pth"
    model, _ = train_dqn(env, n_episodes=10000, model_save_path=model_save_path)
    models[num_decks] = model
    print(f"Completed training for {num_decks} deck(s)")

print("All models trained successfully!")


=== Training model for 1 deck(s) ===
Episode 10000 | Epsilon: 0.1000 | Avg Reward: -0.1400
Completed training for 1 deck(s)

=== Training model for 2 deck(s) ===
Episode 10000 | Epsilon: 0.1000 | Avg Reward: -0.8800
Completed training for 2 deck(s)

=== Training model for 3 deck(s) ===
Episode 10000 | Epsilon: 0.1000 | Avg Reward: -1.1800
Completed training for 3 deck(s)

=== Training model for 4 deck(s) ===
Episode 10000 | Epsilon: 0.1000 | Avg Reward: -1.0900
Completed training for 4 deck(s)

=== Training model for 5 deck(s) ===
Episode 10000 | Epsilon: 0.1000 | Avg Reward: -0.9100
Completed training for 5 deck(s)

=== Training model for 6 deck(s) ===
Episode 10000 | Epsilon: 0.1000 | Avg Reward: -1.2800
Completed training for 6 deck(s)
All models trained successfully!


In [11]:
import torch
import os
import numpy as np
import pandas as pd
import random
import sys
from collections import deque

# Define the Q-network
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )
    
    def forward(self, x):
        return self.fc(x)

# Load the saved DQN models
dqn_models = {}

for num_decks in range(1, 7):
    print(f"Loading DQN model for {num_decks} deck(s)...")
    model_path = f"blackjack_dqn_decks_{num_decks}.pth"
    
    # Check if the model file exists
    if os.path.exists(model_path):
        # Create a new model instance with the correct architecture
        input_dim = 4  # [player_sum, dealer_card, usable_ace, can_double]
        output_dim = 4  # [stick, hit, double, split]
        model = QNetwork(input_dim, output_dim)
        
        # Load the saved weights
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        
        # Set to evaluation mode
        model.eval()
        
        # Store in the models dictionary
        dqn_models[num_decks] = model
        
        print(f"✅ Successfully loaded model for {num_decks} deck(s) from {model_path}")
        if 'avg_reward' in checkpoint:
            print(f"   Average reward during training: {checkpoint['avg_reward']:.4f}")
        if 'episode' in checkpoint:
            print(f"   Saved at episode: {checkpoint['episode']}")
    else:
        print(f"❌ Model file not found: {model_path}")
        print(f"   Creating a new untrained model for {num_decks} deck(s)")
        
        # Create an untrained model as fallback
        input_dim = 4
        output_dim = 4
        model = QNetwork(input_dim, output_dim)
        dqn_models[num_decks] = model

print("\nAll DQN models loaded successfully!")

Loading DQN model for 1 deck(s)...
✅ Successfully loaded model for 1 deck(s) from blackjack_dqn_decks_1.pth
   Average reward during training: 0.6250
   Saved at episode: 1442
Loading DQN model for 2 deck(s)...
✅ Successfully loaded model for 2 deck(s) from blackjack_dqn_decks_2.pth
   Average reward during training: 0.0050
   Saved at episode: 373
Loading DQN model for 3 deck(s)...
✅ Successfully loaded model for 3 deck(s) from blackjack_dqn_decks_3.pth
   Average reward during training: 0.3000
   Saved at episode: 839
Loading DQN model for 4 deck(s)...
✅ Successfully loaded model for 4 deck(s) from blackjack_dqn_decks_4.pth
   Average reward during training: -0.4700
   Saved at episode: 212
Loading DQN model for 5 deck(s)...
✅ Successfully loaded model for 5 deck(s) from blackjack_dqn_decks_5.pth
   Average reward during training: -0.2800
   Saved at episode: 140
Loading DQN model for 6 deck(s)...
✅ Successfully loaded model for 6 deck(s) from blackjack_dqn_decks_6.pth
   Average rew

## Evaluation 1

In [25]:
import pandas as pd
import random
import torch
import numpy as np

# Helper to preprocess state matching your training code
def preprocess_state(state):
    """
    Converts the BlackjackEnv state to a format usable by the neural network
    State format: ((card1, card2), dealer_card, usable_ace, can_double)
    """
    player_cards, dealer_card, usable_ace, can_double = state
    
    # Convert dealer_card using card_value function
    dealer_value = card_value(dealer_card)
    
    # For player cards tuple, calculate sum using card_value
    player_sum = 0
    for card in player_cards:
        if card != 0:  # Skip zero values (padding)
            player_sum += card_value(card)
    
    return np.array([player_sum, dealer_value, usable_ace, can_double], dtype=np.float32)

# === RL Evaluation Simulation ===
def evaluate_dqn_on_deck_sizes(models, num_games=10000, max_decks=6):
    results = []

    for num_deck in range(1, max_decks + 1):
        env = BlackjackEnv(numdecks=num_deck, natural=True)
        q_network = models[num_deck]  # Get the specific model for this deck size

        wins = 0
        losses = 0
        draws = 0
        total_reward = 0

        for game in range(num_games):
            obs = env.reset(seed=42 + game)
            done = False
            episode_reward = 0

            while not done:
                # Process state to match training format
                state = preprocess_state(obs)
                
                try:
                    with torch.no_grad():
                        state_tensor = torch.FloatTensor(state).unsqueeze(0)
                        q_values = q_network(state_tensor)
                        
                        # Get valid actions
                        valid_actions = [0, 1]  # Default stick and hit
                        player_cards, dealer_card, usable_ace, can_double = obs
                        
                        if can_double:
                            valid_actions.append(2)
                            
                        if env.current_hand < len(env.hands):
                            current_hand = env.hands[env.current_hand]
                            if len(current_hand) == 2 and card_value(current_hand[0]) == card_value(current_hand[1]):
                                valid_actions.append(3)
                        
                        # Mask invalid actions
                        for action in range(env.action_space.n):
                            if action not in valid_actions:
                                q_values[0, action] = float('-inf')
                                
                        action = q_values.argmax().item()
                        
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs
                    
                except ValueError:
                    # If error, try a fallback action
                    action = 0  # Stick is usually safe
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs

            total_reward += episode_reward
            if episode_reward > 0:
                wins += 1
            elif episode_reward < 0:
                losses += 1
            else:
                draws += 1

        # Store results
        results.append({
            "Decks": num_deck,
            "Games": num_games,
            "Wins": wins,
            "Draws": draws,
            "Losses": losses,
            "Total Reward": round(total_reward, 4),
            "Win Rate (%)": round((wins / num_games) * 100, 4),
            "Loss Rate (%)": round((losses / num_games) * 100, 4),
            "Draw Rate (%)": round((draws / num_games) * 100, 4),
            "Average Reward": round(total_reward / num_games, 4)
        })

    return pd.DataFrame(results)

# Example usage:
df_rl = evaluate_dqn_on_deck_sizes(dqn_models, num_games=10000, max_decks=6)
df_rl

Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward
0,1,10000,5560,701,3739,2670.0,55.6,37.39,7.01,0.267
1,2,10000,3979,442,5579,-6759.0,39.79,55.79,4.42,-0.6759
2,3,10000,5781,594,3625,789.0,57.81,36.25,5.94,0.0789
3,4,10000,2465,282,7253,-10318.5,24.65,72.53,2.82,-1.0318
4,5,10000,3960,479,5561,-7372.5,39.6,55.61,4.79,-0.7372
5,6,10000,3644,424,5932,-5330.0,36.44,59.32,4.24,-0.533


In [26]:
# Function to select action with the DQN model (no exploration)
def select_action_eval(state, q_network, env):
    # Extract state components to check for valid actions
    player_cards, dealer_card, usable_ace, can_double = state
    
    # Default valid actions (stick and hit)
    valid_actions = [0, 1]  
    
    # Check if can double down
    if can_double:
        valid_actions.append(2)
    
    # Check if the current hand allows split
    if hasattr(env, 'current_hand') and hasattr(env, 'hands'):
        if env.current_hand < len(env.hands):
            current_hand = env.hands[env.current_hand]
            # Check if can split (same card value and exactly 2 cards)
            if len(current_hand) == 2 and card_value(current_hand[0]) == card_value(current_hand[1]):
                valid_actions.append(3)
    
    with torch.no_grad():
        state_tensor = torch.FloatTensor(preprocess_state(state)).unsqueeze(0)
        q_values = q_network(state_tensor)
        
        # Mask invalid actions by setting their Q-values to -inf
        for action in range(4):  # Assume 4 possible actions
            if action not in valid_actions:
                q_values[0, action] = float('-inf')
        
        return q_values.argmax().item()

def evaluate_dqn_bankroll(models, num_games=10000, max_decks=6, initial_money=100):
    """
    Evaluate DQN models with a bankroll simulation across different deck sizes
    """
    results = []

    for num_deck in range(1, max_decks + 1):
        random.seed(42)
        np.random.seed(42)
        torch.manual_seed(42)
        
        env = BlackjackEnv(numdecks=num_deck, natural=True)
        q_network = models[num_deck]  # Get the specific model for this deck size
        q_network.eval()  # Set the model to evaluation mode

        money = initial_money
        wins = 0
        losses = 0
        draws = 0
        total_reward = 0
        
        # For tracking bankruptcy
        games_played = 0
        went_bankrupt = False

        for game in range(1, num_games+1):
            if money <= 0:
                went_bankrupt = True
                games_played = game - 1
                break
                
            games_played = game
            obs = env.reset()
            done = False
            
            # Bet $1
            money -= 1
            episode_reward = 0
            doubled_down = False

            while not done:
                # Get action from the model
                action = select_action_eval(obs, q_network, env)
                
                # If doubling down, subtract another dollar
                if action == 2:  # Double down
                    money -= 1
                    doubled_down = True
                
                # Execute the action
                try:
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs
                except ValueError:
                    # Fallback if error
                    action = 0  # Stick
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs

            # End of episode accounting
            total_reward += episode_reward
            
            if episode_reward > 0:
                wins += 1
                # Calculate payout
                if doubled_down:
                    money += 4  # Get 2x the doubled bet
                else:
                    if episode_reward > 1:  # Blackjack
                        money += 2.5  # Blackjack pays 3:2
                    else:
                        money += 2  # Regular win
            elif episode_reward < 0:
                losses += 1
                # Money already subtracted for bet
            else:
                draws += 1
                if doubled_down:
                    money += 2  # Get doubled bet back
                else:
                    money += 1  # Get original bet back

        # Store results
        bankruptcy_message = f"Bankrupt after {games_played} games" if went_bankrupt else "Solvent"
        
        results.append({
            "Decks": num_deck,
            "Games": games_played,
            "Wins": wins,
            "Draws": draws,
            "Losses": losses,
            "Total Reward": round(total_reward, 4),
            "Win Rate (%)": round((wins / games_played) * 100, 4) if games_played > 0 else 0,
            "Loss Rate (%)": round((losses / games_played) * 100, 4) if games_played > 0 else 0,
            "Draw Rate (%)": round((draws / games_played) * 100, 4) if games_played > 0 else 0,
            "Average Reward": round(total_reward / games_played, 4) if games_played > 0 else 0,
            "Final Money": round(money, 2),
            "Status": bankruptcy_message
        })

    return pd.DataFrame(results)

# Evaluate DQN model performance using the bankroll experiment
print("\nRunning DQN bankroll experiment...")
df_dqn_bankroll = evaluate_dqn_bankroll(dqn_models, num_games=10000, max_decks=6, initial_money=100)
print("DQN bankroll experiment completed!")

# Display the results
df_dqn_bankroll


Running DQN bankroll experiment...
DQN bankroll experiment completed!


Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward,Final Money,Status
0,1,10000,5597,685,3718,2757.0,55.97,37.18,6.85,0.2757,3050.0,Solvent
1,2,401,167,14,220,-253.0,41.6459,54.8628,3.4913,-0.6309,-1.0,Bankrupt after 401 games
2,3,10000,5694,601,3705,540.5,56.94,37.05,6.01,0.0541,2358.5,Solvent
3,4,127,26,5,96,-144.0,20.4724,75.5906,3.937,-1.1339,0.0,Bankrupt after 127 games
4,5,166,54,7,105,-163.0,32.5301,63.253,4.2169,-0.9819,-1.0,Bankrupt after 166 games
5,6,391,142,17,232,-193.0,36.3171,59.335,4.3478,-0.4936,-1.5,Bankrupt after 391 games


# Set PPO

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import sys
import os
from collections import deque

# PPO Actor-Critic Network
class PPOActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PPOActorCritic, self).__init__()
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        self.policy_head = nn.Linear(64, output_dim)
        self.value_head = nn.Linear(64, 1)

    def forward(self, x):
        shared_out = self.shared(x)
        logits = self.policy_head(shared_out)
        value = self.value_head(shared_out)
        return logits, value

# Adjusted for BlackjackEnv
def preprocess_state(state):
    """
    Converts the BlackjackEnv state to a format usable by the neural network
    State format: ((card1, card2), dealer_card, usable_ace, can_double)
    """
    player_cards, dealer_card, usable_ace, can_double = state
    
    # Convert dealer_card using card_value function
    dealer_value = card_value(dealer_card)
    
    # For player cards tuple, calculate sum using card_value
    player_sum = 0
    for card in player_cards:
        if card != 0:  # Skip zero values (padding)
            player_sum += card_value(card)
    
    return np.array([player_sum, dealer_value, usable_ace, can_double], dtype=np.float32)

# Function to select valid actions
def select_valid_action(logits, state, env):
    player_cards, dealer_card, usable_ace, can_double = state
    
    # Default valid actions (stick and hit)
    valid_actions = [0, 1]
    
    # Check if can double down
    if can_double:
        valid_actions.append(2)
    
    # Check if can split
    if env.current_hand < len(env.hands):
        current_hand = env.hands[env.current_hand]
        if len(current_hand) == 2 and card_value(current_hand[0]) == card_value(current_hand[1]):
            valid_actions.append(3)
    
    # Apply mask to logits
    masked_logits = logits.clone()
    for action in range(env.action_space.n):
        if action not in valid_actions:
            masked_logits[action] = float('-inf')
    
    return masked_logits

# Compute GAE
def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    returns = []
    advantages = []
    gae = 0
    next_value = 0

    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * next_value * (1 - dones[step]) - values[step]
        gae = delta + gamma * lam * (1 - dones[step]) * gae
        advantages.insert(0, gae)
        returns.insert(0, gae + values[step])
        next_value = values[step]

    return torch.FloatTensor(returns), torch.FloatTensor(advantages)

# PPO Training Function
def train_ppo(env, n_episodes=5000, gamma=0.99, lam=0.95, clip_eps=0.2,
              lr=3e-4, epochs=4, batch_size=64, model_save_path='ppo_blackjack.pth'):

    input_dim = 4  # [player_sum, dealer_card, usable_ace, can_double]
    output_dim = env.action_space.n

    policy_net = PPOActorCritic(input_dim, output_dim)
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    memory = []
    reward_window = deque(maxlen=100)

    best_avg_reward = float('-inf')
    best_model = None

    for episode in range(n_episodes):
        obs = env.reset()
        done = False
        episode_data = []
        episode_reward = 0

        while not done:
            state = preprocess_state(obs)
            state_tensor = torch.FloatTensor(state)
            logits, value = policy_net(state_tensor)
            
            # Mask invalid actions
            masked_logits = select_valid_action(logits, obs, env)
            probs = torch.softmax(masked_logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            
            try:
                action = dist.sample()
                log_prob = dist.log_prob(action)
                
                next_obs, reward, done, _ = env.step(action.item())
                episode_data.append((state, action.item(), reward, log_prob.item(), value.item(), done))
                episode_reward += reward
                obs = next_obs
                
            except ValueError:
                # Fallback to a safe action (stick)
                action = 0  # stick
                next_obs, reward, done, _ = env.step(action)
                
                # Re-compute log_prob for the fallback action
                masked_logits = select_valid_action(logits, obs, env)
                probs = torch.softmax(masked_logits, dim=-1)
                dist = torch.distributions.Categorical(probs)
                log_prob = dist.log_prob(torch.tensor(action))
                
                episode_data.append((state, action, reward, log_prob.item(), value.item(), done))
                episode_reward += reward
                obs = next_obs

        memory.extend(episode_data)
        reward_window.append(episode_reward)

        # Train when we have enough data
        if len(memory) >= batch_size:
            states, actions, rewards, old_log_probs, values, dones = zip(*memory)

            # Process states to tensors
            states_tensor = torch.FloatTensor(np.array(states))
            actions_tensor = torch.LongTensor(actions)
            old_log_probs_tensor = torch.FloatTensor(old_log_probs)
            
            returns, advantages = compute_gae(rewards, values, dones, gamma, lam)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            for _ in range(epochs):
                logits, value_preds = policy_net(states_tensor)
                probs = torch.softmax(logits, dim=-1)
                dist = torch.distributions.Categorical(probs)

                new_log_probs = dist.log_prob(actions_tensor)
                ratio = torch.exp(new_log_probs - old_log_probs_tensor)

                policy_loss = -torch.min(
                    ratio * advantages,
                    torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
                ).mean()

                value_loss = nn.MSELoss()(value_preds.squeeze(), returns)

                loss = policy_loss + 0.5 * value_loss - 0.01 * dist.entropy().mean()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            memory = []

        # Save model based on rolling average
        if len(reward_window) == 100:
            avg_reward = np.mean(reward_window)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
                best_model = policy_net
                torch.save({
                    'model_state_dict': best_model.state_dict(),
                    'avg_reward': best_avg_reward,
                    'episode': episode + 1
                }, model_save_path)
                print(f"✅ Best model saved at episode {episode+1} | Avg Reward: {best_avg_reward:.4f}")

        if (episode + 1) % 500 == 0:
            print(f"Episode {episode+1} | Avg Reward: {np.mean(list(reward_window)):.4f}")

    return best_model if best_model else policy_net

## Training the model

In [None]:
# Train models for different deck counts
ppo_models = {}

for num_decks in range(1, 7):
    print(f"\n=== Training PPO model for {num_decks} deck(s) ===")
    env = BlackjackEnv(numdecks=num_decks, natural=True)
    model_save_path = f"blackjack_ppo_decks_{num_decks}.pth"
    ppo_model = train_ppo(env, n_episodes=10000, model_save_path=model_save_path)
    ppo_models[num_decks] = ppo_model
    print(f"Completed PPO training for {num_decks} deck(s)")

print("All PPO models trained successfully!")

## Evaluation

In [None]:
import pandas as pd
import random
import torch
import numpy as np
import sys
import os

# === PPO Evaluation on Deck Sizes ===
def evaluate_ppo_on_deck_sizes(models, num_games=10000, max_decks=6):
    results = []

    for num_deck in range(1, max_decks + 1):
        env = BlackjackEnv(numdecks=num_deck, natural=True)
        policy_net = models[num_deck]  # Get the specific model for this deck size

        wins = 0
        losses = 0
        draws = 0
        total_reward = 0

        for game in range(num_games):
            obs = env.reset(seed=42 + game)
            done = False
            episode_reward = 0

            while not done:
                # Process state to match training format
                state = preprocess_state(obs)
                
                try:
                    with torch.no_grad():
                        state_tensor = torch.FloatTensor(state)
                        logits, _ = policy_net(state_tensor)
                        
                        # Apply action masking
                        masked_logits = select_valid_action(logits, obs, env)
                        probs = torch.softmax(masked_logits, dim=-1)
                        
                        # For evaluation, choose the action with highest probability
                        action = torch.argmax(probs).item()
                    
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs
                    
                except ValueError:
                    # If error, try a fallback action
                    action = 0  # Stick is usually safe
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs

            total_reward += episode_reward
            if episode_reward > 0:
                wins += 1
            elif episode_reward < 0:
                losses += 1
            else:
                draws += 1

        # Store results
        results.append({
            "Decks": num_deck,
            "Games": num_games,
            "Wins": wins,
            "Draws": draws,
            "Losses": losses,
            "Total Reward": round(total_reward, 4),
            "Win Rate (%)": round((wins / num_games) * 100, 4),
            "Loss Rate (%)": round((losses / num_games) * 100, 4),
            "Draw Rate (%)": round((draws / num_games) * 100, 4),
            "Average Reward": round(total_reward / num_games, 4)
        })

    return pd.DataFrame(results)

def evaluate_ppo_bankroll(models, num_games=10000, max_decks=6, initial_money=100):
    """
    Evaluate PPO models with a bankroll simulation across different deck sizes
    """
    results = []

    for num_deck in range(1, max_decks + 1):
        env = BlackjackEnv(numdecks=num_deck, natural=True)
        policy_net = models[num_deck]  # Get the specific model for this deck size
        policy_net.eval()  # Set the model to evaluation mode

        money = initial_money
        wins = 0
        losses = 0
        draws = 0
        total_reward = 0
        
        # For tracking bankruptcy
        games_played = 0
        went_bankrupt = False

        for game in range(1, num_games+1):
            if money <= 0:
                went_bankrupt = True
                games_played = game - 1
                break
                
            games_played = game
            obs = env.reset()
            done = False
            
            # Bet $1
            money -= 1
            episode_reward = 0

            while not done:
                # Process state to match training format
                state = preprocess_state(obs)
                
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state)
                    logits, _ = policy_net(state_tensor)
                    
                    # Get valid actions for the current state
                    player_cards, dealer_card, usable_ace, can_double = obs
                    valid_actions = [0, 1]  # Stick, Hit are always valid
                    
                    if can_double:
                        valid_actions.append(2)  # Double down if allowed
                    
                    # Check for split (same rank cards)
                    if hasattr(env, 'current_hand') and hasattr(env, 'hands'):
                        if env.current_hand < len(env.hands):
                            current_hand = env.hands[env.current_hand]
                            if len(current_hand) == 2 and card_value(current_hand[0]) == card_value(current_hand[1]):
                                valid_actions.append(3)  # Split if allowed
                    
                    # Mask invalid actions
                    masked_logits = logits.clone()
                    for i in range(len(masked_logits)):
                        if i not in valid_actions:
                            masked_logits[i] = float('-inf')
                    
                    probs = torch.softmax(masked_logits, dim=-1)
                    action = torch.argmax(probs).item()
                
                # Check if action is valid (safeguard)
                if action not in valid_actions:
                    action = 0  # Default to stick if somehow invalid
                
                # If doubling down, subtract another dollar
                if action == 2:  # Double down
                    money -= 1
                
                # Execute the action
                try:
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs
                except ValueError:
                    # Fallback if error
                    action = 0  # Stick
                    next_obs, reward, done, _ = env.step(action)
                    episode_reward += reward
                    obs = next_obs

            # End of episode accounting
            total_reward += episode_reward
            
            if episode_reward > 0:
                wins += 1
                # Calculate payout based on bet
                if action == 2:  # Double down win
                    money += 4  # Get 2x the doubled bet
                else:
                    money += 2  # Regular win
            elif episode_reward < 0:
                losses += 1
                # Money already subtracted for bet
            else:
                draws += 1
                if action == 2:  # Double down push
                    money += 2  # Get doubled bet back
                else:
                    money += 1  # Get original bet back

        # Store results
        bankruptcy_message = f"Bankrupt after {games_played} games" if went_bankrupt else "Solvent"
        
        results.append({
            "Decks": num_deck,
            "Games": games_played,
            "Wins": wins,
            "Draws": draws,
            "Losses": losses,
            "Total Reward": round(total_reward, 4),
            "Win Rate (%)": round((wins / games_played) * 100, 4) if games_played > 0 else 0,
            "Loss Rate (%)": round((losses / games_played) * 100, 4) if games_played > 0 else 0,
            "Draw Rate (%)": round((draws / games_played) * 100, 4) if games_played > 0 else 0,
            "Average Reward": round(total_reward / games_played, 4) if games_played > 0 else 0,
            "Final Money": money,
            "Status": bankruptcy_message
        })

    return pd.DataFrame(results)

In [6]:
import torch
import os

# Load the saved PPO models
ppo_models = {}

for num_decks in range(1, 7):
    print(f"Loading PPO model for {num_decks} deck(s)...")
    model_path = f"blackjack_ppo_decks_{num_decks}.pth"
    
    # Check if the model file exists
    if os.path.exists(model_path):
        # Create a new model instance with the correct architecture
        input_dim = 4  # [player_sum, dealer_card, usable_ace, can_double]
        output_dim = 4  # [stick, hit, double, split]
        model = PPOActorCritic(input_dim, output_dim)
        
        # Load the saved weights
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        
        # Set to evaluation mode
        model.eval()
        
        # Store in the models dictionary
        ppo_models[num_decks] = model
        
        print(f"✅ Successfully loaded model for {num_decks} deck(s) from {model_path}")
        print(f"   Average reward during training: {checkpoint['avg_reward']:.4f}")
        print(f"   Saved at episode: {checkpoint['episode']}")
    else:
        print(f"❌ Model file not found: {model_path}")
        print(f"   Creating a new untrained model for {num_decks} deck(s)")
        
        # Create an untrained model as fallback
        input_dim = 4
        output_dim = 4
        model = PPOActorCritic(input_dim, output_dim)
        ppo_models[num_decks] = model

print("\nAll models loaded. Running evaluation...")

# Run the evaluation
df_ppo = evaluate_ppo_on_deck_sizes(ppo_models, num_games=10000, max_decks=6)

# Display the results
df_ppo

Loading PPO model for 1 deck(s)...
✅ Successfully loaded model for 1 deck(s) from blackjack_ppo_decks_1.pth
   Average reward during training: 0.4200
   Saved at episode: 3576
Loading PPO model for 2 deck(s)...
✅ Successfully loaded model for 2 deck(s) from blackjack_ppo_decks_2.pth
   Average reward during training: 0.4050
   Saved at episode: 3796
Loading PPO model for 3 deck(s)...
✅ Successfully loaded model for 3 deck(s) from blackjack_ppo_decks_3.pth
   Average reward during training: 0.4100
   Saved at episode: 9592
Loading PPO model for 4 deck(s)...
✅ Successfully loaded model for 4 deck(s) from blackjack_ppo_decks_4.pth
   Average reward during training: 0.4100
   Saved at episode: 5973
Loading PPO model for 5 deck(s)...
✅ Successfully loaded model for 5 deck(s) from blackjack_ppo_decks_5.pth
   Average reward during training: 0.3350
   Saved at episode: 6977
Loading PPO model for 6 deck(s)...
✅ Successfully loaded model for 6 deck(s) from blackjack_ppo_decks_6.pth
   Average r

Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward
0,1,10000,4616,661,4723,117.0,46.16,47.23,6.61,0.0117
1,2,10000,4635,651,4714,154.5,46.35,47.14,6.51,0.0155
2,3,10000,4799,677,4524,555.0,47.99,45.24,6.77,0.0555
3,4,10000,4538,623,4839,-76.0,45.38,48.39,6.23,-0.0076
4,5,10000,4751,716,4533,433.0,47.51,45.33,7.16,0.0433
5,6,10000,4690,719,4591,343.0,46.9,45.91,7.19,0.0343


In [8]:
df_ppo_bankroll = evaluate_ppo_bankroll(ppo_models, num_games=10000, max_decks=6, initial_money=100)
df_ppo_bankroll

Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward,Final Money,Status
0,1,10000,4651,610,4739,130.5,46.51,47.39,6.1,0.0131,12,Solvent
1,2,10000,4673,668,4659,249.5,46.73,46.59,6.68,0.0249,114,Solvent
2,3,10000,4777,717,4506,555.0,47.77,45.06,7.17,0.0555,407,Solvent
3,4,4568,2095,278,2195,13.0,45.8625,48.0517,6.0858,0.0028,0,Bankrupt after 4568 games
4,5,5000,2286,328,2386,14.5,45.72,47.72,6.56,0.0029,0,Bankrupt after 5000 games
5,6,10000,4713,675,4612,342.0,47.13,46.12,6.75,0.0342,209,Solvent
