# Set Environment (No Split, No Double Down)

In [1]:
import gym
from gym import spaces
from gym.utils import seeding
import random

# Full deck with distinct face cards
CARDS = [1, 2, 3, 4, 5, 6, 7, 8, 9, '10', 'J', 'Q', 'K'] * 4

def card_value(card):
    return 10 if card in ['10', 'J', 'Q', 'K'] else card

def draw_card(deck):
    return deck.pop()

def draw_hand(deck):
    return [draw_card(deck), draw_card(deck)]

def usable_ace(hand):
    return 1 in hand and sum(card_value(c) for c in hand) + 10 <= 21

def sum_hand(hand):
    total = sum(card_value(c) for c in hand)
    return total + 10 if usable_ace(hand) else total

def is_bust(hand):
    return sum_hand(hand) > 21

def score(hand):
    return 0 if is_bust(hand) else sum_hand(hand)

def is_natural(hand):
    return set(hand) == {1, '10'} or set(hand) == {1, 'J'} or set(hand) == {1, 'Q'} or set(hand) == {1, 'K'}

class BlackjackEnv(gym.Env):
    metadata = {"render.modes": ["human"]}

    def __init__(self, numdecks=4, natural=True):
        super().__init__()
        self.action_space = spaces.Discrete(2)  # 0: Stick, 1: Hit
        self.observation_space = spaces.Tuple((
            spaces.Tuple((spaces.Discrete(32), spaces.Discrete(32))),  # Player hand (2 cards)
            spaces.Discrete(11),  # Dealer's showing card
            spaces.Discrete(2)    # Usable ace
        ))

        self.natural = natural
        self.numdecks = numdecks
        self.decks = CARDS * self.numdecks
        random.shuffle(self.decks)
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        random.seed(seed)
        return [seed]

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            self.seed(seed)

        if self._deck_is_out():
            self.decks = CARDS * self.numdecks
            random.shuffle(self.decks)

        self.dealer = draw_hand(self.decks)
        first_hand = draw_hand(self.decks)
        self.hands = [first_hand]
        self.current_hand = 0
        self.actionstaken = 0
        self.hand_results = []
        return self._get_obs()

    def step(self, action):
        assert self.action_space.contains(action), f"Invalid action: {action}"
        if self._deck_is_out():
            self.decks = CARDS * self.numdecks
            random.shuffle(self.decks)

        done = False
        reward = 0
        hand = self.hands[self.current_hand]

        if action == 0:  # Stick
            self._finalize_current_hand()

        elif action == 1:  # Hit
            hand.append(draw_card(self.decks))
            if is_bust(hand):
                self.hand_results.append(-1)
                self._advance_hand()

        self.actionstaken += 1

        if self.current_hand >= len(self.hands):
            while sum_hand(self.dealer) < 17:
                self.dealer.append(draw_card(self.decks))

            if len(self.hand_results) < len(self.hands):
                self._finalize_current_hand()

            reward = sum(self.hand_results)
            done = True

        return self._get_obs(), reward, done, {}

    def _finalize_current_hand(self):
        hand = self.hands[self.current_hand]
        player_score = score(hand)
        dealer_score = score(self.dealer)
        result = float(player_score > dealer_score) - float(player_score < dealer_score)
        if is_natural(hand) and result == 1 and self.natural:
            result = 1.5
        self.hand_results.append(result)
        self._advance_hand()

    def _advance_hand(self):
        self.current_hand += 1
        self.actionstaken = 0

    def _get_obs(self):
        if self.current_hand >= len(self.hands):
            return ((0, 0), card_value(self.dealer[0]), 0)

        hand = self.hands[self.current_hand]
        padded = hand[:2] + [0] * (2 - len(hand))
        return (
            tuple(card_value(c) if c != 0 else 0 for c in padded[:2]),
            card_value(self.dealer[0]),
            int(usable_ace(hand))
        )

    def _deck_is_out(self):
        return len(self.decks) < self.numdecks * len(CARDS) * 0.1

# Set the Simple DQN Model

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import copy
import os

# Define the Q-network
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.fc = nn.Sequential(
        nn.Linear(input_dim, 128),
        nn.ReLU(),
        nn.Linear(128, 128),
        nn.ReLU(),
        nn.Linear(128, output_dim)
    )
    
    def forward(self, x):
        return self.fc(x)
    
def preprocess_state(state):
    player_cards, dealer_card, usable_ace = state
    player_sum = sum(player_cards)
    return np.array([
        (player_sum - 4) / 17.0,     # Normalize to [0, 1]
        (dealer_card - 1) / 9.0,     # Normalize to [0, 1]
        usable_ace
    ], dtype=np.float32)

# === Action Selection: Epsilon-Greedy ===
def select_action(state, q_network, epsilon, action_space):
    if random.random() < epsilon:
        return action_space.sample()
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = q_network(state_tensor)
        return q_values.argmax().item()
    
def select_action(state, q_network, epsilon, action_space):
    if random.random() < epsilon:
        return action_space.sample()
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = q_network(state_tensor)
        return q_values.argmax().item()

def train_dqn(env, n_episodes=5000, gamma=0.99, lr=1e-3, batch_size=64,
              epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995,
              model_save_path='best_blackjack_dqn.pth'):

    input_dim = 3  # [normalized player_sum, dealer_card, usable_ace]
    output_dim = env.action_space.n

    q_network = QNetwork(input_dim, output_dim)
    target_network = copy.deepcopy(q_network)
    target_network.eval()

    optimizer = optim.Adam(q_network.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    replay_buffer = deque(maxlen=10000)
    epsilon = epsilon_start
    losses = []
    total_rewards = []

    best_model = None
    best_avg_loss = float('inf')
    loss_window = []

    steps_done = 0
    target_update_freq = 1000  # in steps

    for episode in range(n_episodes):
        state = preprocess_state(env.reset())
        done = False
        total_reward = 0

        while not done:
            action = select_action(state, q_network, epsilon, env.action_space)
            next_state_raw, reward, done, _ = env.step(action)
            next_state = preprocess_state(next_state_raw)
            reward = np.clip(reward, -1.0, 1.0)

            replay_buffer.append((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward
            steps_done += 1

            # Learn if enough samples
            if len(replay_buffer) >= batch_size:
                batch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)

                states_tensor = torch.FloatTensor(np.array(states))
                actions_tensor = torch.LongTensor(actions).unsqueeze(1)
                rewards_tensor = torch.FloatTensor(rewards).unsqueeze(1)
                next_states_tensor = torch.FloatTensor(np.array(next_states))
                dones_tensor = torch.BoolTensor(dones).unsqueeze(1)

                with torch.no_grad():
                    next_q_values = target_network(next_states_tensor).max(1, keepdim=True)[0]
                    targets = rewards_tensor + gamma * next_q_values * (~dones_tensor)

                q_values = q_network(states_tensor).gather(1, actions_tensor)

                loss = loss_fn(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.item())
                loss_window.append(loss.item())
                if len(loss_window) > 100:
                    loss_window.pop(0)

            # Update target network
            if steps_done % target_update_freq == 0:
                target_network.load_state_dict(q_network.state_dict())

        # Epsilon decay
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        total_rewards.append(total_reward)

        # Save best model based on average loss
        if len(loss_window) == 100:
            avg_loss = np.mean(loss_window)
            if avg_loss < best_avg_loss:
                best_avg_loss = avg_loss
                best_model = copy.deepcopy(q_network)
                torch.save({
                    'model_state_dict': best_model.state_dict(),
                    'avg_loss': best_avg_loss,
                    'episode': episode + 1
                }, model_save_path)
                # print(f"✅ Best model saved at episode {episode+1} | Avg Loss: {best_avg_loss:.4f}")

    checkpoint = torch.load(model_save_path)
    # model.load_state_dict(checkpoint['model_state_dict'])
    # model.eval()
    print(f"Best Model at {checkpoint.get('episode', 'N/A')} | Avg Loss: {checkpoint.get('avg_loss', 'N/A')}")

    return best_model if best_model else q_network, losses

In [6]:
# Train models for different deck counts
print("Training DQN models for different deck counts ===")
dqn_models = {}

for num_decks in range(1, 7):
    print(f"\n=== Training model for {num_decks} deck(s) ===")
    env = BlackjackEnv(numdecks=num_decks, natural=False)
    model_save_path = f"blackjack_dqn_decks_{num_decks}.pth"
    model, _ = train_dqn(env, n_episodes=50000, model_save_path=model_save_path)
    dqn_models[num_decks] = model
    print(f"Completed training for {num_decks} deck(s)")

print("All models trained successfully!")

Training DQN models for different deck counts ===

=== Training model for 1 deck(s) ===
Best Model at 3211 | Avg Loss: 0.299788009673357
Completed training for 1 deck(s)

=== Training model for 2 deck(s) ===
Best Model at 41036 | Avg Loss: 0.3109839116036892
Completed training for 2 deck(s)

=== Training model for 3 deck(s) ===
Best Model at 3544 | Avg Loss: 0.3034180237352848
Completed training for 3 deck(s)

=== Training model for 4 deck(s) ===
Best Model at 998 | Avg Loss: 0.3201422664523125
Completed training for 4 deck(s)

=== Training model for 5 deck(s) ===
Best Model at 49395 | Avg Loss: 0.3064042545855045
Completed training for 5 deck(s)

=== Training model for 6 deck(s) ===
Best Model at 27321 | Avg Loss: 0.3119866617023945
Completed training for 6 deck(s)
All models trained successfully!


In [7]:
import pandas as pd
import random
import torch
import numpy as np
import os
import matplotlib.pyplot as plt

# Function to load a single DQN model
def load_dqn_model(num_decks):
    """
    Load a single DQN model for the specified number of decks
    """
    model_path = f"blackjack_dqn_decks_{num_decks}.pth"
    
    if os.path.exists(model_path):
        # Create a new model with the correct dimensions
        input_dim = 3  # [player_sum, dealer_card, usable_ace]
        output_dim = 2  # [stick, hit]
        model = QNetwork(input_dim, output_dim)
        
        # Load the saved weights
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()  # Set to evaluation mode
        
        print(f"  Successfully loaded model from {model_path}")
        return model
    else:
        print(f"  ERROR: Model file not found at {model_path}")
        return None

# Helper to preprocess state
def preprocess_state(state):
    player_cards, dealer_card, usable_ace = state
    player_sum = sum(player_cards)
    return np.array([player_sum, dealer_card, usable_ace], dtype=np.float32)

# Evaluate a single DQN model on a specific deck size
def evaluate_dqn_model(q_network, num_decks, num_games=10000):
    
    env = BlackjackEnv(numdecks=num_decks, natural=False)

    wins = 0
    losses = 0
    draws = 0
    total_reward = 0
    count_natural_player = 0
    count_natural_dealer = 0

    for game in range(1, num_games+1):
        obs = env.reset(seed=game)
        state = preprocess_state(obs)
        done = False
        episode_reward = 0

        while not done:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                action = q_network(state_tensor).argmax().item()

            next_obs, reward, done, _ = env.step(action)
            if is_natural(env.hands[0]):
                count_natural_player += 1
            if is_natural(env.dealer):
                count_natural_dealer += 1
            state = preprocess_state(next_obs)
            episode_reward += reward

        total_reward += episode_reward
        if episode_reward > 0:
            wins += 1
        elif episode_reward < 0:
            losses += 1
        else:
            draws += 1

    # Return results as a dictionary
    return {
        "Decks": num_decks,
        "Games": num_games,
        "Wins": wins,
        "Draws": draws,
        "Losses": losses,
        "Total Reward": round(total_reward, 4),
        "Win Rate (%)": round((wins / num_games) * 100, 4),
        "Loss Rate (%)": round((losses / num_games) * 100, 4),
        "Draw Rate (%)": round((draws / num_games) * 100, 4),
        "Average Reward": round(total_reward / num_games, 4),
        "Natural Player": count_natural_player,
        "Natural Dealer": count_natural_dealer,
    }
    
# Load models for each deck count
print("\nLoading DQN models...")
dqn_models = {}

for num_decks in range(1, 7):
    print(f"Loading model for {num_decks} deck(s)...")
    model = load_dqn_model(num_decks)
    if model is not None:
        dqn_models[num_decks] = model

# Evaluate each model and collect results
print("\nEvaluating DQN models...")
evaluation_results = []

for num_decks, model in dqn_models.items():
    print(f"Evaluating model for {num_decks} deck(s)...")
    result = evaluate_dqn_model(model, num_decks, num_games=10000)
    evaluation_results.append(result)
    print(f"  Win Rate: {result['Win Rate (%)']:.2f}%, Avg Reward: {result['Average Reward']:.4f}")

# Convert results to DataFrame
df_dqn_results = pd.DataFrame(evaluation_results)
df_dqn_results


Loading DQN models...
Loading model for 1 deck(s)...
  Successfully loaded model from blackjack_dqn_decks_1.pth
Loading model for 2 deck(s)...
  Successfully loaded model from blackjack_dqn_decks_2.pth
Loading model for 3 deck(s)...
  Successfully loaded model from blackjack_dqn_decks_3.pth
Loading model for 4 deck(s)...
  Successfully loaded model from blackjack_dqn_decks_4.pth
Loading model for 5 deck(s)...
  Successfully loaded model from blackjack_dqn_decks_5.pth
Loading model for 6 deck(s)...
  Successfully loaded model from blackjack_dqn_decks_6.pth

Evaluating DQN models...
Evaluating model for 1 deck(s)...
  Win Rate: 46.18%, Avg Reward: -0.0096
Evaluating model for 2 deck(s)...
  Win Rate: 47.31%, Avg Reward: 0.0149
Evaluating model for 3 deck(s)...
  Win Rate: 47.57%, Avg Reward: 0.0219
Evaluating model for 4 deck(s)...
  Win Rate: 46.73%, Avg Reward: 0.0014
Evaluating model for 5 deck(s)...
  Win Rate: 44.73%, Avg Reward: -0.0306
Evaluating model for 6 deck(s)...
  Win Rate

Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward,Natural Player,Natural Dealer
0,1,10000,4618,668,4714,-96.0,46.18,47.14,6.68,-0.0096,465,497
1,2,10000,4731,687,4582,149.0,47.31,45.82,6.87,0.0149,479,433
2,3,10000,4757,705,4538,219.0,47.57,45.38,7.05,0.0219,501,450
3,4,10000,4673,668,4659,14.0,46.73,46.59,6.68,0.0014,473,474
4,5,10000,4473,748,4779,-306.0,44.73,47.79,7.48,-0.0306,457,487
5,6,10000,4607,697,4696,-89.0,46.07,46.96,6.97,-0.0089,481,499


## Evaluation 2

In [8]:
# Helper function to preprocess state
def preprocess_state(state):
    player_cards, dealer_card, usable_ace = state
    player_sum = sum(player_cards)
    return np.array([player_sum, dealer_card, usable_ace], dtype=np.float32)

# Define the bankroll evaluation function
def evaluate_dqn_bankroll(models, num_games=10000, max_decks=6, initial_money=100):
    """
    Evaluate DQN models with a bankroll simulation across different deck sizes
    """
    results = []

    for num_deck in range(1, max_decks + 1):
        
        env = BlackjackEnv(numdecks=num_deck, natural=False)
        q_network = models[num_deck]  # Get the specific model for this deck size
        q_network.eval()  # Set the model to evaluation mode

        money = initial_money
        wins = 0
        losses = 0
        draws = 0
        total_reward = 0
        count_natural_player = 0
        count_natural_dealer = 0
        
        # For tracking bankruptcy
        games_played = 0
        went_bankrupt = False

        for game in range(1, num_games+1):
            if money <= 0:
                went_bankrupt = True
                games_played = game - 1
                break
                
            games_played = game
            obs = env.reset(seed=game)
            done = False
            
            # Bet $1
            money -= 1
            episode_reward = 0
            doubled_down = False

            while not done:
                # Process state to match training format
                state = preprocess_state(obs)
                
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0)
                    q_values = q_network(state_tensor)
                    
                    # Get valid actions for the current state
                    valid_actions = [0, 1]  # Stick, Hit are always valid
                    
                    # Mask invalid actions
                    masked_q_values = q_values.clone()
                    for i in range(q_values.size(1)):
                        if i not in valid_actions:
                            masked_q_values[0, i] = float('-inf')
                    
                    action = torch.argmax(masked_q_values, dim=1).item()
                
                # Check if action is valid (safeguard)
                if action not in valid_actions:
                    action = 0  # Default to stick if somehow invalid
                
                # Execute the action
                try:
                    next_obs, reward, done, _ = env.step(action)
                    if is_natural(env.hands[0]):
                        count_natural_player += 1
                    if is_natural(env.dealer):
                        count_natural_dealer += 1
                    episode_reward += reward
                    obs = next_obs
                except Exception as e:
                    # Fallback if error
                    print(f"Error executing action {action}: {e}")
                    action = 0  # Stick
                    next_obs, reward, done, _ = env.step(action)
                    
                    episode_reward += reward
                    obs = next_obs

                if done:
                    total_reward += episode_reward
                    
                    if episode_reward > 0:
                        wins += 1
                        money += 2
                    elif episode_reward < 0:
                        losses += 1
                    else:
                        draws += 1
                        money += 1

        # Store results
        bankruptcy_message = f"Bankrupt after {games_played} games" if went_bankrupt else "Solvent"
        
        results.append({
            "Decks": num_deck,
            "Games": games_played,
            "Wins": wins,
            "Draws": draws,
            "Losses": losses,
            "Total Reward": round(total_reward, 4),
            "Win Rate (%)": round((wins / games_played) * 100, 4) if games_played > 0 else 0,
            "Loss Rate (%)": round((losses / games_played) * 100, 4) if games_played > 0 else 0,
            "Draw Rate (%)": round((draws / games_played) * 100, 4) if games_played > 0 else 0,
            "Average Reward": round(total_reward / games_played, 4) if games_played > 0 else 0,
            "Final Money": round(money, 2),
            "Natural Player": count_natural_player,
            "Natural Dealer": count_natural_dealer,
            "Status": bankruptcy_message
        })
        
        print(f"Completed simulation for {num_deck} deck(s)")
        print(f"  Win Rate: {(wins/games_played)*100:.2f}%, Final Money: ${money:.2f}")

    return pd.DataFrame(results)

# Run the bankroll experiment
print("\nRunning DQN bankroll experiment...")
df_dqn_bankroll = evaluate_dqn_bankroll(dqn_models, num_games=10000, max_decks=6, initial_money=100)
df_dqn_bankroll


Running DQN bankroll experiment...
Completed simulation for 1 deck(s)
  Win Rate: 46.15%, Final Money: $0.00
Completed simulation for 2 deck(s)
  Win Rate: 47.31%, Final Money: $250.00
Completed simulation for 3 deck(s)
  Win Rate: 47.57%, Final Money: $319.00
Completed simulation for 4 deck(s)
  Win Rate: 46.73%, Final Money: $114.00
Completed simulation for 5 deck(s)
  Win Rate: 44.93%, Final Money: $0.00
Completed simulation for 6 deck(s)
  Win Rate: 46.50%, Final Money: $91.00


Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward,Final Money,Natural Player,Natural Dealer,Status
0,1,9166,4230,606,4330,-100.0,46.1488,47.2398,6.6114,-0.0109,0,420,451,Bankrupt after 9166 games
1,2,10000,4731,688,4581,150.0,47.31,45.81,6.88,0.015,250,481,433,Solvent
2,3,10000,4757,705,4538,219.0,47.57,45.38,7.05,0.0219,319,501,450,Solvent
3,4,10000,4673,668,4659,14.0,46.73,46.59,6.68,0.0014,114,473,474,Solvent
4,5,3009,1352,205,1452,-100.0,44.9319,48.2552,6.8129,-0.0332,0,140,140,Bankrupt after 3009 games
5,6,10000,4650,691,4659,-9.0,46.5,46.59,6.91,-0.0009,91,468,501,Solvent


# Set PPO

In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# PPO Actor-Critic Network
class PPOActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PPOActorCritic, self).__init__()
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        self.policy_head = nn.Linear(64, output_dim)
        self.value_head = nn.Linear(64, 1)

    def forward(self, x):
        shared_out = self.shared(x)
        logits = self.policy_head(shared_out)
        value = self.value_head(shared_out)
        return logits, value

# Adjusted for BlackjackEnv
def preprocess_state(state):
    """
    Convert ((card1, card2), dealer_card, usable_ace) => [player_sum, dealer_card, usable_ace]
    """
    player_cards, dealer_card, usable_ace = state
    player_sum = sum(player_cards)
    return np.array([player_sum, dealer_card, usable_ace], dtype=np.float32)

# Compute GAE
def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    returns = []
    advantages = []
    gae = 0
    next_value = 0

    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * next_value * (1 - dones[step]) - values[step]
        gae = delta + gamma * lam * (1 - dones[step]) * gae
        advantages.insert(0, gae)
        returns.insert(0, gae + values[step])
        next_value = values[step]

    return torch.FloatTensor(returns), torch.FloatTensor(advantages)

# PPO Training Function
def train_ppo(env, n_episodes=5000, gamma=0.99, lam=0.95, clip_eps=0.2,
              lr=3e-4, epochs=4, batch_size=64, model_save_path='ppo_blackjack.pth'):

    input_dim = 3
    output_dim = env.action_space.n

    policy_net = PPOActorCritic(input_dim, output_dim)
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    memory = []
    # Add a reward window for tracking rolling average
    reward_window = deque(maxlen=100)

    best_avg_reward = float('-inf')
    best_model = None

    for episode in range(n_episodes):
        obs = env.reset()
        state = preprocess_state(obs)
        done = False
        episode_data = []
        episode_reward = 0

        while not done:
            state_tensor = torch.FloatTensor(state)
            logits, value = policy_net(state_tensor)
            probs = torch.softmax(logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)

            next_obs, reward, done, _ = env.step(action.item())
            next_state = preprocess_state(next_obs)

            episode_data.append((state, action.item(), reward, log_prob.item(), value.item(), done))
            episode_reward += reward
            state = next_state

        memory.extend(episode_data)
        # Add episode reward to the rolling window
        reward_window.append(episode_reward)

        if len(memory) >= batch_size:
            states, actions, rewards, old_log_probs, values, dones = zip(*memory)

            returns, advantages = compute_gae(rewards, values, dones, gamma, lam)

            states_tensor = torch.FloatTensor(np.array(states))
            actions_tensor = torch.LongTensor(actions)
            old_log_probs_tensor = torch.FloatTensor(old_log_probs)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            for _ in range(epochs):
                logits, value_preds = policy_net(states_tensor)
                probs = torch.softmax(logits, dim=-1)
                dist = torch.distributions.Categorical(probs)

                new_log_probs = dist.log_prob(actions_tensor)
                ratio = torch.exp(new_log_probs - old_log_probs_tensor)

                policy_loss = -torch.min(
                    ratio * advantages,
                    torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
                ).mean()

                value_loss = nn.MSELoss()(value_preds.squeeze(), returns)

                loss = policy_loss + 0.5 * value_loss - 0.01 * dist.entropy().mean()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            memory = []

        # Save model based on rolling average instead of single episode reward
        if len(reward_window) == reward_window.maxlen:  # Wait until window is full
            avg_reward = np.mean(reward_window)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
                best_model = policy_net
                torch.save({
                    'model_state_dict': best_model.state_dict(),
                    'avg_reward': best_avg_reward,
                    'episode': episode + 1
                }, model_save_path)
                # print(f"✅ Best model saved at episode {episode+1} | Avg Reward: {best_avg_reward:.4f}")

        if (episode + 1) % 10000 == 0:
            current_avg = np.mean(list(reward_window)) if reward_window else 0
            print(f"Episode {episode+1} | Avg Reward: {current_avg:.4f}")
    
    checkpoint = torch.load(model_save_path)
    # model.load_state_dict(checkpoint['model_state_dict'])
    # model.eval()
    print(f"Best Model at {checkpoint.get('episode', 'N/A')} | Avg Reward: {checkpoint.get('avg_reward', 'N/A')}")
    
    # checkpoint = torch.load(model_path)
    #         model.load_state_dict(checkpoint['model_state_dict'])
    #         model.eval()
    #         print(f"✅ Loaded model for {deck} deck(s) | Avg Reward: {checkpoint.get('avg_reward', 'N/A')}")
    #     else:
    return best_model if best_model else policy_net

In [38]:
# Train models for different deck counts
ppo_models = {}

for num_decks in range(1, 7):
    print(f"\n=== Training PPO model for {num_decks} deck(s) ===")
    env = BlackjackEnv(numdecks=num_decks, natural=False)
    model_save_path = f"blackjack_ppo_decks_{num_decks}.pth"
    ppo_model = train_ppo(env, n_episodes=50000, model_save_path=model_save_path)
    ppo_models[num_decks] = ppo_model
    print(f"Completed PPO training for {num_decks} deck(s)")

print("All PPO models trained successfully!")


=== Training PPO model for 1 deck(s) ===
Episode 10000 | Avg Reward: -0.0300
Episode 20000 | Avg Reward: -0.1000
Episode 30000 | Avg Reward: 0.1900
Episode 40000 | Avg Reward: 0.0500
Episode 50000 | Avg Reward: -0.0100
Best Model at 29893 | Avg Reward: 0.42
Completed PPO training for 1 deck(s)

=== Training PPO model for 2 deck(s) ===
Episode 10000 | Avg Reward: 0.0200
Episode 20000 | Avg Reward: -0.0600
Episode 30000 | Avg Reward: -0.0200
Episode 40000 | Avg Reward: 0.1100
Episode 50000 | Avg Reward: 0.1600
Best Model at 10305 | Avg Reward: 0.37
Completed PPO training for 2 deck(s)

=== Training PPO model for 3 deck(s) ===
Episode 10000 | Avg Reward: -0.0500
Episode 20000 | Avg Reward: 0.0100
Episode 30000 | Avg Reward: 0.0400
Episode 40000 | Avg Reward: 0.1300
Episode 50000 | Avg Reward: 0.1300
Best Model at 14370 | Avg Reward: 0.42
Completed PPO training for 3 deck(s)

=== Training PPO model for 4 deck(s) ===
Episode 10000 | Avg Reward: 0.1700
Episode 20000 | Avg Reward: 0.0700
Epi

In [44]:
env = BlackjackEnv(numdecks=3, natural=False)
model_save_path = f"blackjack_ppo_decks_{num_decks}.pth"
ppo_model = train_ppo(env, n_episodes=50000, model_save_path=model_save_path)

Episode 10000 | Avg Reward: 0.1100
Episode 20000 | Avg Reward: 0.1100
Episode 30000 | Avg Reward: 0.1000
Episode 40000 | Avg Reward: 0.0600
Episode 50000 | Avg Reward: 0.1600
Best Model at 14932 | Avg Reward: 0.4


In [39]:
import pandas as pd
import random
import torch
import numpy as np
import sys
import os

# === PPO Evaluation on Deck Sizes ===
def evaluate_ppo_on_deck_sizes(models, num_games=10000, max_decks=6):
    results = []

    for num_deck in range(1, max_decks + 1):
        env = BlackjackEnv(numdecks=num_deck, natural=False)
        policy_net = models[num_deck]  # Get the specific model for this deck size

        wins = 0
        losses = 0
        draws = 0
        total_reward = 0
        count_natural_player = 0
        count_natural_dealer = 0

        for game in range(1, num_games+1):
            obs = env.reset(seed=game)
            done = False
            episode_reward = 0

            while not done:
                # Process state to match training format
                state = preprocess_state(obs)
                
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state)
                    logits, _ = policy_net(state_tensor)
                    probs = torch.softmax(logits, dim=-1)
                    
                    # For evaluation, choose the action with highest probability
                    action = torch.argmax(probs).item()
                    
                    # Keep action in valid range [0, 1] for simple env
                    action = min(action, 1)  
                
                next_obs, reward, done, _ = env.step(action)
                episode_reward += reward
                obs = next_obs
                
                # Check for naturals when game ends
                if done:
                    if hasattr(env, 'hands') and len(env.hands) > 0:
                        if is_natural(env.hands[0]):
                            count_natural_player += 1
                    
                    if hasattr(env, 'dealer'):
                        if is_natural(env.dealer):
                            count_natural_dealer += 1

            total_reward += episode_reward
            if episode_reward > 0:
                wins += 1
            elif episode_reward < 0:
                losses += 1
            else:
                draws += 1

        # Store results
        results.append({
            "Decks": num_deck,
            "Games": num_games,
            "Wins": wins,
            "Draws": draws,
            "Losses": losses,
            "Total Reward": round(total_reward, 4),
            "Win Rate (%)": round((wins / num_games) * 100, 4),
            "Loss Rate (%)": round((losses / num_games) * 100, 4),
            "Draw Rate (%)": round((draws / num_games) * 100, 4),
            "Average Reward": round(total_reward / num_games, 4),
            "Natural Player": count_natural_player,
            "Natural Dealer": count_natural_dealer
        })

    return pd.DataFrame(results)

def evaluate_ppo_bankroll(models, num_games=10000, max_decks=6, initial_money=100):
    """
    Evaluate PPO models with a bankroll simulation across different deck sizes
    """
    results = []

    for num_deck in range(1, max_decks + 1):
        env = BlackjackEnv(numdecks=num_deck, natural=False)
        policy_net = models[num_deck]  # Get the specific model for this deck size
        policy_net.eval()  # Set the model to evaluation mode

        money = initial_money
        wins = 0
        losses = 0
        draws = 0
        total_reward = 0
        count_natural_player = 0
        count_natural_dealer = 0
        
        # For tracking bankruptcy
        games_played = 0
        went_bankrupt = False

        for game in range(1, num_games+1):
            if money <= 0:
                went_bankrupt = True
                games_played = game - 1
                break
                
            games_played = game
            obs = env.reset(seed=game)
            done = False
            
            # Bet $1
            money -= 1
            episode_reward = 0

            while not done:
                # Process state to match training format
                state = preprocess_state(obs)
                
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state)
                    logits, _ = policy_net(state_tensor)
                    probs = torch.softmax(logits, dim=-1)
                    
                    # For simple environment, use only actions 0 and 1
                    masked_logits = logits.clone()
                    for i in range(len(masked_logits)):
                        if i > 1:  # Only keep stick (0) and hit (1)
                            masked_logits[i] = float('-inf')
                    
                    probs = torch.softmax(masked_logits, dim=-1)
                    action = torch.argmax(probs).item()
                
                # Execute the action
                next_obs, reward, done, _ = env.step(action)
                episode_reward += reward
                obs = next_obs
                
                # Check for naturals when game ends
                if done:
                    if hasattr(env, 'hands') and len(env.hands) > 0:
                        if is_natural(env.hands[0]):
                            count_natural_player += 1
                    
                    if hasattr(env, 'dealer'):
                        if is_natural(env.dealer):
                            count_natural_dealer += 1

            # End of episode accounting
            total_reward += episode_reward
            
            # Update wins/losses/draws and bankroll
            if episode_reward > 0:
                wins += 1
                money += 2  # Return original $1 bet plus $1 winnings
            elif episode_reward < 0:
                losses += 1
                # Money already subtracted for bet
            else:
                draws += 1
                money += 1  # Return original bet for a push

        # Store results
        bankruptcy_message = f"Bankrupt after {games_played} games" if went_bankrupt else "Solvent"
        
        results.append({
            "Decks": num_deck,
            "Games": games_played,
            "Wins": wins,
            "Draws": draws,
            "Losses": losses,
            "Total Reward": round(total_reward, 4),
            "Win Rate (%)": round((wins / games_played) * 100, 4) if games_played > 0 else 0,
            "Loss Rate (%)": round((losses / games_played) * 100, 4) if games_played > 0 else 0,
            "Draw Rate (%)": round((draws / games_played) * 100, 4) if games_played > 0 else 0,
            "Average Reward": round(total_reward / games_played, 4) if games_played > 0 else 0,
            "Final Money": round(money, 2),
            "Natural Player": count_natural_player,
            "Natural Dealer": count_natural_dealer,
            "Status": bankruptcy_message
        })

    return pd.DataFrame(results)

In [43]:
def load_ppo_models(base_path, max_decks=6):
    """Load all PPO models for different deck sizes"""
    models = {}
    for deck in range(1, max_decks + 1):
        # Define input dim for your specific environment
        input_dim = 3  # player_sum, dealer_card, usable_ace
        output_dim = 2  # Stick, Hit
        
        model = PPOActorCritic(input_dim, output_dim)
        model_path = f"blackjack_ppo_decks_{deck}.pth"
        
        if os.path.exists(model_path):
            checkpoint = torch.load(model_path)
            model.load_state_dict(checkpoint['model_state_dict'])
            model.eval()
            print(f"✅ Loaded model for {deck} deck(s) | Avg Reward: {checkpoint.get('avg_reward', 'N/A')}")
        else:
            print(f"⚠️ No model found for {deck} deck(s) at {model_path}")
        
        models[deck] = model
    
    return models

max_decks = 6
ppo_models = load_ppo_models("./", max_decks)
df_ppo_eval = evaluate_ppo_on_deck_sizes(ppo_models, num_games=10000, max_decks=max_decks)
df_ppo_eval

✅ Loaded model for 1 deck(s) | Avg Reward: 0.42
✅ Loaded model for 2 deck(s) | Avg Reward: 0.37
✅ Loaded model for 3 deck(s) | Avg Reward: 0.42
✅ Loaded model for 4 deck(s) | Avg Reward: 0.49
✅ Loaded model for 5 deck(s) | Avg Reward: 0.45
✅ Loaded model for 6 deck(s) | Avg Reward: 0.38


Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward,Natural Player,Natural Dealer
0,1,10000,4685,683,4632,53.0,46.85,46.32,6.83,0.0053,503,478
1,2,10000,4686,656,4658,28.0,46.86,46.58,6.56,0.0028,513,463
2,3,10000,4621,645,4734,-113.0,46.21,47.34,6.45,-0.0113,469,486
3,4,10000,4676,683,4641,35.0,46.76,46.41,6.83,0.0035,483,503
4,5,10000,4613,758,4629,-16.0,46.13,46.29,7.58,-0.0016,483,502
5,6,10000,4731,655,4614,117.0,47.31,46.14,6.55,0.0117,491,476


In [45]:
df_ppo_bankroll = evaluate_ppo_bankroll(ppo_models, num_games=10000, max_decks=max_decks, initial_money=100)
df_ppo_bankroll

Unnamed: 0,Decks,Games,Wins,Draws,Losses,Total Reward,Win Rate (%),Loss Rate (%),Draw Rate (%),Average Reward,Final Money,Natural Player,Natural Dealer,Status
0,1,10000,4698,682,4620,78.0,46.98,46.2,6.82,0.0078,178,507,478,Solvent
1,2,10000,4686,656,4658,28.0,46.86,46.58,6.56,0.0028,128,513,463,Solvent
2,3,6941,3188,465,3288,-100.0,45.93,47.3707,6.6993,-0.0144,0,324,331,Bankrupt after 6941 games
3,4,10000,4691,713,4596,95.0,46.91,45.96,7.13,0.0095,195,481,483,Solvent
4,5,10000,4613,758,4629,-16.0,46.13,46.29,7.58,-0.0016,84,483,502,Solvent
5,6,10000,4731,655,4614,117.0,47.31,46.14,6.55,0.0117,217,491,476,Solvent


In [25]:
def evaluate_strategy_adherence(models, max_decks=6):
    """Compare agent's strategy against basic strategy for blackjack"""
    results = []
    
    # Basic strategy table (simplified example)
    # Format: {player_sum: {dealer_upcard: action}}
    # 0 = stand, 1 = hit
    basic_strategy = {
        # Hard totals
        21: {card: 0 for card in range(1, 11)},  # Always stand on 21
        20: {card: 0 for card in range(1, 11)},  # Always stand on 20
        19: {card: 0 for card in range(1, 11)},  # Always stand on 19
        18: {card: 0 for card in range(1, 11)},  # Always stand on 18
        17: {card: 0 for card in range(1, 11)},  # Always stand on 17
        16: {1: 1, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1},  # Stand vs 2-6, hit vs 7-A
        15: {1: 1, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1},
        14: {1: 1, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1},
        13: {1: 1, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1},
        12: {1: 1, 2: 1, 3: 1, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1},
        11: {card: 1 for card in range(1, 11)},  # Always hit on 11 or less
        10: {card: 1 for card in range(1, 11)},
        9: {card: 1 for card in range(1, 11)},
        8: {card: 1 for card in range(1, 11)},
        # Soft totals with usable_ace=1
        'A,9': {card: 0 for card in range(1, 11)},  # Always stand
        'A,8': {card: 0 for card in range(1, 11)},  # Always stand
        'A,7': {1: 1, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1},
        'A,6': {1: 1, 2: 1, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 1, 10: 1},
        'A,5': {1: 1, 2: 1, 3: 1, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 1},
        'A,4': {1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 0, 7: 0, 8: 0, 9: 0, 10: 1},
        'A,3': {1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 0, 9: 0, 10: 1},
        'A,2': {1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 0, 10: 1},
    }
    
    for num_deck in range(1, max_decks + 1):
        policy_net = models[num_deck]
        policy_net.eval()
        
        agreement_count = 0
        total_decisions = 0
        
        # Test all player sums (4-21) vs all dealer upcards (1-10)
        for player_sum in range(4, 22):
            for dealer_card in range(1, 11):
                for usable_ace in [0, 1]:
                    # Skip invalid combinations
                    if usable_ace == 1 and player_sum < 12:
                        continue
                    
                    # Get basic strategy recommendation
                    if usable_ace == 1 and player_sum <= 21:
                        # For soft hands
                        ace_value = player_sum - 11
                        key = f'A,{ace_value}' if f'A,{ace_value}' in basic_strategy else player_sum
                    else:
                        key = player_sum
                    
                    if key in basic_strategy and dealer_card in basic_strategy[key]:
                        basic_action = basic_strategy[key][dealer_card]
                        
                        # Get agent's decision
                        state = np.array([player_sum, dealer_card, usable_ace], dtype=np.float32)
                        with torch.no_grad():
                            state_tensor = torch.FloatTensor(state)
                            logits, _ = policy_net(state_tensor)
                            probs = torch.softmax(logits, dim=-1)
                            agent_action = torch.argmax(probs[:2]).item()  # Only consider hit/stand
                        
                        if agent_action == basic_action:
                            agreement_count += 1
                        
                        total_decisions += 1
        
        agreement_percentage = (agreement_count / total_decisions) * 100 if total_decisions > 0 else 0
        
        results.append({
            "Decks": num_deck,
            "Total Decisions": total_decisions,
            "Strategy Agreement": agreement_count,
            "Agreement Percentage": round(agreement_percentage, 2),
        })
    
    return pd.DataFrame(results)

evaluate_strategy_adherence(ppo_models, max_decks=6)
evaluate_strategy_adherence(dqn_models, max_decks=6)

IndexError: slice() cannot be applied to a 0-dim tensor.