**Part 3**

* Adding vectorized BlackJack Environment
* Adding Device Agnostic code (GPU Training)
* Attempting to use Softmax (Categorical Distribution) implementation instead of Sigmoid (Binary Bernoulli Distribution)

**Results**

* Still very slow during training

# Imports

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import numpy as np
import gymnasium as gym

# Testing

In [None]:
# Single (non-vectorized) Blackjack environment used for quick manual testing.
#   * `sab=True` uses the Sutton & Barto version of the rules.
#   * Passing `render_mode="human"` would create a pygame popup window to analyze play.
env = gym.make("Blackjack-v1", sab=True)

# Agent

In [None]:
class BlackJackAgent(nn.Module):
    """Small two-layer MLP policy that maps an observation to action logits.

    Args:
        obs_size: Number of features in one observation.
        hidden_size: Width of the single hidden layer.
        output_size: Number of discrete actions (size of the logits vector).
    """

    def __init__(self, obs_size=3, hidden_size=10, output_size=2):
        super().__init__()
        self.layer_1 = nn.Linear(obs_size, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, output_size)
        # Normalize over the last (action) axis so that both batched
        # (batch, actions) and unbatched (actions,) logits are supported.
        self.action_probs_activation_layer = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the raw (un-normalized) action logits for observation(s) ``x``."""
        x = torch.relu(self.layer_1(x))
        logits = self.layer_2(x)
        return logits       # apply `get_action_probs` to turn these into probabilities

    def get_action_probs(self, logits):
        """Get the probabilities of each action."""
        return self.action_probs_activation_layer(logits)

    def sample_action(self, action: torch.Tensor):
        """Sample one action from the policy and return it with its log-probability.

        Args:
            action: Despite its name (kept for backward compatibility), this is
                the *observation* tensor fed to the network. It must describe a
                single observation, because the sampled action is converted to
                a Python number with ``.item()``.

        Returns:
            A tuple ``(sampled_action, log_prob)`` where ``sampled_action`` is a
            Python int and ``log_prob`` is the tensor holding the
            log-probability of that action under the current policy.
        """
        logits = self(action)
        dist = torch.distributions.Categorical(logits=logits)
        sampled = dist.sample()
        # `log_prob` requires a tensor, so evaluate it before calling `.item()`.
        log_prob_of_action = dist.log_prob(sampled)
        return sampled.item(), log_prob_of_action

# Training Loop

In [None]:
def training_blackjack_agent(epochs=50, learning_rate=0.0001, batch_size=64, gamma=0.99, k_epochs=64, epsilon=0.2, beta_kl=0.01, max_grad_norm=0.5, entropy_coeff=0.01, log_iterations=10, device="cpu", num_envs=16) -> BlackJackAgent:
    """Train a ``BlackJackAgent`` on vectorized Blackjack with a clipped-surrogate objective.

    Every outer epoch snapshots the current policy, uses the snapshot to roll
    out episodes in ``num_envs`` parallel environments, turns the collected
    rewards into batch-normalized discounted returns (used as advantages), and
    then runs ``k_epochs`` optimizer steps on
    ``policy_loss + beta_kl * KL(new || old) - entropy_coeff * entropy``.

    Args:
        epochs: Number of outer collect-then-optimize iterations.
        learning_rate: Adam learning rate.
        batch_size: Number of finished episodes to collect per outer epoch
            (collection also stops after ``batch_size * 5`` environment steps).
        gamma: Discount factor for the return-to-go.
        k_epochs: Optimizer steps performed on each collected batch.
        epsilon: Clipping range of the probability ratio.
        beta_kl: Weight of the KL penalty between the new and old policies.
        max_grad_norm: Gradient-norm clipping threshold.
        entropy_coeff: Weight of the entropy bonus.
        log_iterations: Log every this many outer epochs; ``0`` disables logging.
        device: Device (string or ``torch.device``) that the policies and
            training tensors are moved to.
        num_envs: Number of parallel environments.

    Returns:
        The trained policy, switched to eval mode.
    """
    print(f"Training BlackJack Agent's Policy on {device} with {epochs} epochs, {learning_rate} learning rate, batch size {batch_size}, and KL beta {beta_kl}.")

    vec_env = gym.make_vec("Blackjack-v1", num_envs=num_envs, sab=True) # `sab=True` uses the Sutton & Barto version

    # try/finally guarantees the environments are closed even if training raises.
    try:
        New_Policy = BlackJackAgent().to(device)   # STEP 3 ||
        optimizer = optim.Adam(params=New_Policy.parameters(), lr=learning_rate)

        for epoch in tqdm(range(epochs), desc="Main Epoch (Outer Loop)", leave=False):     # STEP 4 ||
            # STEP 5 || Sample a batch D_b from D --> OMITTED
            # STEP 6 || Update the old policy model PI old <- PI new
            Policy_Old = BlackJackAgent().to(device)
            Policy_Old.load_state_dict(New_Policy.state_dict())
            # eval() only switches module behaviour; gradient tracking is
            # disabled separately with torch.no_grad() wherever Policy_Old is used.
            Policy_Old.eval()

            # Trajectories of every episode collected in the current batch.
            completed_batch_trajectories = []

            # Reset all environments. The observation comes back as one array per
            # observation component; stacking on axis=1 gives one row per environment.
            raw_observations, _ = vec_env.reset()
            observations = np.stack(raw_observations, axis=1)

            # One in-progress trajectory per parallel environment.
            current_episode_trajectories = [
                {"states": [], "actions": [], "rewards": [], "log_probs": []}
                for _ in range(num_envs)
            ]

            # --- STEP 7 || Collect a batch of experiences using the old policy ---
            episodes_collected_in_batch = 0
            max_steps_per_batch_limit = batch_size * 5 # Safety limit to prevent infinite loops if episodes are very long
            current_total_steps = 0

            while episodes_collected_in_batch < batch_size and current_total_steps < max_steps_per_batch_limit:
                obs_tensor = torch.tensor(observations, dtype=torch.float32).to(device)

                with torch.no_grad():
                    logits = Policy_Old(obs_tensor)
                    dist = torch.distributions.Categorical(logits=logits)
                    actions = dist.sample() # One action per environment
                    log_probs = dist.log_prob(actions)

                # Transfer to the CPU once per step instead of once per environment.
                actions_np = actions.cpu().numpy() # env.step() needs CPU data
                log_probs_cpu = log_probs.cpu()

                # NOTE(review): the vector env resets finished sub-environments on its
                # own; depending on the installed Gymnasium version the step right after
                # a termination may be a pure reset step. Confirm that this does not
                # record spurious transitions for the version in use.
                raw_next_obs, rewards, dones, truncateds, _ = vec_env.step(actions_np)
                next_obs = np.stack(raw_next_obs, axis=1)
                current_total_steps += num_envs

                # Record the transition of each parallel environment.
                for env_idx in range(num_envs):
                    trajectory = current_episode_trajectories[env_idx]
                    trajectory["states"].append(observations[env_idx])
                    trajectory["actions"].append(int(actions_np[env_idx]))
                    trajectory["rewards"].append(rewards[env_idx])
                    trajectory["log_probs"].append(log_probs_cpu[env_idx])

                    if dones[env_idx] or truncateds[env_idx]:
                        completed_batch_trajectories.append(trajectory)
                        episodes_collected_in_batch += 1
                        # Start a fresh trajectory for this environment.
                        current_episode_trajectories[env_idx] = {"states": [], "actions": [], "rewards": [], "log_probs": []}

                observations = next_obs  # Update the observation

            # Episodes still running when collection stopped are kept as partial
            # trajectories: they contribute steps to the batch (and to the logged
            # average reward), but their returns are not bootstrapped.
            for trajectory in current_episode_trajectories:
                if trajectory["states"]:
                    completed_batch_trajectories.append(trajectory)

            # Flattened data from ALL trajectories in the current batch.
            all_states = []
            all_actions = []
            all_old_log_probs = []
            all_discounted_rewards = []

            # STEP 8 || Calculate discounted rewards for the collected trajectories
            for episode_trajectory in completed_batch_trajectories:
                if not episode_trajectory["rewards"]:
                    continue
                all_states.extend(episode_trajectory["states"])
                all_actions.extend(episode_trajectory["actions"])
                all_old_log_probs.extend(episode_trajectory["log_probs"])
                all_discounted_rewards.extend(_discounted_returns(episode_trajectory["rewards"], gamma))

            # The four lists always grow together, so checking one is sufficient.
            if not all_states:
                print(f"Warning: Epoch {epoch + 1}: Insufficient data collected for optimization. "
                      f"Skipping policy update for this epoch.")
                print(f"  Counts: States={len(all_states)}, Actions={len(all_actions)}, "
                      f"LogProbs={len(all_old_log_probs)}, Rewards={len(all_discounted_rewards)}")
                continue

            # Convert the batch into tensors. They are built from plain data (the
            # log-probs were produced under no_grad), so none carries a graph.
            all_states_tensor = torch.tensor(np.array(all_states), dtype=torch.float32).to(device)
            all_actions_tensor = torch.tensor(all_actions, dtype=torch.long).to(device)
            all_old_log_probs_tensor = torch.stack(all_old_log_probs).to(dtype=torch.float32, device=device)
            all_discounted_rewards_tensor = torch.tensor(all_discounted_rewards, dtype=torch.float32).to(device)

            # STEP 9 || Advantage of each time step = batch-normalized discounted return
            all_advantages_tensor = all_discounted_rewards_tensor - all_discounted_rewards_tensor.mean()
            if all_discounted_rewards_tensor.numel() > 1:
                all_advantages_tensor = all_advantages_tensor / (all_discounted_rewards_tensor.std() + 1e-8)
            # With a single sample std() is NaN; the centered value (0) is used as is.

            New_Policy.train()  # Prepare NN for updates

            # The old policy and the states are fixed during the inner loop, so the
            # reference distribution for the KL term is computed once.
            with torch.no_grad():
                old_dist = torch.distributions.Categorical(logits=Policy_Old(all_states_tensor))

            # Stay None when k_epochs == 0 so that logging can skip them.
            total_loss = None
            R1_ratio = None
            entropy = None

            # --- STEP 10 || GRPO Optimization ---
            for _ in tqdm(range(k_epochs), desc=f"Epoch {epoch+1}/{epochs} (Inner K-Epochs)", leave=True):
                new_logits = New_Policy(all_states_tensor)
                new_dist = torch.distributions.Categorical(logits=new_logits)
                new_log_probs = new_dist.log_prob(all_actions_tensor)
                entropy = new_dist.entropy().mean() # Entropy bonus for regularization

                R1_ratio = torch.exp(new_log_probs - all_old_log_probs_tensor)

                unclipped_surrogate = R1_ratio * all_advantages_tensor
                clipped_surrogate = torch.clamp(input=R1_ratio, min=1.0-epsilon, max=1.0+epsilon) * all_advantages_tensor

                policy_loss = -torch.min(unclipped_surrogate, clipped_surrogate).mean()

                # KL divergence per sample, averaged over the batch
                kl_loss = torch.distributions.kl.kl_divergence(p=new_dist, q=old_dist).mean()

                # Total Loss for GRPO
                total_loss = policy_loss + beta_kl * kl_loss - entropy_coeff * entropy

                # STEP 11 || Policy Updates
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(New_Policy.parameters(), max_grad_norm)
                optimizer.step()    # Minimizing the negated surrogate maximizes the objective

            # --- Logging and Evaluation ---
            if log_iterations and (epoch + 1) % log_iterations == 0:
                if total_loss is not None:
                    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}, Ratio: {R1_ratio.mean().item():.5f}, Entropy Term: {entropy.item():.5f}")
                # For Blackjack, the reward is often -1, 0, or 1.
                avg_reward = sum(sum(ep["rewards"]) for ep in completed_batch_trajectories) / len(completed_batch_trajectories) if len(completed_batch_trajectories) > 0 else 0
                print(f"Average reward per episode in batch: {avg_reward:.2f}")

        New_Policy.eval()   # Change to eval mode for evaluation
    finally:
        vec_env.close() # Close the environments after training (or on error)

    print("Training complete.")
    return New_Policy # Return the trained policy


def _discounted_returns(rewards, gamma):
    """Return the discounted return-to-go for every step of one trajectory."""
    returns = []
    running_return = 0.0
    # Accumulate from the last step backwards, then restore chronological order.
    for reward in reversed(rewards):
        running_return = float(reward) + gamma * running_return
        returns.append(running_return)
    returns.reverse()
    return returns

In [None]:
# Smoke-test run with the function's default arguments; the returned policy is discarded.
_ = training_blackjack_agent()

Training BlackJack Agent's Policy on cpu with 50 epochs, 0.0001 learning rate, batch size 64, and KL beta 0.01.


Epoch 1/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 517.61it/s]
Epoch 2/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 448.16it/s]
Epoch 3/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 472.10it/s]
Epoch 4/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 490.62it/s]
Epoch 5/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 528.15it/s]
Epoch 6/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 556.57it/s]
Epoch 7/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 528.17it/s]
Epoch 8/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 477.56it/s]
Epoch 9/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 483.50it/s]
Epoch 10/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 511.50it/s]
Main Epoch (Outer Loop):  20%|██        | 10/50 [00:01<00:05,  7.07it/s]

Epoch 10/50, Loss: -0.0110, Ratio: 0.99524, Entropy Term: 0.35876
Average reward per episode in batch: -0.16


Epoch 11/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 615.60it/s]
Epoch 12/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 573.55it/s]
Epoch 13/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 598.65it/s]
Epoch 14/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 615.44it/s]
Epoch 15/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 609.45it/s]
Epoch 16/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 622.96it/s]
Epoch 17/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 653.11it/s]
Epoch 18/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 622.52it/s]
Epoch 19/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 584.66it/s]
Epoch 20/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 641.82it/s]
Main Epoch (Outer Loop):  40%|████      | 20/50 [00:02<00:03,  8.69it/s]

Epoch 20/50, Loss: -0.0063, Ratio: 1.00221, Entropy Term: 0.28721
Average reward per episode in batch: -0.18


Epoch 21/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 653.10it/s]
Epoch 22/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 608.77it/s]
Epoch 23/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 627.92it/s]
Epoch 24/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 620.55it/s]
Epoch 25/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 614.03it/s]
Epoch 26/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 584.60it/s]
Epoch 27/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 600.39it/s]
Epoch 28/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 600.34it/s]
Epoch 29/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 621.77it/s]
Epoch 30/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 577.23it/s]
Main Epoch (Outer Loop):  60%|██████    | 30/50 [00:03<00:02,  8.48it/s]

Epoch 30/50, Loss: -0.0112, Ratio: 0.99839, Entropy Term: 0.29060
Average reward per episode in batch: -0.09


Epoch 31/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 609.67it/s]
Epoch 32/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 596.59it/s]
Epoch 33/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 616.18it/s]
Epoch 34/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 622.35it/s]
Epoch 35/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 606.46it/s]
Epoch 36/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 586.03it/s]
Epoch 37/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 615.86it/s]
Epoch 38/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 647.81it/s]
Epoch 39/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 614.55it/s]
Epoch 40/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 605.74it/s]
Main Epoch (Outer Loop):  80%|████████  | 40/50 [00:04<00:01,  8.66it/s]

Epoch 40/50, Loss: -0.0134, Ratio: 0.99445, Entropy Term: 0.26544
Average reward per episode in batch: -0.11


Epoch 41/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 629.17it/s]
Epoch 42/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 557.60it/s]
Epoch 43/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 661.70it/s]
Epoch 44/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 628.00it/s]
Epoch 45/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 654.32it/s]
Epoch 46/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 605.13it/s]
Epoch 47/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 602.86it/s]
Epoch 48/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 628.72it/s]
Epoch 49/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 599.55it/s]
Epoch 50/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 578.43it/s]
                                                                        

Epoch 50/50, Loss: -0.0075, Ratio: 1.00002, Entropy Term: 0.29992
Average reward per episode in batch: 0.07
Training complete.




Training BlackJack Agent's Policy on cpu with 50 epochs, 0.0001 learning rate, batch size 64, and KL beta 0.01.
                                                               
(array([21, 12, 19, 19, 20, 15, 18, 16, 10,  7, 19, 10, 17, 15, 14,  9]), array([ 5,  5,  1,  4, 10, 10,  3,  9,  2,  6,  1,  6,  9,  5,  9,  8]), array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))


In [None]:
# Same default run, but with the policies and training tensors placed on the GPU.
# NOTE(review): "cuda" is passed unconditionally; presumably this fails on a machine
# without CUDA — guard with `torch.cuda.is_available()` if that matters.
_ = training_blackjack_agent(device="cuda")

Training BlackJack Agent's Policy on cuda with 50 epochs, 0.0001 learning rate, batch size 64, and KL beta 0.01.


Epoch 1/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 155.30it/s]
Epoch 2/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 256.06it/s]
Epoch 3/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 278.24it/s]
Epoch 4/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 284.68it/s]
Epoch 5/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 307.77it/s]
Epoch 6/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 318.34it/s]
Epoch 7/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 342.41it/s]
Epoch 8/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 304.59it/s]
Epoch 9/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 324.50it/s]
Epoch 10/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 304.27it/s]
Main Epoch (Outer Loop):  20%|██        | 10/50 [00:02<00:09,  4.27it/s]

Epoch 10/50, Loss: -0.0106, Ratio: 0.99860, Entropy Term: 0.42374
Average reward per episode in batch: -0.29


Epoch 11/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 322.07it/s]
Epoch 12/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 326.85it/s]
Epoch 13/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 303.20it/s]
Epoch 14/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 313.83it/s]
Epoch 15/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 320.23it/s]
Epoch 16/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 304.54it/s]
Epoch 17/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 334.44it/s]
Epoch 18/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 307.94it/s]
Epoch 19/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 330.05it/s]
Epoch 20/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 305.78it/s]
Main Epoch (Outer Loop):  40%|████      | 20/50 [00:04<00:06,  4.40it/s]

Epoch 20/50, Loss: -0.0095, Ratio: 0.99948, Entropy Term: 0.29227
Average reward per episode in batch: -0.23


Epoch 21/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 323.01it/s]
Epoch 22/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 312.17it/s]
Epoch 23/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 322.06it/s]
Epoch 24/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 297.45it/s]
Epoch 25/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 300.94it/s]
Epoch 26/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 317.61it/s]
Epoch 27/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 320.67it/s]
Epoch 28/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 336.81it/s]
Epoch 29/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 329.01it/s]
Epoch 30/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 321.81it/s]
Main Epoch (Outer Loop):  60%|██████    | 30/50 [00:07<00:04,  4.59it/s]

Epoch 30/50, Loss: -0.0030, Ratio: 0.99875, Entropy Term: 0.16488
Average reward per episode in batch: -0.12


Epoch 31/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 318.62it/s]
Epoch 32/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 338.63it/s]
Epoch 33/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 311.90it/s]
Epoch 34/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 288.53it/s]
Epoch 35/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 334.05it/s]
Epoch 36/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 317.15it/s]
Epoch 37/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 327.81it/s]
Epoch 38/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 294.92it/s]
Epoch 39/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 316.69it/s]
Epoch 40/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 272.92it/s]
Main Epoch (Outer Loop):  80%|████████  | 40/50 [00:09<00:02,  4.18it/s]

Epoch 40/50, Loss: -0.0162, Ratio: 0.98002, Entropy Term: 0.21612
Average reward per episode in batch: -0.10


Epoch 41/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 302.20it/s]
Epoch 42/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 299.70it/s]
Epoch 43/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 297.24it/s]
Epoch 44/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 307.48it/s]
Epoch 45/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 296.50it/s]
Epoch 46/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 329.87it/s]
Epoch 47/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 291.51it/s]
Epoch 48/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 309.53it/s]
Epoch 49/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 308.86it/s]
Epoch 50/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 313.46it/s]
                                                                        

Epoch 50/50, Loss: -0.0038, Ratio: 0.99892, Entropy Term: 0.26437
Average reward per episode in batch: -0.04
Training complete.




Training BlackJack Agent's Policy with 10 epochs, 0.0001 learning rate, batch size 4, and KL beta 0.01.
* Batch of Trajectories:
* [{'states': [(12, 10, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1239])]},
* {'states': [(20, 7, 0)], 'actions': [0], 'rewards': [1.0], 'log_probs': [tensor([-0.0815])]},
* {'states': [(12, 1, 0), (17, 1, 0)], 'actions': [1, 1], 'rewards': [0.0, -1.0], 'log_probs': [tensor([-1.5968]), tensor([-1.9474])]},
* {'states': [(6, 6, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.2144])]},
* {'states': [(7, 4, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.2734])]},
* {'states': [(13, 3, 1)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1471])]},
* {'states': [(15, 10, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1000])]},
* {'states': [(12, 10, 0)], 'actions': [0], 'rewards': [1.0], 'log_probs': [tensor([-0.1239])]},
* {'states': [(14, 7, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1320])]}]

In [None]:
# Example usage (assuming you have a way to call this function, e.g., in a main block)
# Script entry point: train a policy with the tuned hyper-parameters below,
# then estimate its average reward over a number of sampled test episodes.
if __name__ == "__main__":
    # Device-agnostic code: prefer the GPU whenever one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # A large batch size is used for more stable training and to reduce
    # empty-batch issues; adjust any of these values as needed.
    trained_policy = training_blackjack_agent(
        epochs=2000,
        k_epochs=128,
        batch_size=2048,
        num_envs=16,
        learning_rate=0.0003,
        gamma=0.99,
        epsilon=0.2,
        beta_kl=0.01,
        entropy_coeff=0.001,
        log_iterations=100,
        device=device,
    )

    print("\nTesting the trained policy:")
    test_env = gym.make("Blackjack-v1", sab=True)
    num_test_episodes = 1000
    total_test_rewards = 0

    for _ in range(num_test_episodes):
        obs, _ = test_env.reset()
        episode_reward = 0
        done = truncated = False
        # Play one episode, sampling every action from the trained policy.
        while not (done or truncated):
            obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                logits = trained_policy(obs_tensor)
                dist = torch.distributions.Categorical(logits=logits)
                action = dist.sample()
            obs, reward, done, truncated, _ = test_env.step(action.item())
            episode_reward += reward
        total_test_rewards += episode_reward

    print(f"Average reward over {num_test_episodes} test episodes: {total_test_rewards / num_test_episodes:.4f}")
    test_env.close()

Training BlackJack Agent's Policy on cuda with 2000 epochs, 0.0003 learning rate, batch size 2048, and KL beta 0.01.


Main Epoch (Outer Loop):   0%|          | 0/2000 [00:00<?, ?it/s]
Epoch 1/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 1/2000 (Inner K-Epochs):   1%|          | 1/128 [00:00<00:55,  2.27it/s][A
Epoch 1/2000 (Inner K-Epochs):  25%|██▌       | 32/128 [00:00<00:01, 77.36it/s][A
Epoch 1/2000 (Inner K-Epochs):  51%|█████     | 65/128 [00:00<00:00, 141.71it/s][A
Epoch 1/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 151.71it/s]
Main Epoch (Outer Loop):   0%|          | 1/2000 [00:02<1:17:00,  2.31s/it]
Epoch 2/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 2/2000 (Inner K-Epochs):  26%|██▌       | 33/128 [00:00<00:00, 320.87it/s][A
Epoch 2/2000 (Inner K-Epochs):  52%|█████▏    | 66/128 [00:00<00:00, 322.98it/s][A
Epoch 2/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 319.13it/s]
Main Epoch (Outer Loop):   0%|          | 2/2000 [00:03<52:49,  1.59s/it]  
Epoch 3/2000 (Inner K-Epochs):   0%|          | 0/128

Epoch 100/2000, Loss: -0.0012, Ratio: 0.99942, Entropy Term: 0.17746
Average reward per episode in batch: -0.06



Epoch 101/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 101/2000 (Inner K-Epochs):  24%|██▍       | 31/128 [00:00<00:00, 302.30it/s][A
Epoch 101/2000 (Inner K-Epochs):  48%|████▊     | 62/128 [00:00<00:00, 301.64it/s][A
Epoch 101/2000 (Inner K-Epochs):  73%|███████▎  | 93/128 [00:00<00:00, 304.40it/s][A
Epoch 101/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 298.39it/s]
Main Epoch (Outer Loop):   5%|▌         | 101/2000 [02:10<42:51,  1.35s/it]
Epoch 102/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 102/2000 (Inner K-Epochs):  23%|██▎       | 30/128 [00:00<00:00, 298.38it/s][A
Epoch 102/2000 (Inner K-Epochs):  49%|████▉     | 63/128 [00:00<00:00, 311.65it/s][A
Epoch 102/2000 (Inner K-Epochs):  74%|███████▍  | 95/128 [00:00<00:00, 311.84it/s][A
Epoch 102/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 305.98it/s]
Main Epoch (Outer Loop):   5%|▌         | 102/2000 [02:11<41:49,  1.32s/it]
Epoch 103/2

Epoch 200/2000, Loss: -0.0014, Ratio: 0.99958, Entropy Term: 0.11769
Average reward per episode in batch: -0.04



Epoch 201/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 201/2000 (Inner K-Epochs):  25%|██▌       | 32/128 [00:00<00:00, 310.47it/s][A
Epoch 201/2000 (Inner K-Epochs):  50%|█████     | 64/128 [00:00<00:00, 288.19it/s][A
Epoch 201/2000 (Inner K-Epochs):  73%|███████▎  | 93/128 [00:00<00:00, 268.82it/s][A
Epoch 201/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 278.74it/s]
Main Epoch (Outer Loop):  10%|█         | 201/2000 [04:26<39:34,  1.32s/it]
Epoch 202/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 202/2000 (Inner K-Epochs):  22%|██▏       | 28/128 [00:00<00:00, 272.86it/s][A
Epoch 202/2000 (Inner K-Epochs):  46%|████▌     | 59/128 [00:00<00:00, 292.25it/s][A
Epoch 202/2000 (Inner K-Epochs):  70%|███████   | 90/128 [00:00<00:00, 298.68it/s][A
Epoch 202/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 291.80it/s]
Main Epoch (Outer Loop):  10%|█         | 202/2000 [04:28<39:06,  1.31s/it]
Epoch 203/2

Epoch 300/2000, Loss: -0.0008, Ratio: 0.99944, Entropy Term: 0.09965
Average reward per episode in batch: -0.09



Epoch 301/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 301/2000 (Inner K-Epochs):  26%|██▌       | 33/128 [00:00<00:00, 320.91it/s][A
Epoch 301/2000 (Inner K-Epochs):  52%|█████▏    | 66/128 [00:00<00:00, 317.99it/s][A
Epoch 301/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 311.68it/s]
Main Epoch (Outer Loop):  15%|█▌        | 301/2000 [06:44<37:07,  1.31s/it]
Epoch 302/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 302/2000 (Inner K-Epochs):  26%|██▌       | 33/128 [00:00<00:00, 322.90it/s][A
Epoch 302/2000 (Inner K-Epochs):  52%|█████▏    | 66/128 [00:00<00:00, 307.54it/s][A
Epoch 302/2000 (Inner K-Epochs):  76%|███████▌  | 97/128 [00:00<00:00, 265.20it/s][A
Epoch 302/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 259.30it/s]
Main Epoch (Outer Loop):  15%|█▌        | 302/2000 [06:45<37:08,  1.31s/it]
Epoch 303/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 303/2000 (Inner

Epoch 400/2000, Loss: -0.0016, Ratio: 0.99964, Entropy Term: 0.08793
Average reward per episode in batch: -0.07



Epoch 401/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 401/2000 (Inner K-Epochs):  15%|█▍        | 19/128 [00:00<00:00, 188.07it/s][A
Epoch 401/2000 (Inner K-Epochs):  33%|███▎      | 42/128 [00:00<00:00, 210.36it/s][A
Epoch 401/2000 (Inner K-Epochs):  52%|█████▏    | 67/128 [00:00<00:00, 226.92it/s][A
Epoch 401/2000 (Inner K-Epochs):  70%|███████   | 90/128 [00:00<00:00, 218.91it/s][A
Epoch 401/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 222.00it/s]
Main Epoch (Outer Loop):  20%|██        | 401/2000 [09:00<39:12,  1.47s/it]
Epoch 402/2000 (Inner K-Epochs):   0%|          | 0/128 [00:00<?, ?it/s][A
Epoch 402/2000 (Inner K-Epochs):  25%|██▌       | 32/128 [00:00<00:00, 311.86it/s][A
Epoch 402/2000 (Inner K-Epochs):  50%|█████     | 64/128 [00:00<00:00, 308.43it/s][A
Epoch 402/2000 (Inner K-Epochs):  74%|███████▍  | 95/128 [00:00<00:00, 306.01it/s][A
Epoch 402/2000 (Inner K-Epochs): 100%|██████████| 128/128 [00:00<00:00, 299.00it/s]
M

Took 32 minutes to run on the CPU.

Parameters:


epochs=2000,
        learning_rate=0.0003,
        batch_size=2048, # Significantly larger batch size recommended for stability
        k_epochs=128,
        epsilon=0.2,
        beta_kl=0.01,
        entropy_coeff=0.001,
        log_iterations=100,
        gamma=0.99

In [None]:
# Fresh single environment for inspecting the trained policy's decisions.
# "rgb_array" is the off-screen render mode name that Blackjack-v1 declares
# ("rgb" is not one of its render modes).
test_env = gym.make("Blackjack-v1", render_mode="rgb_array", sab=True)
total_test_rewards = 0

NameError: name 'gym' is not defined

In [None]:
# Number of episodes to play (and print) in the inspection loop below.
num_test_episodes = 10

In [None]:
# Play a few episodes with the trained policy, printing each observation, the
# sampled action, and the final reward so individual decisions can be inspected.
# Observations are moved to whichever device the policy's weights live on, so
# the loop also works for a policy that was trained on the GPU.
policy_device = next(trained_policy.parameters()).device

for episode in range(num_test_episodes):
    print(f"Resetting env for episode: {episode}")
    obs, _ = test_env.reset()
    done = False
    truncated = False
    episode_reward = 0
    while not done and not truncated:
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(policy_device)
        with torch.no_grad():
            logits = trained_policy(obs_tensor)
            dist = torch.distributions.Categorical(logits=logits)
            action = dist.sample()
            # Print CPU copies so the log looks the same regardless of device.
            print(f"obs_tensor: {obs_tensor.cpu()} || Action taken: {action.cpu()}")
        obs, reward, done, truncated, _ = test_env.step(action.item())
        episode_reward += reward
        if truncated:
            print("truncated")
    print(f"Reward: {episode_reward} || Final Observation: {obs}")

Resetting env for episode: 0
obs_tensor: tensor([[13.,  2.,  0.]]) || Action taken: tensor([1])
obs_tensor: tensor([[14.,  2.,  0.]]) || Action taken: tensor([1])
Reward: -1.0 || Final Observation: (23, 2, 0)
Resetting env for episode: 1
obs_tensor: tensor([[10., 10.,  0.]]) || Action taken: tensor([1])
obs_tensor: tensor([[20., 10.,  0.]]) || Action taken: tensor([0])
Reward: 1.0 || Final Observation: (20, 10, 0)
Resetting env for episode: 2
obs_tensor: tensor([[18., 10.,  0.]]) || Action taken: tensor([0])
Reward: -1.0 || Final Observation: (18, 10, 0)
Resetting env for episode: 3
obs_tensor: tensor([[12., 10.,  0.]]) || Action taken: tensor([1])
Reward: -1.0 || Final Observation: (22, 10, 0)
Resetting env for episode: 4
obs_tensor: tensor([[21.,  9.,  1.]]) || Action taken: tensor([0])
Reward: 1.0 || Final Observation: (21, 9, 1)
Resetting env for episode: 5
obs_tensor: tensor([[19.,  2.,  0.]]) || Action taken: tensor([0])
Reward: 1.0 || Final Observation: (19, 2, 0)
Resetting env 

: 

In [None]:
# Release the resources held by the environment once it is no longer needed.
env.close()

Currently, the final state, which would reveal the hand the dealer ended up with, is not shown. Accessing the dealer's final hand, or adding custom logging inside the environment, would provide the information needed to understand why each reward was given.