**Part 2**

* Adding Device Agnostic code (GPU Training)
* Attempting to use Softmax (Categorical Distribution) implementation instead of Sigmoid (Binary Bernoulli Distribution)

**Results**

Training took far too long to run because of the constant GPU/CPU switching.

* High delay when transferring from CPU to GPU and vice versa.
* Look into vectorized environments

# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import gymnasium as gym

# Testing

In [2]:
# Create the Blackjack environment used for quick manual testing.
# `sab=True` uses the Sutton & Barto version.
# Passing `render_mode="human"` would create a pygame popup window to analyze play.
env = gym.make("Blackjack-v1", sab=True)

# Agent

In [3]:
class BlackJackAgent(nn.Module):
    """Two-layer policy network that maps an observation to action logits.

    Args:
        obs_size: Number of features in one observation.
        hidden_size: Width of the single hidden layer.
        output_size: Number of discrete actions (one logit per action).
    """

    def __init__(self, obs_size=3, hidden_size=10, output_size=2):
        super().__init__()
        self.layer_1 = nn.Linear(obs_size, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, output_size)
        # Normalize over the last (action) dimension so that both batched
        # `[batch, actions]` and unbatched `[actions]` logits are accepted.
        self.action_probs_activation_layer = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the raw (unnormalized) action logits for observation(s) `x`."""
        x = torch.relu(self.layer_1(x))
        logits = self.layer_2(x)
        return logits       # use `get_action_probs` to turn these into probabilities

    def get_action_probs(self, logits):
        """Get the probabilities of each action."""
        return self.action_probs_activation_layer(logits)

    def sample_action(self, action: torch.Tensor):
        """Sample one action for a single observation.

        Args:
            action: Despite its name (kept for backward compatibility), this is
                the observation tensor fed to the network. It must describe
                exactly one observation, i.e. shape `[1, obs_size]` or
                `[obs_size]`, because the sampled action is converted to a
                Python int.

        Returns:
            A tuple `(action, log_prob)` where `action` is the sampled action
            index as an `int` and `log_prob` is a tensor holding the
            log-probability of that action (still attached to the graph).
        """
        logits = self.forward(action)
        probs = self.get_action_probs(logits)
        dist = torch.distributions.Categorical(probs=probs)
        # `log_prob` needs the sampled tensor, so convert to a Python int only
        # after the log-probability has been computed.
        sampled_action = dist.sample()
        prob_of_action = dist.log_prob(sampled_action)
        return sampled_action.item(), prob_of_action

# Training Loop

In [None]:
def _discounted_returns(rewards, gamma):
    """Return the discounted return of every step of one episode, in step order."""
    returns = []
    running_return = 0.0
    # Walk backwards so each step reuses the return of the step that follows it.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()
    return returns


def _collect_episode(env, policy, device):
    """Play one episode with `policy` and record its states, actions, rewards and log-probs."""
    episode_trajectory = {"states": [], "actions": [], "rewards": [], "log_probs": []}
    obs, _ = env.reset()
    done, truncated = False, False
    while not done and not truncated:
        # Add a batch dimension so the observation can be fed to the network.
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = policy(obs_tensor)
            dist = torch.distributions.Categorical(logits=logits)
            action = dist.sample()  # Tensor of shape [1]
            log_prob = dist.log_prob(action)

        # Read the action back once; every `.item()` on an accelerator tensor
        # forces a device-to-host synchronization.
        action_id = action.item()
        next_obs, reward, done, truncated, _ = env.step(action_id)

        episode_trajectory["states"].append(obs)
        episode_trajectory["actions"].append(action_id)
        episode_trajectory["rewards"].append(reward)
        episode_trajectory["log_probs"].append(log_prob)

        obs = next_obs
        if truncated:
            print("Debug: EPISODE TRUNCATED")
    return episode_trajectory


def training_blackjack_agent(
    epochs=50,
    learning_rate=0.0001,
    batch_size=64,
    gamma=0.99,
    k_epochs=64,
    epsilon=0.2,
    beta_kl=0.01,
    max_grad_norm=0.5,
    entropy_coeff=0.01,
    log_iterations=10,
    device="cpu",
) -> BlackJackAgent:
    """Train a `BlackJackAgent` policy on the Blackjack-v1 environment.

    Each outer epoch snapshots the current policy, plays `batch_size` episodes
    with the snapshot, turns the rewards into normalized discounted returns and
    then runs `k_epochs` gradient steps on a clipped-ratio surrogate loss with
    a KL penalty towards the snapshot and an entropy bonus.

    Args:
        epochs: Number of outer iterations (collect a batch, then optimize).
        learning_rate: Learning rate of the Adam optimizer.
        batch_size: Number of episodes collected per outer epoch.
        gamma: Discount factor used for the per-step returns.
        k_epochs: Number of gradient steps taken on each collected batch.
        epsilon: Clipping range of the probability ratio (`1 - epsilon` to `1 + epsilon`).
        beta_kl: Weight of the KL(new || old) penalty in the loss.
        max_grad_norm: Maximum gradient norm passed to gradient clipping.
        entropy_coeff: Weight of the entropy bonus subtracted from the loss.
        log_iterations: Print statistics every `log_iterations` outer epochs.
        device: Device on which both policies and the batch tensors live.

    Returns:
        The trained policy, switched to eval mode.
    """
    print(f"Training BlackJack Agent's Policy on {device} with {epochs} epochs, {learning_rate} learning rate, batch size {batch_size}, and KL beta {beta_kl}.")

    env = gym.make("Blackjack-v1", sab=True)  # `sab=True` uses the Sutton & Barto version
    try:
        New_Policy = BlackJackAgent().to(device)   # STEP 3 ||
        optimizer = optim.Adam(params=New_Policy.parameters(), lr=learning_rate)

        for epoch in tqdm(range(epochs), desc="Main Epoch (Outer Loop)", leave=False):     # STEP 4 ||
            # STEP 3 || CREATE REFERENCE MODEL OMITTED
            # STEP 5 || Sample a batch D_b from D --> OMITTED
            # STEP 6 || Update the old policy model PI old <- PI new
            Policy_Old = BlackJackAgent().to(device)
            Policy_Old.load_state_dict(New_Policy.state_dict())
            # eval() only switches layer modes; gradient tracking is disabled by
            # the `torch.no_grad()` blocks around every use of the old policy.
            Policy_Old.eval()

            # --- STEP 7 Collect a Batch of Experiences ---
            batch_trajectories = [
                _collect_episode(env, Policy_Old, device) for _ in range(batch_size)
            ]

            # STEP 8 || Calculate Discounted Rewards and flatten the batch
            all_states = []
            all_actions = []
            all_old_log_probs = []
            all_discounted_rewards = []
            for episode_trajectory in batch_trajectories:
                all_states.extend(episode_trajectory["states"])
                all_actions.extend(episode_trajectory["actions"])
                all_old_log_probs.extend(episode_trajectory["log_probs"])
                all_discounted_rewards.extend(
                    _discounted_returns(episode_trajectory["rewards"], gamma)
                )

            # Convert all collected batch data into PyTorch tensors. They are built
            # from plain Python data or under no_grad, so they carry no autograd
            # history and need no explicit detach before the K-epoch updates.
            all_states_tensor = torch.tensor(all_states, dtype=torch.float32).to(device)
            all_actions_tensor = torch.tensor(all_actions, dtype=torch.long).to(device)
            # Each stored log-prob has shape [1]; concatenating yields a 1D tensor.
            all_old_log_probs_tensor = torch.cat(all_old_log_probs).squeeze(-1).to(device)
            all_discounted_rewards_tensor = torch.tensor(all_discounted_rewards, dtype=torch.float32).to(device)

            # STEP 9 || Calculate the Advantage of each Time Step using normalization
            centered_returns = all_discounted_rewards_tensor - all_discounted_rewards_tensor.mean()
            if all_discounted_rewards_tensor.numel() > 1:
                all_advantages_tensor = centered_returns / (all_discounted_rewards_tensor.std() + 1e-8)
            else:
                # The sample std of a single value is NaN and would poison every
                # weight on the first update; fall back to the centered return.
                all_advantages_tensor = centered_returns

            # The old policy is frozen for the whole batch, so its distribution
            # is computed once instead of once per K-epoch.
            with torch.no_grad():
                old_logits = Policy_Old(all_states_tensor)
            old_dist = torch.distributions.Categorical(logits=old_logits)

            New_Policy.train()  # Prepare NN for updates

            # --- STEP 10 || GRPO Optimization ---
            for k_epoch in tqdm(range(k_epochs), desc=f"Epoch {epoch+1}/{epochs} (Inner K-Epochs)", leave=True):
                new_logits = New_Policy(all_states_tensor)
                new_dist = torch.distributions.Categorical(logits=new_logits)
                new_log_probs = new_dist.log_prob(all_actions_tensor)
                entropy = new_dist.entropy().mean()  # Entropy for regularization

                R1_ratio = torch.exp(new_log_probs - all_old_log_probs_tensor)

                unclipped_surrogate = R1_ratio * all_advantages_tensor
                clipped_surrogate = torch.clamp(input=R1_ratio, min=1.0-epsilon, max=1.0+epsilon) * all_advantages_tensor

                policy_loss = -torch.min(unclipped_surrogate, clipped_surrogate).mean()

                # KL divergence per sample between new and old policy, averaged over the batch
                kl_div_per_sample = torch.distributions.kl.kl_divergence(p=new_dist, q=old_dist)
                kl_loss = kl_div_per_sample.mean()

                # Total Loss for GRPO
                total_loss = policy_loss + beta_kl * kl_loss - entropy_coeff * entropy

                # STEP 11 || Policy Updates
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(New_Policy.parameters(), max_grad_norm)
                # Minimizing the negated surrogate maximizes the surrogate objective.
                optimizer.step()

            # --- 4. Logging and Evaluation ---
            if (epoch + 1) % log_iterations == 0:
                # Loss, ratio and entropy are those of the last inner K-epoch.
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}, Ratio: {R1_ratio.mean().item():.5f}, Entropy Term: {entropy:.5f}")
                avg_reward = sum(sum(ep["rewards"]) for ep in batch_trajectories) / batch_size
                print(f"Average reward per episode in batch: {avg_reward:.2f}")

        New_Policy.eval()   # Change to eval mode for evaluation
    finally:
        # Always release the environment, even when training is interrupted.
        env.close()

    print("Training complete.")
    return New_Policy  # Return the trained policy

In [33]:
# Baseline run: train with every default hyperparameter (the device defaults to "cpu").
_ = training_blackjack_agent()

Training BlackJack Agent's Policy on cpu with 50 epochs, 0.0001 learning rate, batch size 64, and KL beta 0.01.


Epoch 1/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 607.30it/s]
Epoch 2/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 602.34it/s]
Epoch 3/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 670.68it/s]
Epoch 4/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 652.80it/s]
Epoch 5/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 672.69it/s]
Epoch 6/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 678.68it/s]
Epoch 7/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 679.65it/s]
Epoch 8/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 594.22it/s]
Epoch 9/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 732.53it/s]
Epoch 10/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 633.22it/s]
Main Epoch (Outer Loop):  20%|██        | 10/50 [00:01<00:04,  8.41it/s]

Epoch 10/50, Loss: -0.0342, Ratio: 1.01189, Entropy Term: 0.43756
Average reward per episode in batch: -0.81


Epoch 11/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 616.12it/s]
Epoch 12/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 674.66it/s]
Epoch 13/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 651.96it/s]
Epoch 14/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 712.00it/s]
Epoch 15/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 696.99it/s]
Epoch 16/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 703.84it/s]
Epoch 17/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 668.70it/s]
Epoch 18/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 657.33it/s]
Epoch 19/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 726.12it/s]
Epoch 20/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 672.24it/s]
Main Epoch (Outer Loop):  40%|████      | 20/50 [00:02<00:03,  9.18it/s]

Epoch 20/50, Loss: -0.0081, Ratio: 0.99732, Entropy Term: 0.34493
Average reward per episode in batch: -0.53


Epoch 21/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 621.21it/s]
Epoch 22/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 609.00it/s]
Epoch 23/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 629.88it/s]
Epoch 24/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 604.21it/s]
Epoch 25/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 693.77it/s]
Epoch 26/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 747.40it/s]
Epoch 27/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 594.10it/s]
Epoch 28/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 617.33it/s]
Epoch 29/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 654.89it/s]
Epoch 30/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 673.82it/s]
Main Epoch (Outer Loop):  60%|██████    | 30/50 [00:03<00:02,  8.90it/s]

Epoch 30/50, Loss: -0.0047, Ratio: 0.99824, Entropy Term: 0.25626
Average reward per episode in batch: -0.34


Epoch 31/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 645.77it/s]
Epoch 32/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 764.95it/s]
Epoch 33/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 654.60it/s]
Epoch 34/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 613.97it/s]
Epoch 35/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 654.14it/s]
Epoch 36/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 617.80it/s]
Epoch 37/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 618.29it/s]
Epoch 38/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 625.57it/s]
Epoch 39/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 652.24it/s]
Epoch 40/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 659.58it/s]
Main Epoch (Outer Loop):  80%|████████  | 40/50 [00:04<00:01,  8.76it/s]

Epoch 40/50, Loss: -0.0114, Ratio: 0.99346, Entropy Term: 0.10141
Average reward per episode in batch: 0.08


Epoch 41/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 582.63it/s]
Epoch 42/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 636.18it/s]
Epoch 43/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 590.60it/s]
Epoch 44/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 628.22it/s]
Epoch 45/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 595.57it/s]
Epoch 46/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 657.62it/s]
Epoch 47/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 638.46it/s]
Epoch 48/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 607.83it/s]
Epoch 49/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 627.26it/s]
Epoch 50/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 645.98it/s]
                                                                        

Epoch 50/50, Loss: -0.0028, Ratio: 0.99890, Entropy Term: 0.05628
Average reward per episode in batch: -0.22
Training complete.




In [21]:
# Same default hyperparameters as the baseline run, but with the models and batch tensors on the GPU.
_ = training_blackjack_agent(device="cuda")

Training BlackJack Agent's Policy on cuda with 50 epochs, 0.0001 learning rate, batch size 64, and KL beta 0.01.


Epoch 1/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 299.76it/s]
Epoch 2/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 352.32it/s]
Epoch 3/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 330.28it/s]
Epoch 4/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 353.04it/s]
Epoch 5/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 319.99it/s]
Epoch 6/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 324.98it/s]
Epoch 7/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 313.28it/s]
Epoch 8/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 299.54it/s]
Epoch 9/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 314.98it/s]
Epoch 10/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 311.24it/s]
Main Epoch (Outer Loop):  20%|██        | 10/50 [00:03<00:12,  3.17it/s]

Epoch 10/50, Loss: -0.0004, Ratio: 1.00033, Entropy Term: 0.01681
Average reward per episode in batch: -1.00


Epoch 11/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 307.76it/s]
Epoch 12/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 319.51it/s]
Epoch 13/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 324.89it/s]
Epoch 14/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 306.38it/s]
Epoch 15/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 319.08it/s]
Epoch 16/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 303.61it/s]
Epoch 17/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 318.14it/s]
Epoch 18/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 308.87it/s]
Epoch 19/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 288.70it/s]
Epoch 20/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 281.01it/s]
Main Epoch (Outer Loop):  40%|████      | 20/50 [00:06<00:09,  3.00it/s]

Epoch 20/50, Loss: -0.0001, Ratio: 1.00005, Entropy Term: 0.00639
Average reward per episode in batch: -1.00


Epoch 21/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 287.67it/s]
Epoch 22/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 307.63it/s]
Epoch 23/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 281.05it/s]
Epoch 24/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 286.78it/s]
Epoch 25/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 315.25it/s]
Epoch 26/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 311.94it/s]
Epoch 27/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 332.44it/s]
Epoch 28/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 313.90it/s]
Epoch 29/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 302.64it/s]
Epoch 30/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 307.65it/s]
Main Epoch (Outer Loop):  60%|██████    | 30/50 [00:09<00:06,  2.93it/s]

Epoch 30/50, Loss: -0.0001, Ratio: 1.00003, Entropy Term: 0.00348
Average reward per episode in batch: -1.00


Epoch 31/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 337.54it/s]
Epoch 32/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 292.97it/s]
Epoch 33/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 298.01it/s]
Epoch 34/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 291.45it/s]
Epoch 35/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 299.27it/s]
Epoch 36/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 282.37it/s]
Epoch 37/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 330.69it/s]
Epoch 38/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 306.17it/s]
Epoch 39/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 285.39it/s]
Epoch 40/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 324.32it/s]
Main Epoch (Outer Loop):  80%|████████  | 40/50 [00:13<00:03,  3.07it/s]

Epoch 40/50, Loss: -0.0001, Ratio: 1.00003, Entropy Term: 0.00336
Average reward per episode in batch: -1.00


Epoch 41/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 319.75it/s]
Epoch 42/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 337.10it/s]
Epoch 43/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 318.36it/s]
Epoch 44/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 338.02it/s]
Epoch 45/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 325.59it/s]
Epoch 46/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 326.61it/s]
Epoch 47/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 303.96it/s]
Epoch 48/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 347.19it/s]
Epoch 49/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 280.39it/s]
Epoch 50/50 (Inner K-Epochs): 100%|██████████| 64/64 [00:00<00:00, 314.26it/s]
                                                                        

Epoch 50/50, Loss: -0.0000, Ratio: 1.00003, Entropy Term: 0.00177
Average reward per episode in batch: -1.00
Training complete.




Training BlackJack Agent's Policy with 10 epochs, 0.0001 learning rate, batch size 4, and KL beta 0.01.
* Batch of Trajectories:
* [{'states': [(12, 10, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1239])]}, 
* {'states': [(20, 7, 0)], 'actions': [0], 'rewards': [1.0], 'log_probs': [tensor([-0.0815])]}, 
* {'states': [(12, 1, 0), (17, 1, 0)], 'actions': [1, 1], 'rewards': [0.0, -1.0], 'log_probs': [tensor([-1.5968]), tensor([-1.9474])]}, 
* {'states': [(6, 6, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.2144])]}, 
* {'states': [(7, 4, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.2734])]}, 
* {'states': [(13, 3, 1)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1471])]}, 
* {'states': [(15, 10, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1000])]}, 
* {'states': [(12, 10, 0)], 'actions': [0], 'rewards': [1.0], 'log_probs': [tensor([-0.1239])]}, 
* {'states': [(14, 7, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1320])]}]

In [16]:
# Example usage (assuming you have a way to call this function, e.g., in a main block)
# Script entry point: train a policy, then estimate its average reward on fresh episodes.
if __name__ == '__main__':
    # Device-agnostic setup: use CUDA when it is available, otherwise the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Long training run; a large batch is used for more stable updates.
    trained_policy = training_blackjack_agent(
        device=device,
        epochs=2000,
        k_epochs=128,
        batch_size=2048,
        learning_rate=0.0003,
        gamma=0.99,
        epsilon=0.2,
        beta_kl=0.01,
        entropy_coeff=0.001,
        log_iterations=100,
    )

    print("\nTesting the trained policy:")
    test_env = gym.make("Blackjack-v1", sab=True)
    num_test_episodes = 1000
    total_test_rewards = 0

    for _ in range(num_test_episodes):
        obs, _ = test_env.reset()
        done = truncated = False
        episode_reward = 0
        # Play until the episode terminates or is truncated.
        while not (done or truncated):
            obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                # Actions are sampled from the policy distribution (not taken greedily).
                logits = trained_policy(obs_tensor)
                dist = torch.distributions.Categorical(logits=logits)
                action = dist.sample()
            obs, reward, done, truncated, _ = test_env.step(action.item())
            episode_reward += reward
        total_test_rewards += episode_reward

    print(f"Average reward over {num_test_episodes} test episodes: {total_test_rewards / num_test_episodes:.4f}")
    test_env.close()

Training BlackJack Agent's Policy on cuda with 2000 epochs, 0.0003 learning rate, batch size 2048, and KL beta 0.01.


Main Epoch (Outer Loop):   5%|▌         | 100/2000 [05:15<1:45:36,  3.34s/it]

Epoch 100/2000, Loss: -0.0027, Ratio: 0.99872, Entropy Term: 0.13661
Average reward per episode in batch: -0.05


                                                                             

KeyboardInterrupt: 

took 32 minutes to run using the CPU

Parameters: 


epochs=2000,
        learning_rate=0.0003,
        batch_size=2048, # Significantly larger batch size recommended for stability
        k_epochs=128,
        epsilon=0.2,
        beta_kl=0.01,
        entropy_coeff=0.001,
        log_iterations=100,
        gamma=0.99

In [None]:
# Fresh environment for the step-by-step evaluation below.
# Blackjack-v1 only offers the "human" and "rgb_array" render modes, so the
# unsupported "rgb" value is replaced by "rgb_array".
test_env = gym.make("Blackjack-v1", render_mode="rgb_array", sab=True)
total_test_rewards = 0

NameError: name 'gym' is not defined

In [None]:
# Number of episodes played by the evaluation loop that follows.
num_test_episodes = 10

In [None]:
# Play `num_test_episodes` episodes with the trained policy, printing every
# decision so individual hands can be inspected.
# Observations are created on the device that holds the policy's parameters, so
# a policy trained on CUDA does not fail with a device-mismatch error.
policy_device = next(trained_policy.parameters()).device

for episode in range(num_test_episodes):
    print(f"Resetting env for episode: {episode}")
    obs, _ = test_env.reset()
    done = False
    truncated = False
    episode_reward = 0
    while not done and not truncated:
        # Add a batch dimension before feeding the observation to the network.
        obs_tensor = torch.tensor(obs, dtype=torch.float32, device=policy_device).unsqueeze(0)
        with torch.no_grad():
            logits = trained_policy(obs_tensor)
            dist = torch.distributions.Categorical(logits=logits)
            action = dist.sample()
            print(f"obs_tensor: {obs_tensor} || Action taken: {action}")
        obs, reward, done, truncated, _ = test_env.step(action.item())
        episode_reward += reward
        if truncated:
            print("truncated")
    print(f"Reward: {episode_reward} || Final Observation: {obs}")

Resetting env for episode: 0
obs_tensor: tensor([[13.,  2.,  0.]]) || Action taken: tensor([1])
obs_tensor: tensor([[14.,  2.,  0.]]) || Action taken: tensor([1])
Reward: -1.0 || Final Observation: (23, 2, 0)
Resetting env for episode: 1
obs_tensor: tensor([[10., 10.,  0.]]) || Action taken: tensor([1])
obs_tensor: tensor([[20., 10.,  0.]]) || Action taken: tensor([0])
Reward: 1.0 || Final Observation: (20, 10, 0)
Resetting env for episode: 2
obs_tensor: tensor([[18., 10.,  0.]]) || Action taken: tensor([0])
Reward: -1.0 || Final Observation: (18, 10, 0)
Resetting env for episode: 3
obs_tensor: tensor([[12., 10.,  0.]]) || Action taken: tensor([1])
Reward: -1.0 || Final Observation: (22, 10, 0)
Resetting env for episode: 4
obs_tensor: tensor([[21.,  9.,  1.]]) || Action taken: tensor([0])
Reward: 1.0 || Final Observation: (21, 9, 1)
Resetting env for episode: 5
obs_tensor: tensor([[19.,  2.,  0.]]) || Action taken: tensor([0])
Reward: 1.0 || Final Observation: (19, 2, 0)
Resetting env 

: 

In [None]:
# Release the module-level `env`.
# NOTE(review): the evaluation cells step `test_env`, which this call does not
# close — confirm whether `test_env.close()` is needed here as well.
env.close()

Currently, the final state, which would reveal the hand the dealer ended up with, is not shown. Accessing the dealer's final hand, or adding custom logging inside the environment, would provide the information needed to understand why each reward was given.