**Part 1**

* Attempting to use Softmax (Categorical Distribution) implementation instead of Sigmoid (Binary Bernoulli Distribution)

# Imports

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
# import random
import numpy as np
from tqdm import tqdm
import gymnasium as gym

# Testing

In [2]:
env = gym.make("Blackjack-v1", sab=True) # # `sab=True` uses the Sutton & Barto version

In [None]:
# Reset the Environment, and get an observation
obs, _ = env.reset()

Observation Space
* player_sum: The sum of the player's cards (integer between 4 and 21+).
* dealer_card: The value of the dealer's visible card (1–10).
* usable_ace: True if the player has a usable ace (counts as 11), otherwise False.

In [10]:
print(obs)

(20, 10, 0)


In [5]:
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)

* obs: New observation after the action.
* reward: Final reward: +1 for win, 0 for draw, -1 for loss.
* done: Whether the episode has ended.
* truncated: Whether the episode was truncated (usually False here).
* info: Extra info (often empty in Blackjack).

In [6]:
print(reward)

-1.0


In [11]:
env.action_space

Discrete(2)

The Blackjack action space is Discrete(2):
* 0 = Stick
* 1 = Hit

# Agent

In [7]:
class BlackJackAgent(nn.Module):
    def __init__(self, obs_size=3, hidden_size=10, output_size=2):
        super(BlackJackAgent, self).__init__()
        self.layer_1 = nn.Linear(obs_size, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, output_size)
        self.action_probs_activation_layer = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = torch.relu(self.layer_1(x))
        logits = self.layer_2(x)
        return logits       # later use nn.Softmax to get probabilities

    def get_action_probs(self, logits):
        """Get the probabilities of each action."""
        return self.action_probs_activation_layer(logits)
    
    def sample_action(self, action:None):
        """Get the probability of choosing the action"""
        logits = self.forward(action)
        probs = self.get_action_probs(logits)
        dist = torch.distributions.Categorical(probs=probs)
        action = dist.sample().item()
        prob_of_action = dist.log_prob(action)
        return action, prob_of_action

# Training Loop

In [22]:
def training_blackjack_agent(epochs=1000, learning_rate=0.0001, batch_size=64, gamma=0.99, k_epochs=64, epsilon=0.2, beta_kl=0.01, max_grad_norm=0.5, entropy_coeff=0.01, log_iterations=10) -> BlackJackAgent: 
    print(f"Training BlackJack Agent's Policy with {epochs} epochs, {learning_rate} learning rate, batch size {batch_size}, and KL beta {beta_kl}.")
    env = gym.make("Blackjack-v1", sab=True) # # `sab=True` uses the Sutton & Barto version
    New_Policy = BlackJackAgent()   # STEP 3 || 
    optimizer = optim.Adam(params=New_Policy.parameters(), lr=learning_rate)
    # num_correct = 0.0

    for epoch in range(epochs):     # STEP 4 || 
        # STEP 3 || CREATE REFERENCE MODEL OMITTED
        batch_trajectories = []     # Will contain a batch of trajectories

        # STEP 5 || Sample a batch D_b from D --> OMITTED 
        # STEP 6 || Update the old policy model PI old <- PI new
        Policy_Old = BlackJackAgent()
        Policy_Old.load_state_dict(New_Policy.state_dict())
        Policy_Old.eval()   # Prevent Gradient tracking

        # --- STEP 7 Collect a Batch of Experiences ---
        # Loop Agent prediction, recording trajectories to lists:
        for i in range(batch_size):
            
            # Create local trajectory library
            episode_trajectory = {"states": [], "actions": [], "rewards": [], "log_probs": []}
            obs, _ = env.reset()
            done, truncated = False, False
            while not done and not truncated:
                obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) # add batch dim to feed to NN
                logits = Policy_Old(obs_tensor)
                dist = torch.distributions.Categorical(logits=logits)
                action = dist.sample() # Tensor of shape [1]
                log_prob = dist.log_prob(action)
                next_obs, reward, done, truncated, info = env.step(action.item())

                # Store episode_Trajectory
                episode_trajectory["states"].append(obs)
                episode_trajectory["actions"].append(action.item())
                episode_trajectory["rewards"].append(reward)
                episode_trajectory["log_probs"].append(log_prob)
                
                obs = next_obs  # Update the observation

            batch_trajectories.append(episode_trajectory)

            print(f"Batch of Trajectories at current epoch:{epoch}:\n{batch_trajectories}")


            # These lists will hold data from ALL episodes in the current batch for Advantage Calculation
            all_states = []
            all_actions = []
            all_old_log_probs = []
            all_discounted_rewards = []

            # STEP 8 || Calculate Discounted Rewards
            for episode_trajectory in batch_trajectories:
                rewards = episode_trajectory["rewards"]
                states = episode_trajectory["states"]
                actions = episode_trajectory["actions"]
                log_probs = episode_trajectory["log_probs"]
                
                discounted_reward = 0
                rewards = []
                for reward in reversed(rewards):
                    discounted_reward = reward + gamma * discounted_reward
                    rewards.insert(0, discounted_reward)
                discounted_rewards = torch.tensor(rewards, dtype=torch.float32)

                # Add each trajectory information for the batch
                all_states.extend(states)
                all_actions.extend(actions)
                all_old_log_probs.extend(log_probs)
                all_discounted_rewards.extend(discounted_rewards.tolist())

            # Convert all collected batch data into PyTorch tensors
            all_states_tensor = torch.tensor(all_states, dtype=torch.float32)
            all_actions_tensor = torch.tensor(all_actions, dtype=torch.long)
            # Stack individual log_prob tensors and then flatten if necessary
            all_old_log_probs_tensor = torch.cat(all_old_log_probs).squeeze(-1) # Ensure it's a 1D tensor
            all_discounted_rewards_tensor = torch.tensor(all_discounted_rewards, dtype=torch.float32)

            # STEP 9 || Calculate the Advantage of each Time Step for each Trajectory using normalization
            all_advantages_tensor = (all_discounted_rewards_tensor - all_discounted_rewards_tensor.mean()) / (all_discounted_rewards_tensor.std() + 1e-8)

            # --- STEP 10 || GRPO Optimization ---
            for k_epoch in tqdm(range(k_epochs), desc=f"Epoch {epoch+1}/{epochs} (Inner K-Epochs)", leave=False):
                new_logits = New_Policy(all_states_tensor)
                new_dist = torch.distributions.Categorical(logits=new_logits)
                new_log_probs = new_dist.log_prob(all_actions_tensor)
                entropy = new_dist.entropy().mean() # Calculate entropy for regularization

                R1_ratio = torch.exp(new_log_probs - all_old_log_probs_tensor)

                unclipped_surrogate = R1_ratio * all_advantages_tensor
                clipped_surrogate = torch.clamp(input=R1_ratio, min=1.0-epsilon, max=1.0+epsilon) * all_advantages_tensor

                policy_loss = -torch.min(unclipped_surrogate, clipped_surrogate).mean()

                # --- KL Divergence Calculation ---
                # Create distributions for old policies using the trajectory states
                with torch.no_grad():
                    old_logits = Policy_Old(all_states_tensor)
                old_dist = torch.distributions.Categorical(logits=old_logits)

                # Calculate KL divergence per sample, then take the mean over the batch
                kl_div_per_sample = torch.distributions.kl.kl_divergence(p=new_dist, q=old_dist)
                kl_loss = kl_div_per_sample.mean() # Mean over the batch

                # Total Loss for GRPO
                total_loss = policy_loss + beta_kl * kl_loss - entropy_coeff * entropy

                # STEP 11 || Policy Updates
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(New_Policy.parameters(), max_grad_norm)
                optimizer.step()    # Update policy parameters using gradient ascent
            
            
            # --- 4. Logging and Evaluation ---
            if (epoch + 1) % log_iterations == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}, Ratio: {R1_ratio:.5f}, Entropy Term: {entropy:.5f}")
                # You can add more evaluation metrics here, e.g., average reward per episode
                # For Blackjack, the reward is often -1, 0, or 1.
                avg_reward = sum(sum(ep["rewards"]) for ep in batch_trajectories) / batch_size
                print(f"Average reward per episode in batch: {avg_reward:.2f}")

        New_Policy.eval()


    env.close() # Close the environment after training
    print("Training complete.")
    return New_Policy # Return the trained policy

In [23]:
_ = training_blackjack_agent()

  all_advantages_tensor = (all_discounted_rewards_tensor - all_discounted_rewards_tensor.mean()) / (all_discounted_rewards_tensor.std() + 1e-8)


Training BlackJack Agent's Policy with 1000 epochs, 0.0001 learning rate, batch size 64, and KL beta 0.01.
Batch of Trajectories at current epoch:0:
[{'states': [(14, 4, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.0601], grad_fn=<SqueezeBackward1>)]}]


                                                                     

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

Training BlackJack Agent's Policy with 10 epochs, 0.0001 learning rate, batch size 4, and KL beta 0.01.
* Batch of Trajectories:
* [{'states': [(12, 10, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1239])]}, 
* {'states': [(20, 7, 0)], 'actions': [0], 'rewards': [1.0], 'log_probs': [tensor([-0.0815])]}, 
* {'states': [(12, 1, 0), (17, 1, 0)], 'actions': [1, 1], 'rewards': [0.0, -1.0], 'log_probs': [tensor([-1.5968]), tensor([-1.9474])]}, 
* {'states': [(6, 6, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.2144])]}, 
* {'states': [(7, 4, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.2734])]}, 
* {'states': [(13, 3, 1)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1471])]}, 
* {'states': [(15, 10, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1000])]}, 
* {'states': [(12, 10, 0)], 'actions': [0], 'rewards': [1.0], 'log_probs': [tensor([-0.1239])]}, 
* {'states': [(14, 7, 0)], 'actions': [0], 'rewards': [-1.0], 'log_probs': [tensor([-0.1320])]}]