**Part 1**

* Attempting to use a Softmax (categorical distribution) policy output instead of a Sigmoid (Bernoulli distribution) output.

# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
# import random
import numpy as np
from tqdm import tqdm
import gymnasium as gym

# Testing

In [2]:
env = gym.make("Blackjack-v1", sab=True) # `sab=True` uses the Sutton & Barto version

In [None]:
# Reset the environment to start a new episode and keep the initial observation;
# the second value returned by reset() is discarded.
obs, _ = env.reset()

Observation Space
* player_sum: The sum of the player's cards (integer between 4 and 21+).
* dealer_card: The value of the dealer's visible card (1–10).
* usable_ace: True if the player has a usable ace (counts as 11), otherwise False.

In [10]:
# Show the current observation tuple.
print(obs)

(20, 10, 0)


In [5]:
# Play one episode with uniformly random actions.
done = False
truncated = False
# Under the Gymnasium step API an episode ends when it is either terminated
# (`done` here) or truncated, so the loop has to check both flags.
while not (done or truncated):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)

* obs: New observation after the action.
* reward: Final reward: +1 for win, 0 for draw, -1 for loss.
* done: Whether the episode has terminated (Gymnasium names this flag `terminated`).
* truncated: Whether the episode was truncated (usually False here).
* info: Extra info (often empty in Blackjack).

In [6]:
# Show the reward returned by the most recent env.step() call.
print(reward)

-1.0


In [11]:
# Inspect the environment's action space (displayed as the cell's output).
env.action_space

Discrete(2)

The Blackjack action space is Discrete(2):
* 0 = Stick
* 1 = Hit

# Agent

In [None]:
class BlackJackAgent(nn.Module):
    """Small MLP policy that maps an observation to one logit per action.

    Args:
        obs_size: Number of features in an observation.
        hidden_size: Width of the single hidden layer.
        output_size: Number of discrete actions (one logit each).
    """

    def __init__(self, obs_size=3, hidden_size=10, output_size=2):
        # Was `__init()`, which raised AttributeError before any layer was built.
        super().__init__()
        self.layer_1 = nn.Linear(obs_size, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, output_size)
        # dim=-1 normalizes over the action axis for both batched (N, A) and
        # unbatched (A,) logits; for 2-D input it is identical to dim=1.
        self.action_probs_activation_layer = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return raw (unnormalized) action logits for observation tensor `x`."""
        x = torch.relu(self.layer_1(x))
        logits = self.layer_2(x)
        return logits       # apply get_action_probs() to turn these into probabilities

    def get_action_probs(self, logits):
        """Convert logits into a probability for each action via softmax."""
        return self.action_probs_activation_layer(logits)

    def sample_action(self, action: torch.Tensor):
        """Sample one action for a single observation.

        Args:
            action: Despite the name (kept so existing callers keep working),
                this is the observation tensor fed to the network. It must
                describe exactly one observation, e.g. shape (obs_size,) or
                (1, obs_size), because the sampled action is converted with
                `.item()`.

        Returns:
            tuple: `(action_index, log_prob)` where `action_index` is a Python
            int and `log_prob` is the tensor log-probability of that action
            under the current policy.
        """
        logits = self(action)
        probs = self.get_action_probs(logits)
        dist = torch.distributions.Categorical(probs=probs)
        # log_prob() needs the sampled tensor, so compute it before `.item()`.
        sampled = dist.sample()
        log_prob_of_action = dist.log_prob(sampled)
        return sampled.item(), log_prob_of_action

# Training Loop

In [None]:
def training_blackjack_agent(epochs=100, learning_rate=0.0001, batch_size=64, k_epochs=64, epsilon=0.2, beta_kl=0.01, max_grad_norm=0.5, entropy_coeff=0.5, log_iterations=10):
    """Roll out a freshly initialized Blackjack policy and record trajectories.

    For each epoch, `batch_size` episodes are played with actions sampled from
    the policy's categorical distribution. States, actions, rewards and
    log-probabilities for all steps in the epoch go into one trajectory dict.

    Args:
        epochs: Number of collection rounds.
        learning_rate: Learning rate handed to the Adam optimizer.
        batch_size: Number of episodes played per epoch.
        k_epochs, epsilon, beta_kl, max_grad_norm, entropy_coeff,
        log_iterations: Accepted but not used by the code shown here, apart
            from `beta_kl` appearing in the start-up message.
    """
    print(f"Training BlackJack Agent's Policy with {epochs} epochs, {learning_rate} learning rate, batch size {batch_size}, and KL beta {beta_kl}.")
    env = gym.make("Blackjack-v1", sab=True) # `sab=True` uses the Sutton & Barto version
    New_Policy = BlackJackAgent()
    # Adam expects an iterable of parameters; passing the module itself raises TypeError.
    optimizer = optim.Adam(params=New_Policy.parameters(), lr=learning_rate)
    num_correct = 0.0
    trajectories = []

    for epoch in range(epochs):
        # One trajectory dict per epoch, shared by all episodes in the batch.
        trajectory = {"states": [], "actions": [], "rewards": [], "log_probs": []}

        # --- 1. Collect a Batch of Experiences ---
        # Play `batch_size` episodes, recording every step in `trajectory`.
        for i in range(batch_size):

            obs, _ = env.reset()
            done = False
            while not done:
                obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) # add batch dim to feed to NN
                with torch.no_grad():
                    logits = New_Policy(obs_tensor)
                    dist = torch.distributions.Categorical(logits=logits)
                    action = dist.sample() # Tensor of shape [1]
                    log_prob = dist.log_prob(action)
                next_obs, reward, terminated, truncated, info = env.step(action.item())
                # The episode is over on either termination or truncation.
                done = terminated or truncated

                # Store the transition; the state is the one the action was chosen in.
                trajectory["states"].append(obs)
                trajectory["actions"].append(action.item())
                trajectory["rewards"].append(reward)
                trajectory["log_probs"].append(log_prob)

                # Advance to the new state so the next action is conditioned on it
                # (previously the policy kept seeing the initial observation).
                obs = next_obs

        # Append once per epoch: appending inside the episode loop stored
        # `batch_size` references to the same dict.
        trajectories.append(trajectory)


