## Actor Critic Reinforcement Learning Method
https://keras.io/examples/rl/actor_critic_cartpole/

In [10]:
# Setup
import keras
import tensorflow as tf
from keras import layers
import numpy as np
from blackjack_env import BlackjackEnv

# Custom blackjack environment
env = BlackjackEnv()

# Configuration parameters for the whole setup
gamma = 0.99  # Discount factor for past rewards
num_inputs = 2 # Player hand value, Dealer's visible card
num_actions = 2 # Hit or stay
num_hidden = 128

In [11]:
# Define the Actor-Critic Model
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

# Implement Actor Critic network
model = keras.Model(inputs=inputs, outputs=[action, critic])
optimizer = keras.optimizers.Adam(learning_rate=0.001)
huber_loss = keras.losses.Huber()

In [12]:
# Training Loop
running_reward = 0
episode_count = 0

MAX_HANDS = 1000 # Stops after 1,000 episodes
epsilon = 0.1 # Exploration factor

# Track actor and critic losses
actor_losses_history = []
critic_losses_history = []

for hand_count in range(MAX_HANDS):
    state = env.reset()
    episode_reward = 0
    action_probs_history, critic_value_history, rewards_history = [], [], []

    with tf.GradientTape() as tape:
        for _ in range(100):  # Play rounds
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state_tensor)
            critic_value_history.append(critic_value[0, 0])

            # Epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = np.random.choice(num_actions) # Explore
            else:
                action = np.argmax(action_probs) # Exploit

            # Choose an action based on probabilities
            # action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the action in Blackjack
            state, reward, done = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break # End episode

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in reversed(rewards_history):
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize returns
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + 1e-7)

        # Compute Loss
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses, critic_losses = [], []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # Actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Track losses for averaging after training
    actor_losses_history.append(np.mean(actor_losses))
    critic_losses_history.append(np.mean(critic_losses))

    # Track performance
    running_reward += episode_reward
    #running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward # Higher value - model reacts faster to changes / Lower value - smoother trend but slower to update
    episode_count += 1

    if episode_count % 50 == 0 or episode_count == 1:
        print(f"Episode {hand_count + 1}: Running Reward: {running_reward / episode_count:.2f}")

    if running_reward / episode_count > 5:  # Stop when the AI consistently wins
        print(f"Solved in {hand_count + 1} episodes!")
        break

# Display average loss after episodes complete
average_actor_loss = np.mean(actor_losses_history)
average_critic_loss = np.mean(critic_losses_history)

print(f"Average Actor Loss: {average_actor_loss:.4f}")
print(f"Average Critic Loss: {average_critic_loss:.4f}")

Episode 1: Running Reward: -1.00
Episode 50: Running Reward: -0.04
Episode 100: Running Reward: -0.19
Episode 150: Running Reward: -0.25
Episode 200: Running Reward: -0.26
Episode 250: Running Reward: -0.26
Episode 300: Running Reward: -0.30
Episode 350: Running Reward: -0.31
Episode 400: Running Reward: -0.33
Episode 450: Running Reward: -0.29
Episode 500: Running Reward: -0.30
Episode 550: Running Reward: -0.29
Episode 600: Running Reward: -0.28
Episode 650: Running Reward: -0.29
Episode 700: Running Reward: -0.29
Episode 750: Running Reward: -0.29
Episode 800: Running Reward: -0.29
Episode 850: Running Reward: -0.30
Episode 900: Running Reward: -0.30
Episode 950: Running Reward: -0.30
Episode 1000: Running Reward: -0.30
Average Actor Loss: -0.3249
Average Critic Loss: 0.2976
