In [1]:
import numpy as np

# --- Bandit configuration and learner state ---
actions = ["A", "B", "C"]           # labels of the available actions
true_rewards = [1.0, 0.5, 0.2]      # simulated mean reward, one per action
alpha = 0.1                         # step size for the preference update

H = np.zeros(len(actions))          # one preference per action, all start at 0
average_reward = reward_count = 0   # running reward baseline and its sample count

# Softmax function
def softmax(H):
    """Convert action preferences into a probability distribution.

    Computes ``exp(H[i]) / sum(exp(H))`` over all entries of ``H``. The
    largest preference is subtracted before exponentiating: this does not
    change the result mathematically, but it keeps ``np.exp`` from
    overflowing to ``inf`` (which would turn the ratio into ``nan``) when
    preferences grow large.

    Args:
        H: Array-like of numeric preferences.

    Returns:
        NumPy array with the same shape as ``H`` whose entries sum to 1.
        An empty input yields an empty array.
    """
    H = np.asarray(H)
    if H.size == 0:
        # np.max raises on empty input; there is nothing to normalise anyway.
        return np.exp(H)
    eH = np.exp(H - np.max(H))
    return eH / np.sum(eH)

# Gradient bandit, 10 interactions: sample an action from the softmax policy,
# observe a noisy reward, refresh the reward baseline, then shift preferences.
for step in range(10):
    # Policy for this step, computed from the preferences before the update.
    probs = softmax(H)
    action_index = np.random.choice(len(actions), p=probs)

    # Noisy reward centred on the chosen action's simulated mean (std 0.1).
    reward = np.random.normal(true_rewards[action_index], 0.1)

    # Incremental mean of all rewards seen so far, including this one.
    reward_count += 1
    average_reward += (reward - average_reward) / reward_count

    # One-hot form of the update: the chosen action moves by
    # scale * (1 - p), every other action by -scale * p.
    scale = alpha * (reward - average_reward)
    chosen = np.zeros(len(actions))
    chosen[action_index] = 1.0
    H += scale * (chosen - probs)

    print(f"Step {step+1}: Action {actions[action_index]}, Reward {reward:.2f}, H = {H.round(2)}, P = {softmax(H).round(2)}")

Step 1: Action B, Reward 0.50, H = [0. 0. 0.], P = [0.33 0.33 0.33]
Step 2: Action C, Reward 0.39, H = [ 0.  0. -0.], P = [0.33 0.33 0.33]
Step 3: Action C, Reward 0.23, H = [ 0.01  0.01 -0.01], P = [0.34 0.34 0.33]
Step 4: Action C, Reward 0.15, H = [ 0.01  0.01 -0.02], P = [0.34 0.34 0.33]
Step 5: Action C, Reward 0.01, H = [ 0.02  0.02 -0.04], P = [0.34 0.34 0.32]
Step 6: Action C, Reward 0.33, H = [ 0.02  0.02 -0.04], P = [0.34 0.34 0.32]
Step 7: Action A, Reward 0.95, H = [ 0.06 -0.   -0.06], P = [0.35 0.33 0.31]
Step 8: Action B, Reward 0.58, H = [ 0.05  0.01 -0.06], P = [0.35 0.34 0.31]
Step 9: Action C, Reward 0.22, H = [ 0.06  0.02 -0.07], P = [0.35 0.34 0.31]
Step 10: Action A, Reward 0.95, H = [ 0.09 -0.   -0.09], P = [0.36 0.33 0.3 ]
