# In [None]:
import numpy as np
import matplotlib.pyplot as plt

# In [None]:
class KArmedBandit:
    """Stationary k-armed bandit testbed driven by an epsilon-greedy agent.

    Each arm's true value q*(a) is drawn once from N(0, 1) at construction.
    Pulling arm ``a`` yields a reward drawn from N(q*(a), 1). The agent keeps
    sample-average estimates Q(a) and selects arms epsilon-greedily.

    All randomness comes from NumPy's global ``np.random`` state, so call
    ``np.random.seed(...)`` before constructing the bandit to get a
    reproducible run.
    """

    def __init__(self, k=10, epsilon=0.1):
        """Create a bandit with ``k`` arms and exploration rate ``epsilon``.

        Args:
            k: Number of arms; must be at least 1.
            epsilon: Probability of exploring on each step; must lie in [0, 1].

        Raises:
            ValueError: If ``k`` is less than 1 or ``epsilon`` is outside [0, 1].
        """
        # Fail early: with k < 1 the first action selection would otherwise
        # raise an unrelated error from inside NumPy.
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        # Written as a negated chain so that NaN is rejected as well.
        if not 0 <= epsilon <= 1:
            raise ValueError(f"epsilon must be in [0, 1], got {epsilon}")

        self.k = k  # Number of arms
        self.epsilon = epsilon  # Exploration rate
        self.q_star = np.random.normal(0, 1, k)  # True action values (q*)
        self.q_estimates = np.zeros(k)  # Estimated action values (Q(a))
        self.action_counts = np.zeros(k)  # Number of times each action was taken

    def select_action(self):
        """Select an action using the epsilon-greedy strategy.

        Returns:
            Index of the chosen arm, in ``range(self.k)``.
        """
        if np.random.rand() < self.epsilon:
            # Explore: pick one of the k arms uniformly at random.
            return np.random.randint(self.k)
        # Exploit: pick the arm with the highest estimate. np.argmax returns
        # the first maximal index, so ties go to the lowest-numbered arm.
        return np.argmax(self.q_estimates)

    def get_reward(self, action):
        """Draw a reward for ``action`` from N(q*(action), 1).

        Args:
            action: Index of the arm that was pulled.

        Returns:
            The sampled reward.
        """
        return np.random.normal(self.q_star[action], 1)

    def update_estimates(self, action, reward):
        """Fold ``reward`` into the sample-average estimate for ``action``.

        Args:
            action: Index of the arm that was pulled.
            reward: Reward observed for that pull.
        """
        self.action_counts[action] += 1
        # Incremental mean: Q <- Q + (R - Q) / N, so after N pulls Q equals
        # the plain average of the N rewards without storing them.
        self.q_estimates[action] += (reward - self.q_estimates[action]) / self.action_counts[action]

    def run(self, steps=1000):
        """Run the epsilon-greedy agent for ``steps`` pulls.

        Args:
            steps: Number of pulls to simulate.

        Returns:
            Array of length ``steps`` whose entry ``i`` is the mean reward
            obtained over pulls ``0..i``. Empty when ``steps`` is 0.
        """
        rewards = np.zeros(steps)

        for step in range(steps):
            action = self.select_action()
            reward = self.get_reward(action)
            self.update_estimates(action, reward)
            rewards[step] = reward

        # Running mean: cumulative reward divided by the number of pulls so far.
        return np.cumsum(rewards) / np.arange(1, steps + 1)

# Experiment configuration
k = 10
epsilon = 0.1
steps = 1000

# Build the bandit and record its running average reward over the whole run
bandit = KArmedBandit(k=k, epsilon=epsilon)
average_rewards = bandit.run(steps=steps)

# Draw the learning curve using Matplotlib's object-oriented interface
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(average_rewards, label=f'Epsilon = {epsilon}')
ax.set_xlabel('Steps')
ax.set_ylabel('Average Reward')
ax.set_title('Epsilon-Greedy: Average Reward vs. Steps')
ax.legend()
ax.grid(True)
plt.show()