In [None]:
# 1.3 Example for Exploration and Exploitation - HoaDNt@fe.edu.vn
import numpy as np

class EpsilonGreedyAgent:
    """Agent that balances exploration and exploitation with an epsilon-greedy rule.

    Attributes:
        num_actions: Number of actions the agent can choose from.
        epsilon: Threshold compared against a uniform draw from [0, 1);
            a draw below it triggers a random (exploratory) action.
        action_values: Running reward estimate per action, initialised to zeros.
        action_counts: Number of updates applied per action, initialised to zeros.
    """

    def __init__(self, num_actions, epsilon=0.1):
        self.epsilon = epsilon
        self.num_actions = num_actions
        self.action_counts = np.zeros(num_actions)
        self.action_values = np.zeros(num_actions)

    def select_action(self):
        """Return the index of the next action to take.

        One uniform sample is always drawn first. If it falls below
        ``epsilon`` a random action index is drawn; otherwise the index of
        the largest current estimate is returned.
        """
        exploring = np.random.rand() < self.epsilon
        if not exploring:
            # Exploitation: go with the best estimate seen so far
            return np.argmax(self.action_values)
        # Exploration: pick any action at random
        return np.random.randint(self.num_actions)

    def update_value(self, action, reward):
        """Fold ``reward`` into the running estimate for ``action``.

        Uses the incremental sample-average rule
        ``Q <- Q + (1 / N) * (reward - Q)``, where ``N`` is the number of
        updates applied to this action, including the current one.
        """
        self.action_counts[action] += 1
        step_size = 1 / self.action_counts[action]
        error = reward - self.action_values[action]
        self.action_values[action] += step_size * error

# Create a simple multi-armed bandit environment
class MultiArmedBandit:
    """Bandit environment whose arms pay out normally distributed rewards.

    Attributes:
        num_arms: Number of arms available.
        true_action_values: Mean reward of each arm, sampled once at
            construction from a normal distribution with mean 0 and
            standard deviation 1.
    """

    def __init__(self, num_arms):
        self.num_arms = num_arms
        self.true_action_values = np.random.normal(loc=0, scale=1, size=num_arms)

    def get_reward(self, action):
        """Sample a reward for pulling arm ``action``.

        The reward is drawn from a normal distribution centred on the arm's
        true value with standard deviation 1.
        """
        arm_mean = self.true_action_values[action]
        return np.random.normal(loc=arm_mean, scale=1)

# Problem size: number of bandit arms and number of interaction steps
num_arms = 10
num_steps = 1000

# Build the environment and the agent that will learn from it
bandit = MultiArmedBandit(num_arms=num_arms)
agent = EpsilonGreedyAgent(num_actions=num_arms)

# Act, observe a reward, refine the estimate, and keep a running reward total
total_rewards = 0
for step in range(num_steps):
    action = agent.select_action()
    reward = bandit.get_reward(action)
    agent.update_value(action, reward)
    total_rewards = total_rewards + reward

print("Total rewards obtained:", total_rewards)
print("Estimated action values:", agent.action_values)

Total rewards obtained: 1289.8346425836128
Estimated action values: [ 0.20678417  0.31561663 -0.45606796  0.50332794  1.04339528  1.51167698
 -1.00074427 -0.66690294  0.05755201 -2.80537135]
