# In [1]:
import numpy as np
import random

class MultiArmedBandit:
    """
    Simulated multi-armed bandit played with an epsilon-greedy policy.

    Each arm pays a normally distributed reward (standard deviation 1) around
    a hidden mean that is itself drawn from a standard normal distribution
    when the bandit is created. The bandit tracks the running average reward
    and pull count of every arm and uses the averages to pick arms greedily.
    """

    def __init__(self, num_arms, seed=None):
        """
        Initialize the MAB problem with a set of arms.

        :param num_arms: The number of arms (options) to choose from; must be at least 1
        :param seed: Optional integer seed (0 to 2**32 - 1). When given, the bandit
            draws from its own private generators so a run can be reproduced.
            When None, the global ``random`` and ``numpy.random`` state is used.
        :raises ValueError: If num_arms is less than 1
        """
        # Fail at construction instead of later inside choose_arm, where an
        # empty bandit would only surface as an obscure randint/argmax error.
        if num_arms < 1:
            raise ValueError(f"num_arms must be at least 1, got {num_arms!r}")

        self.num_arms = num_arms

        # Private generators exist only when a seed was supplied; None means
        # "fall back to the global generators" (see the _rng helpers below).
        if seed is None:
            self._choice_rng = None
            self._reward_rng = None
        else:
            self._choice_rng = random.Random(seed)
            self._reward_rng = np.random.RandomState(seed)

        # True mean reward of each arm (hidden from the policy).
        self.rewards = [self._get_reward_rng().normal(loc=0, scale=1) for _ in range(num_arms)]
        # Running average of the rewards observed so far for each arm.
        self.arm_rewards = [0.0] * num_arms
        # Number of times each arm has been pulled.
        self.arm_counts = [0] * num_arms

    def _get_choice_rng(self):
        """Return the generator used for explore/exploit decisions."""
        return random if self._choice_rng is None else self._choice_rng

    def _get_reward_rng(self):
        """Return the generator used to sample arm means and rewards."""
        return np.random if self._reward_rng is None else self._reward_rng

    def choose_arm(self, epsilon):
        """
        Choose an arm to pull based on the epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random arm is explored;
        otherwise the arm with the highest running average is exploited.
        Ties on the greedy branch go to the lowest-indexed arm.

        :param epsilon: The probability of choosing a random arm
        :return: The chosen arm, as a plain int index
        """
        rng = self._get_choice_rng()
        if rng.random() < epsilon:
            return rng.randint(0, self.num_arms - 1)
        # np.argmax yields a NumPy integer; convert so that both branches
        # return the same plain int type.
        return int(np.argmax(self.arm_rewards))

    def pull_arm(self, arm):
        """
        Pull the chosen arm and receive a reward.

        Also updates the running average and pull count of that arm.

        :param arm: The chosen arm
        :return: The reward received
        """
        # Sample a reward from the arm's distribution (its hidden mean, std 1).
        reward = self._get_reward_rng().normal(loc=self.rewards[arm], scale=1)
        # Incremental mean: new_mean = old_mean + (reward - old_mean) / n.
        self.arm_counts[arm] += 1
        self.arm_rewards[arm] += (reward - self.arm_rewards[arm]) / self.arm_counts[arm]
        return reward

    def run_experiment(self, num_pulls, epsilon):
        """
        Run the MAB experiment for a given number of pulls.

        :param num_pulls: The number of pulls to perform
        :param epsilon: The probability of choosing a random arm
        :return: The total reward received (0 when num_pulls is 0)
        """
        return sum(self.pull_arm(self.choose_arm(epsilon)) for _ in range(num_pulls))

# Demo: create a five-armed bandit and play 1000 rounds, exploring a random
# arm 10% of the time, then report the cumulative reward.
num_arms = 5
mab = MultiArmedBandit(num_arms=num_arms)
total_reward = mab.run_experiment(num_pulls=1000, epsilon=0.1)
print(f"Total reward: {total_reward:.2f}")

# Sample output (the value varies from run to run because the arm means and rewards are random):
# Total reward: 1879.43
