In [7]:
import numpy as np

In [8]:
class BanditEnvironment:
    """Multi-armed bandit whose arms pay a reward of 1 or 0.

    Arm ``i`` returns 1 with probability ``probabilities[i]`` and 0 otherwise.

    Parameters
    ----------
    probabilities : sequence of float
        Success probability of each arm. Every entry must lie in [0, 1] and
        at least one arm is required.
    rng : numpy.random.Generator, optional
        Source of randomness for ``pull``. When omitted, NumPy's global
        random state (``np.random.rand``) is used, so ``np.random.seed``
        controls the draws.

    Raises
    ------
    ValueError
        If ``probabilities`` is empty, is not one-dimensional, or contains a
        value outside [0, 1] (NaN included).
    """

    def __init__(self, probabilities, rng=None):
        checked = np.asarray(probabilities, dtype=float)
        if checked.ndim != 1 or checked.size == 0:
            raise ValueError("probabilities must be a non-empty 1-D sequence")
        # Written as a positive test so NaN entries are rejected as well.
        if not np.all((checked >= 0.0) & (checked <= 1.0)):
            raise ValueError("every probability must lie in [0, 1]")
        # Keep the caller's own object so the attribute's type is unchanged.
        self.probabilities = probabilities
        self.num_arms = len(probabilities)
        self._rng = rng

    def pull(self, arm):
        """Pull ``arm`` once and return 1 on success, 0 otherwise."""
        # Fall back to the global NumPy stream when no generator was supplied.
        draw = np.random.rand() if self._rng is None else self._rng.random()
        return 1 if draw < self.probabilities[arm] else 0

In [9]:
class EpsilonGreedy:
    """Epsilon-greedy agent with sample-average value estimates.

    With probability ``epsilon`` the agent picks an arm uniformly at random;
    otherwise it picks an arm with the highest current estimate. Ties for the
    highest estimate are broken uniformly at random, so that the all-zero
    initial estimates do not systematically favour arm 0.

    Parameters
    ----------
    num_arms : int
        Number of arms; must be at least 1.
    epsilon : float, default 0.1
        Exploration probability; must lie in [0, 1].
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` controls the draws.

    Attributes
    ----------
    counts : numpy.ndarray
        Number of updates received per arm.
    values : numpy.ndarray
        Running mean of the rewards observed per arm.

    Raises
    ------
    ValueError
        If ``num_arms`` is less than 1 or ``epsilon`` is outside [0, 1].
    """

    def __init__(self, num_arms, epsilon=0.1, rng=None):
        if num_arms < 1:
            raise ValueError("num_arms must be at least 1")
        # Chained comparison is False for NaN, so NaN is rejected too.
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError("epsilon must lie in [0, 1]")
        self.num_arms = num_arms
        self.epsilon = epsilon
        self.counts = np.zeros(num_arms)
        self.values = np.zeros(num_arms)
        self._rng = rng

    def _uniform(self):
        """Return one uniform draw from [0, 1)."""
        return np.random.rand() if self._rng is None else self._rng.random()

    def _random_index(self, upper):
        """Return a uniformly random integer in ``[0, upper)``."""
        if self._rng is None:
            return int(np.random.randint(upper))
        return int(self._rng.integers(upper))

    def select_arm(self):
        """Return the index of the arm to pull next as a Python ``int``."""
        if self._uniform() < self.epsilon:
            return self._random_index(self.num_arms)
        best = np.flatnonzero(self.values == np.max(self.values))
        # A single maximiser, or no match at all (NaN estimates), needs no
        # tie-break; defer to argmax exactly as before.
        if best.size <= 1:
            return int(np.argmax(self.values))
        return int(best[self._random_index(best.size)])

    def update(self, arm, reward):
        """Fold ``reward`` into the running mean estimate of ``arm``."""
        self.counts[arm] += 1
        n = self.counts[arm]
        old_value = self.values[arm]
        # Incremental mean: new = old + (reward - old) / n.
        self.values[arm] = old_value + (1.0 / n) * (reward - old_value)

In [10]:
def simulate(env, agent, num_pulls):
    """Run ``agent`` against ``env`` for ``num_pulls`` rounds.

    Each round asks the agent for an arm, pulls that arm on the environment,
    reports the reward back to the agent, and records the reward.

    Returns
    -------
    numpy.ndarray
        Array of length ``num_pulls`` holding the reward of each round in
        the order the rounds were played.
    """
    history = np.zeros(num_pulls)
    for step in range(num_pulls):
        chosen = agent.select_arm()
        outcome = env.pull(chosen)
        # The agent learns from the raw reward before it is stored.
        agent.update(chosen, outcome)
        history[step] = outcome
    return history

In [11]:
# Seed NumPy's global random state so this stochastic run prints the same
# numbers on every Restart & Run All; the environment and agent draw from
# np.random by default.
seed = 42
np.random.seed(seed)

probabilities = [0.1, 0.5, 0.8]  # success probability of each arm
env = BanditEnvironment(probabilities)
epsilon = 0.1  # probability of choosing a random arm instead of the best estimate
agent = EpsilonGreedy(num_arms=env.num_arms, epsilon=epsilon)
num_pulls = 1000
rewards = simulate(env, agent, num_pulls)
print(f"Total Rewards after {num_pulls} pulls: {int(np.sum(rewards))}")
print('Estimated values:', agent.values)
print('Number of times each arm was pulled:', agent.counts)

Total Rewards after 1000 pulls: 764
Estimated values: [0.04761905 0.36111111 0.81236443]
Number of times each arm was pulled: [ 42.  36. 922.]
