In [2]:
import numpy as np

In [3]:
class EpsilonGreedyBandit:
    def __init__(self, items, epsilon=0.1):
        self.items = items
        self.k = len(items)
        self.epsilon = epsilon
        self.action_values = np.zeros(self.k)
        self.action_counts = np.zeros(self.k)

    def select_action(self):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.k)
        else:
            return np.argmax(self.action_values)

    def update(self, action, reward):
        self.action_counts[action] += 1
        n = self.action_counts[action]
        self.action_values[action] += (reward - self.action_values[action]) / n

    def recommend(self):
        action = self.select_action()
        return self.items[action], action

In [4]:
class UserSimulator:
    """Simulates user behavior based on preferences for specific fruits."""

    def __init__(self, true_preferences):
        self.true_preferences = true_preferences

    def respond_to_recommendation(self, item_index):
        like_probability = self.true_preferences[item_index]
        return np.random.binomial(1, like_probability)

In [5]:
fruits = ["Apple", "Banana", "Orange", "Grape", "Mango"]
true_preferences = [0.5, 0.7, 0.9, 0.4, 0.8]

In [6]:
bandit = EpsilonGreedyBandit(items=fruits, epsilon=0.1)
user_simulator = UserSimulator(true_preferences=true_preferences)

In [7]:
for _ in range(1000):
    recommended_fruit, action = bandit.recommend()
    reward = user_simulator.respond_to_recommendation(action)
    bandit.update(action, reward)
    print(f"Recommended: {recommended_fruit}, Reward: {reward}")

Recommended: Apple, Reward: 1
Recommended: Apple, Reward: 0
Recommended: Apple, Reward: 1
Recommended: Apple, Reward: 0
Recommended: Apple, Reward: 1
Recommended: Apple, Reward: 0
Recommended: Apple, Reward: 1
Recommended: Apple, Reward: 0
Recommended: Apple, Reward: 1
Recommended: Apple, Reward: 0
Recommended: Apple, Reward: 0
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 1
Recommended: Grape, Reward: 0
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 0
Recommended: Banana, Reward: 0
Recommended: Grape, Reward: 0
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 0
Recommended: Banana, Reward: 0
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 0
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 0
Recommended: Banana, Reward: 1
Recommended: Banana, Reward: 0
Recommended: Banana, Reward: 1
Recommended: Banana, 

In [8]:
print("\nFinal estimated values for each fruit:")
for fruit, value in zip(fruits, bandit.action_values):
    print(f"{fruit}: {value:.3f}")


Final estimated values for each fruit:
Apple: 0.357
Banana: 0.740
Orange: 0.901
Grape: 0.360
Mango: 0.810


In [9]:
print("\nNumber of times each fruit was recommended:")
for fruit, count in zip(fruits, bandit.action_counts):
    print(f"{fruit}: {count}")


Number of times each fruit was recommended:
Apple: 42.0
Banana: 73.0
Orange: 839.0
Grape: 25.0
Mango: 21.0
