In [45]:
!pip install numpy

import numpy as np



## Bandit Environment
### Call Bandit(min, max) to create a bandit whose reward range is drawn at random from within [min, max]
### Call Bandit.sample() to sample from this distribution

In [46]:
#Bandit instance, used as environment
class Bandit:
    """A single arm whose rewards are uniform over a randomly drawn interval.

    Attributes:
        times_chosen: Pull counter. It starts at 1 (not 0) and is incremented
            by every call to ``sample``.
        dist_range: Two-element ascending list ``[low, high]`` holding the
            bounds of the reward distribution.
        true_reward: Midpoint of ``dist_range``, i.e. the mean of the uniform
            reward distribution.
    """

    def __init__(self, min_val, max_val):
        """Draw two integers in ``[min_val, max_val]`` and use them as bounds.

        The chosen bounds are printed as a side effect.
        """
        self.times_chosen = 1
        # randint's upper bound is exclusive, hence the +1 to include max_val.
        endpoints = np.random.randint(min_val, max_val + 1, 2)
        self.dist_range = sorted(endpoints)
        low, high = self.dist_range
        self.true_reward = (low + high) / 2
        print(self.dist_range)

    def sample(self):
        """Record one more pull and return a uniform draw from ``dist_range``."""
        low, high = self.dist_range
        self.times_chosen += 1
        return np.random.uniform(low, high)

## Our Agent needs to:
###    a) Keep track of how "good" each bandit has been in terms of reward
###    b) Keep track of regret

In [47]:
class Agent:
    """Base class holding the bandits and the bookkeeping shared by all agents.

    Attributes:
        num_bandits: Number of bandits requested.
        bandits: The ``Bandit`` instances, created in index order.
        rewards_avg: Per-bandit reward estimate; every entry starts at 1.
        best_reward: Largest ``true_reward`` among the bandits, used as the
            reference value when computing regret.
        regret: Accumulated regret, starting at 0.
        time: Step counter, starting at 1. This base class never advances it.
    """

    def __init__(self, num_bandits=10, dist_min=1, dist_max=10):
        """Create ``num_bandits`` bandits with bounds in ``[dist_min, dist_max]``.

        Args:
            num_bandits: How many bandits to create.
            dist_min: Lower limit passed to each ``Bandit``.
            dist_max: Upper limit passed to each ``Bandit``.
        """
        self.num_bandits = num_bandits
        self.regret = 0
        self.time = 1
        self.bandits = [Bandit(dist_min, dist_max) for _ in range(num_bandits)]
        self.rewards_avg = [1] * len(self.bandits)
        # Take the real maximum instead of starting from a -1 sentinel: with
        # the sentinel, reward ranges lying entirely below -1 left best_reward
        # stuck at -1 and inflated every regret value. -1 is kept only as the
        # value reported when there are no bandits at all.
        self.best_reward = max(
            (bandit.true_reward for bandit in self.bandits), default=-1
        )

# Greedy Agent
## The Greedy Agent chooses the action it currently estimates to be best at every time step (pure exploitation). Is this what we want? What are the advantages and disadvantages of this?

In [48]:
class GreedyAgent(Agent):
    """Agent that always pulls the bandit with the highest current estimate."""

    def __init__(self):
        super().__init__()

    def choose_bandit(self):
        """Pull the top-estimated bandit, then update its estimate and the regret.

        Prints a one-line summary of the step.
        """
        # Pure exploitation: argmax returns the first index on ties.
        bandit_idx = np.argmax(self.rewards_avg)
        chosen = self.bandits[bandit_idx]
        reward = chosen.sample()

        # sample() has already incremented times_chosen, so (pulls - 1) is the
        # weight carried by the previous estimate.
        pulls = chosen.times_chosen
        previous_total = self.rewards_avg[bandit_idx] * (pulls - 1)
        self.rewards_avg[bandit_idx] = (previous_total + reward) / pulls

        # Regret is measured on true means, not on the sampled reward.
        step_regret = self.best_reward - chosen.true_reward
        self.regret += step_regret

        low, high = chosen.dist_range
        print(
            "GREEDY: Bandit {} was chosen ({}, {}), with reward {}, and regret {}, best reward {}".format(
                bandit_idx, low, high, reward, step_regret, self.best_reward
            )
        )

# e-Greedy Agent
## The Epsilon Greedy Agent chooses the best action most of the time (exploitation) but every so often chooses a random action (exploration). Why might this work? What are the advantages and disadvantages of this?

In [49]:
class EpsilonGreedyAgent(Agent):
    """Agent that mostly exploits but occasionally pulls a random bandit.

    Note that ``epsilon`` is the probability of *exploiting* here: a random
    bandit is chosen only when a uniform draw exceeds ``epsilon``, so the
    default of 0.9 explores on roughly 10% of steps.
    """

    def __init__(self, epsilon = 0.9):
        super().__init__()
        self.epsilon = epsilon

    def choose_bandit(self):
        """Pick a bandit, pull it, then update its estimate and the regret.

        Prints a one-line summary of the step.
        """
        explore = np.random.uniform() > self.epsilon
        if explore:
            # Uniformly random index in [0, num_bandits).
            bandit_idx = np.random.randint(0, self.num_bandits)
        else:
            bandit_idx = np.argmax(self.rewards_avg)

        chosen = self.bandits[bandit_idx]
        reward = chosen.sample()

        # sample() has already incremented times_chosen, so (pulls - 1) is the
        # weight carried by the previous estimate.
        pulls = chosen.times_chosen
        previous_total = self.rewards_avg[bandit_idx] * (pulls - 1)
        self.rewards_avg[bandit_idx] = (previous_total + reward) / pulls

        # Regret is measured on true means, not on the sampled reward.
        step_regret = self.best_reward - chosen.true_reward
        self.regret += step_regret

        low, high = chosen.dist_range
        print(
            "EPSILON GREEDY: Bandit {} was chosen ({}, {}), with reward {}, and regret {}, best reward {}".format(
                bandit_idx, low, high, reward, step_regret, self.best_reward
            )
        )

# UCB Agent
## The Upper Confidence Bound Agent chooses the bandit (action) with the highest estimated reward plus an uncertainty bonus. Actions that were chosen long ago, or only rarely, have a more uncertain reward estimate than actions chosen recently and often. We assign a "confidence" term to each action (based on how sure we are of how good/bad it actually is), and then choose our action based on the reward we think it gives, plus that confidence term.

In [69]:
class UCBAgent(Agent):
    """Agent that ranks bandits by estimated reward plus an exploration bonus.

    The bonus for a bandit is ``confidence * sqrt(log(time) / times_chosen)``.

    Attributes:
        confidence: Multiplier applied to the exploration bonus.
        times_since_last_choice: List of ones, one per bandit. It is set up
            here but ``choose_bandit`` does not read or update it.
    """

    def __init__(self, confidence=2):
        super().__init__()
        self.confidence = confidence
        self.times_since_last_choice = [1] * self.num_bandits

    def choose_bandit(self):
        """Pull the bandit with the highest score, then update the bookkeeping.

        Advances ``time`` by one and prints a one-line summary of the step.
        """
        # log(time) is the same for every bandit within a step; on the first
        # step time is 1, so the bonus is 0 for all of them.
        log_time = np.log(self.time)
        scores = [
            estimate + self.confidence * np.sqrt(log_time / bandit.times_chosen)
            for estimate, bandit in zip(self.rewards_avg, self.bandits)
        ]
        self.time += 1

        bandit_idx = np.argmax(scores)
        chosen = self.bandits[bandit_idx]
        reward = chosen.sample()

        # sample() has already incremented times_chosen, so (pulls - 1) is the
        # weight carried by the previous estimate.
        pulls = chosen.times_chosen
        previous_total = self.rewards_avg[bandit_idx] * (pulls - 1)
        self.rewards_avg[bandit_idx] = (previous_total + reward) / pulls

        # Regret is measured on true means, not on the sampled reward.
        step_regret = self.best_reward - chosen.true_reward
        self.regret += step_regret

        low, high = chosen.dist_range
        print(
            "UCB: Bandit {} was chosen ({}, {}), with reward {}, and regret {}, best reward {}".format(
                bandit_idx, low, high, reward, step_regret, self.best_reward
            )
        )


In [71]:
if __name__ == "__main__":
    # Seed NumPy's global RNG so that Restart & Run All reproduces the same
    # bandits, rewards and final regret; change SEED to explore other runs.
    SEED = 0
    NUM_STEPS = 100  # number of pulls the agent gets

    np.random.seed(SEED)
    agent = UCBAgent()
    for _ in range(NUM_STEPS):
        agent.choose_bandit()
    print("Final regret: {}".format(agent.regret))

[6, 9]
[3, 4]
[1, 2]
[2, 10]
[3, 9]
[4, 9]
[6, 9]
[1, 2]
[2, 5]
[5, 9]
UCB: Bandit 0 was chosen (6, 9), with reward 8.363421804180451, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 8.902138847560177, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 7.943536455285491, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 8.818148931700662, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 8.56132505661118, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 8.407015667720913, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 8.633426286691158, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 7.4082958493890985, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 8.335732610216187, and regret 0.0, best reward 7.5
UCB: Bandit 0 was chosen (6, 9), with reward 6.26631354100471, an