In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Define the actions 

def action_0():
    return np.random.choice([1,0], p=[0.5,0.5])

def action_1():
    return np.random.choice([1, 0], p=[0.6, 0.4])

def action_2():
    return np.random.choice([1, 0], p=[0.2, 0.8])

rewards = [action_0, action_1, action_2]

In [4]:
#do action 0 for 10 times
for i in range(10):
    print('Pull %d (action_0): reward=%d' % (i, rewards[0]()))


Pull 0 (action_0): reward=1
Pull 1 (action_0): reward=0
Pull 2 (action_0): reward=0
Pull 3 (action_0): reward=0
Pull 4 (action_0): reward=0
Pull 5 (action_0): reward=0
Pull 6 (action_0): reward=0
Pull 7 (action_0): reward=1
Pull 8 (action_0): reward=0
Pull 9 (action_0): reward=0


In [6]:
# Simulate action values (Q): expected reward for each action
pulls = 100000

action_values = []
for reward in rewards:
    value = [reward() for _ in range(pulls)]  # execute each of the actions 'pulls' times
    action_values.append(value)
    
    
for action, value in enumerate(action_values):
    print("Action %d: Q(a_%d)=%.2f" % (action, action, np.mean(value)))

Action 0: Q(a_0)=0.50
Action 1: Q(a_1)=0.60
Action 2: Q(a_2)=0.20


In [44]:
# To simulate the values (V), we need to define a policy
# (Value is the expected reward given the policy I'm following)

# Define a policy:
def policy_random():
    '''Returns which action to perform using equal probabilities for each action'''
    return np.random.choice([0, 1, 2], p=[1/3, 1/3, 1/3])


def policy_better():
    ''' A better policy than random: we choose actions 0 and 1 more often than action 2'''
    return np.random.choice([0, 1, 2], p=[0.4, 0.5, 0.1])

In [45]:
# Simulate Values using the random policy
total_reward = 0
for pull in range(pulls):
    action = policy_random()
    total_reward += rewards[action]()
print("Total reward =", total_reward)
print("Average reward: V =", total_reward/pulls)

Total reward = 43510
Average reward: V = 0.4351


In [46]:

# Simulate Values using the better policy
total_reward = 0
for pull in range(pulls):
    action = policy_better()
    total_reward += rewards[action]()
print("Total reward =", total_reward)
print("Average reward: V =", total_reward/pulls)

Total reward = 52090
Average reward: V = 0.5209


In [47]:
# Regret of the better policy
V_star = max([np.mean(value) for value in action_values])
print("V* =", V_star)

total_regret = 0
for pull in range(pulls):
    total_regret += (V_star - rewards[policy_better()]())
print('Regret: I_t = %.2f' % (total_regret/pulls))

V* = 0.60133
Regret: I_t = 0.08


In [48]:
# Some bandit policies to explore:

def policy_greedy(action_values):
    '''Always returns the action for which the payoff is highest'''
    best_action = np.argmax([np.mean(value) for value in action_values])
    return best_action


def policy_e_greedy(action_values, epsilon=0.05):
    '''We explore with epsilon probability, and choose the best action the rest of the time'''
    explore = np.random.choice([1, 0], p=[epsilon, 1-epsilon])
    if explore:
        # Random action
        return policy_random()
    else:
        # Choose best action
        return policy_greedy(action_values)


In [49]:

# Implementing the decaying epsilon-greedy properly requires a class definition so we can store the epsilon values
class DecayingEGreedy:
    
    def __init__(self, epsilon, decay=0.99, lower_bound=0):
        self.epsilon = epsilon
        self.decay = decay
        self.lower_bound = lower_bound
        
    def policy(self, action_values):
        if self.lower_bound > 0 and self.epsilon > self.lower_bound:
            self.epsilon *= self.decay  # update epsilon
        explore = np.random.choice([1, 0], p=[self.epsilon, 1-self.epsilon])  # explore vs exploit decision
        if explore:
            # Random action
            return policy_random()
        else:
            # Choose best action
            return policy_greedy(action_values)

In [50]:
# Let's test the decaying epsilon-greedy approach
agent = DecayingEGreedy(epsilon=0.1, decay=0.99, lower_bound=0.03)

# Full problem:
action_values = [[], [], []] # initialise values
rewards_decaying_e_greedy = []
total_reward = 0
print('Number of pulls\t\tTotal reward\t\tV')
for pull in range(pulls):
    action = agent.policy(action_values)  # choose action according to policy
    reward = rewards[action]()  # get reward
    action_values[action].append(reward)  # update action_values so we make better decisions down the line
    total_reward += reward
    if (pull+1) % 1000 == 0:
        print('%d\t\t\t%d\t\t\t%.3f' % (pull+1, total_reward, total_reward/pull))
        rewards_decaying_e_greedy.append(total_reward/pull)

Number of pulls		Total reward		V


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


1000			500			0.501
2000			981			0.491
3000			1528			0.510
4000			2117			0.529
5000			2708			0.542
6000			3310			0.552
7000			3921			0.560
8000			4497			0.562
9000			5096			0.566
10000			5684			0.568
11000			6286			0.572
12000			6863			0.572
13000			7470			0.575
14000			8071			0.577
15000			8667			0.578
16000			9295			0.581
17000			9862			0.580
18000			10476			0.582
19000			11067			0.583
20000			11653			0.583
21000			12241			0.583
22000			12851			0.584
23000			13433			0.584
24000			14033			0.585
25000			14620			0.585
26000			15216			0.585
27000			15826			0.586
28000			16419			0.586
29000			16989			0.586
30000			17580			0.586
31000			18166			0.586
32000			18760			0.586
33000			19358			0.587
34000			19942			0.587
35000			20549			0.587
36000			21179			0.588
37000			21749			0.588
38000			22347			0.588
39000			22926			0.588
40000			23535			0.588
41000			24120			0.588
42000			24716			0.588
43000			25325			0.589
44000			25936			0.589
45000			26513			0.589
46000			27080			0.589
47000			27678			