### Compare Listing 
<li>a: vector uniform</li>
<li>b: greedy</li>
<li>c: e - greedy</li>
<li>d: decay e - greedy</li>
<li>e: Linear Reward Inaction</li>
<li>f: Linear Reward Penalty</li>
<li>g: UBC</li>
<li>h: BayesianUCB</li>
<li>i: Thompson Sampling</li>
<li>j: Neural Network</li>
<li>k: Non Stationary</li>

In [120]:
# import lib
import numpy as np
import matplotlib.pyplot as plt
import scipy,time,sys
import scipy.stats as stats
np.random.seed(5678)
np.set_printoptions(3)

In [121]:
# setting the ground truth
num_bandit = 12
num_ep  = 20
num_iter= 2000
gt_prob = np.random.uniform(0,1,num_bandit)
optimal_choice = np.argmax(gt_prob)
print(gt_prob)
print('Best Choice: ',optimal_choice,gt_prob[optimal_choice])

[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Best Choice:  11 0.7364685816073836


In [122]:
# a vectorized
a_expect = np.zeros((num_ep,num_bandit))
                    
for eps in range(num_ep):
    temp_expect = np.zeros(num_bandit)
    temp_choice = np.zeros(num_bandit)
                    
    for iter in range(num_iter//10):
        temp_choice    = temp_choice + 1
        current_reward = np.random.uniform(0,1) < gt_prob
        temp_expect    = temp_expect + (1/(temp_choice+1)) * (current_reward - temp_expect)
    a_expect[eps,:] = temp_expect
                    
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(a_expect.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.489 0.056 0.37  0.519 0.598 0.434 0.176 0.284 0.069 0.182 0.085 0.734]


In [123]:
# b greedy
b_pull_count   = np.zeros((num_ep,num_bandit))
b_estimation   = np.zeros((num_ep,num_bandit))
b_reward       = np.zeros((num_ep,num_iter))
b_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    b_pull_count[eps,:]   = temp_pull_count
    b_estimation[eps,:]   = temp_estimation
    b_reward[eps,:]       = temp_reward
    b_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(b_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.739]


In [124]:
# c e greedy 
c_pull_count   = np.zeros((num_ep,num_bandit))
c_estimation   = np.zeros((num_ep,num_bandit))
c_reward       = np.zeros((num_ep,num_iter))
c_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    epsilon = np.random.uniform(0,1)
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect) if epsilon < np.random.uniform(0,1) else np.random.choice(np.arange(num_bandit))
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    c_pull_count[eps,:]   = temp_pull_count
    c_estimation[eps,:]   = temp_estimation
    c_reward[eps,:]       = temp_reward
    c_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(c_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.489 0.066 0.339 0.524 0.559 0.393 0.175 0.292 0.074 0.198 0.09  0.737]


In [125]:
# d decy e greedy 
d_pull_count   = np.zeros((num_ep,num_bandit))
d_estimation   = np.zeros((num_ep,num_bandit))
d_reward       = np.zeros((num_ep,num_iter))
d_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    epsilon = 1.0
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect) if epsilon < np.random.uniform(0,1) else np.random.choice(np.arange(num_bandit))
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
        # decay the eps
        epsilon = 0.999 * epsilon
        
    d_pull_count[eps,:]   = temp_pull_count
    d_estimation[eps,:]   = temp_estimation
    d_reward[eps,:]       = temp_reward
    d_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(d_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.472 0.067 0.37  0.515 0.579 0.422 0.183 0.285 0.057 0.188 0.082 0.731]


In [126]:
# e Linear Reward Inaction
e_pull_count   = np.zeros((num_ep,num_bandit))
e_estimation   = np.zeros((num_ep,num_bandit))
e_reward       = np.zeros((num_ep,num_iter))
e_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    learning_rate = 0.001
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit) + 1.0/num_bandit
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.random.choice(num_bandit, p=temp_estimation)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        
        mask = np.zeros(num_bandit)
        mask[current_choice] = 1.0
        
        if current_reward == 1.0:
            temp_estimation = (mask) * (temp_estimation + learning_rate * (1-temp_estimation)) + (1-mask) * ( (1-learning_rate) * temp_estimation)
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    e_pull_count[eps,:]   = temp_pull_count
    e_estimation[eps,:]   = temp_estimation
    e_reward[eps,:]       = temp_reward
    e_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(e_estimation.mean(0))
print('Expected Normalized')
print(e_estimation.mean(0) * gt_prob.sum())

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.104 0.045 0.081 0.108 0.129 0.092 0.055 0.07  0.046 0.057 0.046 0.166]
Expected Normalized
[0.419 0.181 0.325 0.433 0.516 0.369 0.221 0.279 0.184 0.23  0.185 0.665]


In [127]:
# f Linear Reward Penalty
f_pull_count   = np.zeros((num_ep,num_bandit))
f_estimation   = np.zeros((num_ep,num_bandit))
f_reward       = np.zeros((num_ep,num_iter))
f_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    alpha = 0.001
    beta  = 0.0001
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit) + 1.0/num_bandit
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    
    for iter in range(num_iter):

        # select bandit / get reward /increase count / update estimate
        current_choice = np.random.choice(num_bandit, p=temp_estimation)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1

        mask = np.zeros(num_bandit)
        mask[current_choice] = 1.0
        
        if current_reward == 1.0:
            temp_estimation = (mask) * (temp_estimation + alpha * (1-temp_estimation)) + (1-mask) * ( (1-alpha) * temp_estimation)
        else: 
            temp_estimation = (mask) * ((1-beta) * temp_estimation) + (1-mask) * ( beta/(num_bandit-1) + (1-beta) * temp_estimation )

        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    f_pull_count[eps,:]   = temp_pull_count
    f_estimation[eps,:]   = temp_estimation
    f_reward[eps,:]       = temp_reward
    f_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(f_estimation.mean(0))
print('Expected Normalized')
print(f_estimation.mean(0) * gt_prob.sum())

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.105 0.046 0.08  0.109 0.126 0.093 0.055 0.073 0.047 0.057 0.048 0.16 ]
Expected Normalized
[0.422 0.186 0.32  0.435 0.506 0.374 0.222 0.291 0.188 0.228 0.194 0.64 ]


In [None]:
# Thompson Sampling (beta)
k_pull_count   = np.zeros((num_ep,num_bandit))
k_estimation   = np.zeros((num_ep,num_bandit))
k_reward       = np.zeros((num_ep,num_iter))
k_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):

    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        theta_samples = [stats.beta(a=1+w,b=1+t-w).rvs(size=1) for t, w in zip(temp_pull_count, temp_estimation)]
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(theta_samples)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + current_reward
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    k_pull_count[eps,:]   = temp_pull_count
    k_estimation[eps,:]   = theta_samples
    k_reward[eps,:]       = temp_reward
    k_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(k_estimation.mean(0))

0
1
2


# Reference 
1. numpy.set_printoptions — NumPy v1.14 Manual. (2019). Docs.scipy.org. Retrieved 13 January 2019, from https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.set_printoptions.html
2. [ Archived Post ] Random Note about Multi-Arm Bandit Problem 2. (2019). Medium. Retrieved 13 January 2019, from https://medium.com/@SeoJaeDuk/archived-post-random-note-about-multi-arm-bandit-problem-2-5c522d1dfbdc
