### Compare Listing 
<li>a: vector uniform</li>
<li>b: greedy</li>
<li>c: e - greedy</li>
<li>d: Linear Reward Penalty</li>
<li>e: Linear Reward Inaction</li>
<li>f: UBC</li>
<li>g: BayesianUCB</li>
<li>h: Thompson Sampling</li>
<li>i: Neural Network</li>
<li>j: Non Stationary</li>

In [39]:
# import lib
import numpy as np
import matplotlib.pyplot as plt
import scipy,time,sys
np.random.seed(5678)
np.set_printoptions(3)

In [40]:
# setting the ground truth
num_bandit = 12
num_ep  = 10
num_iter= 2000
gt_prob = np.random.uniform(0,1,num_bandit)
optimal_choice = np.argmax(gt_prob)
print(gt_prob)
print('Best Choice: ',optimal_choice,gt_prob[optimal_choice])

[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Best Choice:  11 0.7364685816073836


In [54]:
# vectorized
a_expect = np.zeros((num_ep,num_bandit))
                    
for eps in range(num_ep):
    temp_expect = np.zeros(num_bandit)
    temp_choice = np.zeros(num_bandit)
                    
    for iter in range(num_iter//10):
        temp_choice    = temp_choice + 1
        current_reward = np.random.uniform(0,1) < gt_prob
        temp_expect    = temp_expect + (1/(temp_choice+1)) * (current_reward - temp_expect)
    a_expect[eps,:] = temp_expect
                    
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(a_expect.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.496 0.059 0.369 0.527 0.61  0.435 0.176 0.279 0.072 0.181 0.093 0.738]


In [None]:
# greedy
b_reward       = np.zeros((num_ep,num_iter))
b_optimal_pull = np.zeros((num_ep,num_iter))
b_pull_count   = np.zeros((num_ep,num_bandit))
b_estimation   = np.zeros((num_ep,num_bandit))
                    
for eps in range(num_ep):
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    a_expect[eps,:] = temp_expect
                    
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(a_expect.mean(0))

# Reference 
1. numpy.set_printoptions — NumPy v1.14 Manual. (2019). Docs.scipy.org. Retrieved 13 January 2019, from https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.set_printoptions.html