### Compare Listing 
<li>a: vector uniform</li>
<li>b: greedy</li>
<li>c: e - greedy</li>
<li>d: decay e - greedy</li>
<li>e: Linear Reward Inaction</li>
<li>f: Linear Reward Penalty</li>
<li>g: UBC 1</li>
<li>h: Bayesian UCB</li>
<li>i: Thompson Sampling (beta)</li>
<li>j: Thompson Sampling (uniform)</li>
<li>k: Neural Network</li>
<li>l: Non Stationary</li>

In [170]:
# import lib
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import scipy,time,sys
import scipy.stats as stats
from scipy.stats import beta
np.random.seed(5678)
np.set_printoptions(3)
tf.set_random_seed(678)
%load_ext jupyternotify

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [156]:
# setting the ground truth
num_bandit = 12
num_ep  = 20
num_iter= 1000
gt_prob = np.random.uniform(0,1,num_bandit)
optimal_choice = np.argmax(gt_prob)
print(gt_prob)
print('Best Choice: ',optimal_choice,gt_prob[optimal_choice])

[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Best Choice:  11 0.7364685816073836


In [157]:
# a vectorized
a_expect = np.zeros((num_ep,num_bandit))
                    
for eps in range(num_ep):
    temp_expect = np.zeros(num_bandit)
    temp_choice = np.zeros(num_bandit)
                    
    for iter in range(num_iter//10):
        temp_choice    = temp_choice + 1
        current_reward = np.random.uniform(0,1) < gt_prob
        temp_expect    = temp_expect + (1/(temp_choice+1)) * (current_reward - temp_expect)
    a_expect[eps,:] = temp_expect
                    
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(a_expect.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.482 0.055 0.37  0.513 0.586 0.434 0.181 0.283 0.07  0.188 0.085 0.725]


In [158]:
# b greedy
b_pull_count   = np.zeros((num_ep,num_bandit))
b_estimation   = np.zeros((num_ep,num_bandit))
b_reward       = np.zeros((num_ep,num_iter))
b_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    b_pull_count[eps,:]   = temp_pull_count
    b_estimation[eps,:]   = temp_estimation
    b_reward[eps,:]       = temp_reward
    b_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(b_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.739]


In [159]:
# c e greedy 
c_pull_count   = np.zeros((num_ep,num_bandit))
c_estimation   = np.zeros((num_ep,num_bandit))
c_reward       = np.zeros((num_ep,num_iter))
c_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    epsilon = np.random.uniform(0,1)
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect) if epsilon < np.random.uniform(0,1) else np.random.choice(np.arange(num_bandit))
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    c_pull_count[eps,:]   = temp_pull_count
    c_estimation[eps,:]   = temp_estimation
    c_reward[eps,:]       = temp_reward
    c_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(c_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.471 0.078 0.385 0.489 0.581 0.401 0.172 0.255 0.092 0.162 0.095 0.743]


In [160]:
# d decy e greedy 
d_pull_count   = np.zeros((num_ep,num_bandit))
d_estimation   = np.zeros((num_ep,num_bandit))
d_reward       = np.zeros((num_ep,num_iter))
d_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    epsilon = 1.0
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect) if epsilon < np.random.uniform(0,1) else np.random.choice(np.arange(num_bandit))
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
        # decay the eps
        epsilon = 0.999 * epsilon
        
    d_pull_count[eps,:]   = temp_pull_count
    d_estimation[eps,:]   = temp_estimation
    d_reward[eps,:]       = temp_reward
    d_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(d_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.471 0.066 0.344 0.512 0.575 0.43  0.183 0.279 0.066 0.179 0.101 0.732]


In [161]:
# e Linear Reward Inaction
e_pull_count   = np.zeros((num_ep,num_bandit))
e_estimation   = np.zeros((num_ep,num_bandit))
e_reward       = np.zeros((num_ep,num_iter))
e_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    learning_rate = 0.001
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit) + 1.0/num_bandit
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.random.choice(num_bandit, p=temp_estimation)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        
        mask = np.zeros(num_bandit)
        mask[current_choice] = 1.0
        
        if current_reward == 1.0:
            temp_estimation = (mask) * (temp_estimation + learning_rate * (1-temp_estimation)) + (1-mask) * ( (1-learning_rate) * temp_estimation)
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    e_pull_count[eps,:]   = temp_pull_count
    e_estimation[eps,:]   = temp_estimation
    e_reward[eps,:]       = temp_reward
    e_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(e_estimation.mean(0))
print('Expected Normalized')
print(e_estimation.mean(0) * gt_prob.sum())

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.095 0.062 0.086 0.098 0.104 0.09  0.069 0.078 0.063 0.07  0.063 0.123]
Expected Normalized
[0.379 0.247 0.343 0.391 0.416 0.359 0.278 0.312 0.251 0.282 0.254 0.494]


In [162]:
# f Linear Reward Penalty
f_pull_count   = np.zeros((num_ep,num_bandit))
f_estimation   = np.zeros((num_ep,num_bandit))
f_reward       = np.zeros((num_ep,num_iter))
f_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    alpha = 0.001
    beta  = 0.0001
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit) + 1.0/num_bandit
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    
    for iter in range(num_iter):

        # select bandit / get reward /increase count / update estimate
        current_choice = np.random.choice(num_bandit, p=temp_estimation)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1

        mask = np.zeros(num_bandit)
        mask[current_choice] = 1.0
        
        if current_reward == 1.0:
            temp_estimation = (mask) * (temp_estimation + alpha * (1-temp_estimation)) + (1-mask) * ( (1-alpha) * temp_estimation)
        else: 
            temp_estimation = (mask) * ((1-beta) * temp_estimation) + (1-mask) * ( beta/(num_bandit-1) + (1-beta) * temp_estimation )

        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    f_pull_count[eps,:]   = temp_pull_count
    f_estimation[eps,:]   = temp_estimation
    f_reward[eps,:]       = temp_reward
    f_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(f_estimation.mean(0))
print('Expected Normalized')
print(f_estimation.mean(0) * gt_prob.sum())

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.093 0.063 0.085 0.099 0.105 0.088 0.071 0.078 0.063 0.07  0.065 0.12 ]
Expected Normalized
[0.374 0.251 0.34  0.396 0.423 0.353 0.283 0.313 0.254 0.28  0.259 0.483]


In [163]:
# g UBC
g_pull_count   = np.zeros((num_ep,num_bandit))
g_estimation   = np.zeros((num_ep,num_bandit))
g_reward       = np.zeros((num_ep,num_iter))
g_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(temp_expect + np.sqrt(2*np.log(iter+1)/(1+temp_pull_count)))
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1/(temp_pull_count[current_choice]+1)) * (current_reward-temp_estimation[current_choice])
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    g_pull_count[eps,:]   = temp_pull_count
    g_estimation[eps,:]   = temp_estimation
    g_reward[eps,:]       = temp_reward
    g_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(g_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.49  0.066 0.367 0.531 0.578 0.419 0.141 0.265 0.07  0.178 0.081 0.741]


In [169]:
# h BayesianUCB
h_pull_count   = np.zeros((num_ep,num_bandit))
h_estimation   = np.zeros((num_ep,num_bandit))
h_reward       = np.zeros((num_ep,num_iter))
h_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):
    temp_pull_count   = np.zeros(num_bandit) + 1/num_bandit
    temp_estimation   = np.zeros(num_bandit) + 1/num_bandit
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    temp_temp_pull_cou= np.zeros(num_bandit)
                    
    for iter in range(num_iter):
        
        theta_samples = [t/(t+w) + stats.beta.std(a=t,b=w) * 3.0 for t, w in zip(temp_pull_count, temp_estimation)]
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(theta_samples)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + current_reward
        temp_estimation[current_choice] = temp_estimation[current_choice] + (1-current_reward)
        temp_temp_pull_cou[current_choice] = temp_temp_pull_cou[current_choice] + 1
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    h_pull_count[eps,:]   = temp_temp_pull_cou
    h_estimation[eps,:]   = temp_pull_count/(temp_pull_count + temp_estimation)
    h_reward[eps,:]       = temp_reward
    h_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(h_estimation.mean(0))

Ground Truth
[0.489 0.059 0.366 0.519 0.598 0.431 0.179 0.285 0.071 0.185 0.088 0.736]
Expected 
[0.303 0.081 0.168 0.241 0.503 0.147 0.122 0.144 0.077 0.105 0.071 0.4  ]


In [138]:
# Thompson Sampling (beta) (slow)
k_pull_count   = np.zeros((num_ep,num_bandit))
k_estimation   = np.zeros((num_ep,num_bandit))
k_reward       = np.zeros((num_ep,num_iter))
k_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):

    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        theta_samples = [stats.beta(a=1+w,b=1+t-w).rvs(size=1) for t, w in zip(temp_pull_count, temp_estimation)]
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(theta_samples)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + current_reward
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    k_pull_count[eps,:]   = temp_pull_count
    k_estimation[eps,:]   = theta_samples
    k_reward[eps,:]       = temp_reward
    k_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(k_estimation.mean(0))

Ground Truth
[0.442 0.366 0.672 0.019 0.54  0.368 0.829 0.76  0.588 0.582 0.23  0.995]
Expected 
[0.38  0.334 0.53  0.361 0.447 0.399 0.656 0.534 0.554 0.474 0.266 0.994]


In [139]:
# Thompson Sampling (uniform) (slow)
k_pull_count   = np.zeros((num_ep,num_bandit))
k_estimation   = np.zeros((num_ep,num_bandit))
k_reward       = np.zeros((num_ep,num_iter))
k_optimal_pull = np.zeros((num_ep,num_iter))
                    
for eps in range(num_ep):

    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
                    
    for iter in range(num_iter):
        
        theta_samples = [stats.uniform(w/(t+0.000000001),1-w/(t+0.000000001)).rvs(size=1) for t, w in zip(temp_pull_count, temp_estimation)]
        
        # select bandit / get reward /increase count / update estimate
        current_choice = np.argmax(theta_samples)
        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        temp_estimation[current_choice] = temp_estimation[current_choice] + current_reward
        
        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    k_pull_count[eps,:]   = temp_pull_count
    k_estimation[eps,:]   = theta_samples
    k_reward[eps,:]       = temp_reward
    k_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(k_estimation.mean(0))

Ground Truth
[0.442 0.366 0.672 0.019 0.54  0.368 0.829 0.76  0.588 0.582 0.23  0.995]
Expected 
[0.789 0.668 0.834 0.562 0.755 0.621 0.858 0.805 0.748 0.692 0.532 0.998]


In [294]:
# b greedy
z_pull_count   = np.zeros((num_ep,num_bandit))
z_estimation   = np.zeros((num_ep,num_bandit))
z_reward       = np.zeros((num_ep,num_iter))
z_optimal_pull = np.zeros((num_ep,num_iter))
            
def sigmoid(x): return 1/(1+np.exp(-x))
def d_sigmoid(x): return sigmoid(x)*(1-sigmoid(x))

for eps in range(num_ep):
    temp_pull_count   = np.zeros(num_bandit)
    temp_estimation   = np.zeros(num_bandit)
    temp_reward       = np.zeros(num_iter)
    temp_optimal_pull = np.zeros(num_iter)
    
    weights = np.random.randn(num_bandit,1)
    moment  = np.zeros_like(weights); 
    velocity = np.zeros_like(weights);
    
    for iter in range(num_iter):
        
        # select bandit / get reward /increase count / update estimate
        if np.random.uniform(0,1)>0.8:
            current_choice = np.argmax(weights)
            current_input  = np.zeros((1,num_bandit))
            current_input[0,current_choice] = 1
        else:
            current_choice = np.random.choice(np.arange(num_bandit))
            current_input  = np.zeros((1,num_bandit))
            current_input[0,current_choice] = 1

        layer1 = current_input @ weights
        print(layer1)
        layer1a= sigmoid(layer1)

        current_reward = 1 if np.random.uniform(0,1) < gt_prob[current_choice] else 0
        temp_estimation[current_choice] = temp_estimation[current_choice] + current_reward
        temp_pull_count[current_choice] = temp_pull_count[current_choice] + 1
        
        grad3 = (temp_estimation[current_choice]/(temp_pull_count[current_choice]))/layer1a
        grad2 = d_sigmoid(layer1)
        grad1 = current_input
        grad  = grad1.T @ (grad3 * grad2)
        
        moment   = 0.9*moment + (1-0.9) * grad
        velocity = 0.999*velocity + (1-0.999) * grad**2
        moment_hat   = moment/(1-0.9)
        velocity_hat = velocity/(1-0.999)
        weights  = weights - 0.0005 * (moment_hat/(np.sqrt(velocity_hat)+1e-8))

        # update reward and optimal choice
        temp_reward[iter] = temp_reward[iter] + current_reward
        temp_optimal_pull[iter] = 1 if current_choice == optimal_choice else 0
        
    z_pull_count[eps,:]   = temp_pull_count
    z_estimation[eps,:]   = np.squeeze(weights)
    z_reward[eps,:]       = temp_reward
    z_optimal_pull[eps,:] = temp_optimal_pull
        
print('Ground Truth')
print(gt_prob)
print('Expected ')
print(z_estimation.mean(0))

[[-0.41]]
[[0.653]]
[[1.615]]
[[1.615]]
[[1.338]]
[[1.613]]
[[-1.259]]
[[0.188]]
[[1.336]]
[[0.65]]
[[1.611]]
[[1.61]]
[[-0.41]]
[[0.22]]
[[-1.259]]
[[0.471]]
[[1.608]]
[[1.333]]
[[1.333]]
[[-1.259]]
[[-1.26]]
[[0.188]]
[[1.606]]
[[1.276]]
[[1.605]]
[[0.188]]
[[1.604]]
[[1.603]]
[[1.603]]
[[-1.61]]
[[1.273]]
[[0.471]]
[[0.188]]
[[1.272]]
[[1.329]]
[[0.425]]
[[1.328]]
[[0.645]]
[[1.27]]
[[-0.41]]
[[0.423]]
[[1.269]]
[[0.233]]
[[1.597]]
[[0.22]]
[[1.596]]
[[1.325]]
[[0.219]]
[[0.643]]
[[1.324]]
[[1.595]]
[[1.594]]
[[1.594]]
[[1.323]]
[[-0.41]]
[[0.216]]
[[1.264]]
[[0.642]]
[[-0.41]]
[[1.591]]
[[0.233]]
[[1.59]]
[[1.32]]
[[1.589]]
[[-1.266]]
[[-1.61]]
[[-1.61]]
[[1.262]]
[[0.64]]
[[1.587]]
[[1.587]]
[[-1.61]]
[[1.586]]
[[1.586]]
[[1.585]]
[[-1.61]]
[[1.584]]
[[-1.269]]
[[1.583]]
[[1.261]]
[[0.471]]
[[1.581]]
[[1.581]]
[[-1.271]]
[[1.58]]
[[0.418]]
[[0.637]]
[[1.317]]
[[0.637]]
[[0.228]]
[[-1.61]]
[[1.577]]
[[1.576]]
[[1.576]]
[[0.227]]
[[0.471]]
[[1.575]]
[[0.188]]
[[0.634]]
[[1.574]]
[[-

[[-1.188]]
[[-0.259]]
[[-0.079]]
[[1.071]]
[[1.07]]
[[-1.752]]
[[1.069]]
[[1.068]]
[[0.8]]
[[-1.753]]
[[1.066]]
[[0.562]]
[[-1.188]]
[[-0.71]]
[[-0.263]]
[[1.063]]
[[-1.188]]
[[1.062]]
[[0.561]]
[[0.17]]
[[1.061]]
[[0.798]]
[[0.169]]
[[0.797]]
[[-0.079]]
[[1.059]]
[[-0.713]]
[[-0.38]]
[[-1.758]]
[[1.058]]
[[1.057]]
[[-0.715]]
[[-0.079]]
[[-0.079]]
[[-0.717]]
[[0.165]]
[[-0.191]]
[[1.055]]
[[0.557]]
[[-1.188]]
[[1.055]]
[[1.054]]
[[-1.188]]
[[-1.188]]
[[1.054]]
[[1.053]]
[[0.163]]
[[0.792]]
[[0.561]]
[[-0.266]]
[[-0.079]]
[[1.052]]
[[-0.08]]
[[-1.76]]
[[0.791]]
[[-1.761]]
[[-0.195]]
[[-0.267]]
[[-0.722]]
[[1.051]]
[[0.557]]
[[1.051]]
[[1.05]]
[[0.162]]
[[0.161]]
[[-0.723]]
[[-0.38]]
[[0.555]]
[[0.789]]
[[-0.269]]
[[1.049]]
[[0.554]]
[[-0.724]]
[[1.049]]
[[0.553]]
[[1.049]]
[[1.048]]
[[-1.763]]
[[0.551]]
[[1.048]]
[[1.048]]
[[0.787]]
[[0.55]]
[[0.55]]
[[0.554]]
[[1.046]]
[[0.553]]
[[-0.198]]
[[0.786]]
[[1.046]]
[[0.785]]
[[0.785]]
[[-0.2]]
[[-0.38]]
[[-1.764]]
[[0.783]]
[[1.044]]
[[-1.18

[[0.506]]
[[0.491]]
[[0.49]]
[[0.926]]
[[-0.27]]
[[-0.452]]
[[-0.135]]
[[-0.774]]
[[0.926]]
[[-0.135]]
[[-1.251]]
[[-1.251]]
[[-0.271]]
[[-1.82]]
[[-1.252]]
[[-1.82]]
[[0.925]]
[[0.925]]
[[0.925]]
[[0.489]]
[[0.111]]
[[0.925]]
[[-1.253]]
[[-0.335]]
[[-0.335]]
[[0.111]]
[[0.488]]
[[0.738]]
[[-0.281]]
[[0.061]]
[[-0.281]]
[[0.675]]
[[-0.192]]
[[-0.192]]
[[-0.435]]
[[-0.08]]
[[-0.435]]
[[0.241]]
[[-0.435]]
[[0.675]]
[[-0.086]]
[[-1.077]]
[[-0.435]]
[[0.061]]
[[-0.267]]
[[0.06]]
[[0.673]]
[[-0.08]]
[[0.672]]
[[0.671]]
[[-0.281]]
[[-1.077]]
[[-0.267]]
[[-0.083]]
[[-0.282]]
[[-0.283]]
[[-0.084]]
[[0.667]]
[[0.237]]
[[0.666]]
[[0.236]]
[[0.094]]
[[0.054]]
[[-0.27]]
[[-0.192]]
[[0.539]]
[[-0.288]]
[[-0.091]]
[[0.663]]
[[-0.435]]
[[0.663]]
[[0.09]]
[[-0.435]]
[[0.662]]
[[0.661]]
[[0.539]]
[[-0.089]]
[[0.539]]
[[0.232]]
[[-1.077]]
[[0.051]]
[[-0.275]]
[[0.087]]
[[-0.091]]
[[0.086]]
[[-0.192]]
[[-0.192]]
[[-0.092]]
[[0.658]]
[[-0.292]]
[[-1.077]]
[[-0.292]]
[[-0.293]]
[[-0.094]]
[[-0.094]]
[[0.65

[[2.236]]
[[-0.549]]
[[2.235]]
[[2.235]]
[[2.234]]
[[0.658]]
[[-0.102]]
[[2.232]]
[[2.232]]
[[2.231]]
[[-0.549]]
[[0.338]]
[[-0.104]]
[[0.337]]
[[-0.549]]
[[2.228]]
[[2.228]]
[[0.655]]
[[0.655]]
[[-1.117]]
[[-0.549]]
[[-0.549]]
[[2.225]]
[[0.334]]
[[-0.549]]
[[0.653]]
[[0.273]]
[[2.224]]
[[0.272]]
[[2.223]]
[[2.223]]
[[-2.01]]
[[2.222]]
[[2.221]]
[[-1.119]]
[[2.22]]
[[-1.12]]
[[2.219]]
[[-1.192]]
[[0.331]]
[[-1.192]]
[[0.33]]
[[-0.549]]
[[-0.107]]
[[-1.192]]
[[-1.192]]
[[-1.192]]
[[-0.7]]
[[2.215]]
[[-0.549]]
[[2.215]]
[[2.215]]
[[2.214]]
[[-2.01]]
[[-1.192]]
[[2.213]]
[[2.213]]
[[-2.01]]
[[-0.108]]
[[0.65]]
[[-1.192]]
[[-1.192]]
[[2.211]]
[[-0.704]]
[[2.21]]
[[-2.01]]
[[1.155]]
[[2.209]]
[[0.326]]
[[2.208]]
[[-0.109]]
[[-1.091]]
[[0.326]]
[[0.325]]
[[-2.01]]
[[-2.01]]
[[0.266]]
[[2.206]]
[[0.324]]
[[2.206]]
[[0.649]]
[[-1.125]]
[[-0.549]]
[[-1.125]]
[[-2.01]]
[[2.204]]
[[2.204]]
[[-0.111]]
[[-1.192]]
[[0.321]]
[[0.264]]
[[0.648]]
[[-0.709]]
[[0.32]]
[[-1.194]]
[[2.202]]
[[-1.128]]
[[0

[[-0.757]]
[[-0.757]]
[[2.024]]
[[0.746]]
[[-0.757]]
[[2.023]]
[[-0.502]]
[[0.412]]
[[-0.254]]
[[2.023]]
[[2.022]]
[[-0.783]]
[[-1.178]]
[[0.412]]
[[0.409]]
[[0.637]]
[[2.021]]
[[0.411]]
[[0.636]]
[[0.026]]
[[0.004]]
[[-0.256]]
[[0.406]]
[[0.003]]
[[-0.784]]
[[0.025]]
[[-0.256]]
[[-0.785]]
[[0.025]]
[[-0.504]]
[[2.02]]
[[2.02]]
[[-0.001]]
[[-1.18]]
[[0.742]]
[[0.024]]
[[2.019]]
[[0.632]]
[[0.402]]
[[-0.258]]
[[0.023]]
[[0.741]]
[[0.408]]
[[-0.788]]
[[0.4]]
[[2.018]]
[[-1.182]]
[[2.018]]
[[2.018]]
[[-0.506]]
[[-0.005]]
[[0.629]]
[[-0.26]]
[[-0.26]]
[[-0.79]]
[[2.016]]
[[0.74]]
[[0.407]]
[[2.016]]
[[-0.507]]
[[-0.508]]
[[-1.184]]
[[0.396]]
[[2.015]]
[[2.015]]
[[0.406]]
[[2.015]]
[[-0.792]]
[[-0.009]]
[[2.014]]
[[2.014]]
[[-0.262]]
[[0.394]]
[[-0.757]]
[[2.013]]
[[0.404]]
[[-0.511]]
[[-0.263]]
[[0.393]]
[[0.021]]
[[-0.793]]
[[-0.757]]
[[0.626]]
[[-0.011]]
[[2.012]]
[[-0.512]]
[[0.625]]
[[-0.012]]
[[-0.012]]
[[2.011]]
[[2.011]]
[[-0.757]]
[[2.011]]
[[-0.014]]
[[-0.015]]
[[-0.015]]
[[0.623]

[[0.92]]
[[1.245]]
[[0.079]]
[[1.245]]
[[0.522]]
[[0.92]]
[[1.794]]
[[-0.541]]
[[0.077]]
[[-0.824]]
[[1.794]]
[[0.013]]
[[1.794]]
[[-0.358]]
[[0.506]]
[[1.793]]
[[0.076]]
[[0.506]]
[[0.506]]
[[0.505]]
[[-0.824]]
[[1.792]]
[[0.749]]
[[1.792]]
[[1.792]]
[[1.791]]
[[1.791]]
[[-0.825]]
[[1.79]]
[[-0.7]]
[[1.79]]
[[-0.542]]
[[0.748]]
[[-0.7]]
[[1.789]]
[[1.243]]
[[-0.359]]
[[-0.701]]
[[1.788]]
[[0.918]]
[[-0.542]]
[[-0.542]]
[[1.787]]
[[1.787]]
[[-0.36]]
[[0.074]]
[[0.012]]
[[-0.703]]
[[0.502]]
[[1.786]]
[[-0.361]]
[[0.917]]
[[0.502]]
[[-0.362]]
[[0.502]]
[[0.073]]
[[1.785]]
[[1.784]]
[[0.501]]
[[1.784]]
[[1.784]]
[[-0.364]]
[[0.073]]
[[-0.827]]
[[1.783]]
[[-0.364]]
[[0.072]]
[[0.52]]
[[0.01]]
[[1.782]]
[[0.52]]
[[1.782]]
[[0.071]]
[[0.747]]
[[1.781]]
[[-0.366]]
[[1.781]]
[[-0.705]]
[[0.915]]
[[-0.705]]
[[1.78]]
[[0.07]]
[[0.518]]
[[0.009]]
[[-0.368]]
[[0.746]]
[[0.498]]
[[-0.544]]
[[-0.706]]
[[1.779]]
[[1.778]]
[[0.008]]
[[0.069]]
[[-0.369]]
[[0.517]]
[[0.498]]
[[0.745]]
[[0.068]]
[[1.242]

[[1.141]]
[[1.141]]
[[-0.552]]
[[0.459]]
[[-0.796]]
[[1.141]]
[[-1.1]]
[[0.98]]
[[-0.796]]
[[1.14]]
[[1.14]]
[[-2.064]]
[[0.451]]
[[0.238]]
[[0.459]]
[[1.139]]
[[1.139]]
[[0.45]]
[[0.459]]
[[-0.686]]
[[-0.796]]
[[0.458]]
[[1.138]]
[[1.138]]
[[-0.686]]
[[1.137]]
[[0.98]]
[[-1.101]]
[[-1.101]]
[[1.137]]
[[-2.065]]
[[1.136]]
[[0.309]]
[[0.449]]
[[1.136]]
[[0.449]]
[[0.237]]
[[1.135]]
[[1.135]]
[[-2.066]]
[[1.134]]
[[0.458]]
[[0.447]]
[[0.308]]
[[0.308]]
[[0.236]]
[[-0.696]]
[[-1.102]]
[[-0.554]]
[[-0.696]]
[[1.133]]
[[1.133]]
[[0.98]]
[[0.98]]
[[0.98]]
[[-2.067]]
[[0.307]]
[[-1.103]]
[[0.306]]
[[1.132]]
[[1.132]]
[[-0.796]]
[[0.306]]
[[-2.067]]
[[0.235]]
[[1.131]]
[[-0.688]]
[[0.305]]
[[-0.555]]
[[0.235]]
[[0.457]]
[[1.13]]
[[0.98]]
[[0.304]]
[[0.457]]
[[-0.688]]
[[-0.689]]
[[1.13]]
[[1.129]]
[[1.129]]
[[-0.796]]
[[-1.105]]
[[1.129]]
[[1.129]]
[[0.302]]
[[-0.69]]
[[0.234]]
[[0.234]]
[[1.128]]
[[-0.69]]
[[0.98]]
[[-0.555]]
[[1.127]]
[[-0.796]]
[[-0.796]]
[[0.457]]
[[-1.105]]
[[1.126]]
[[-0

[[1.255]]
[[-0.162]]
[[1.309]]
[[-0.939]]
[[0.009]]
[[1.255]]
[[-1.142]]
[[-0.234]]
[[1.308]]
[[0.138]]
[[0.009]]
[[0.138]]
[[-0.234]]
[[1.307]]
[[1.307]]
[[1.255]]
[[-0.94]]
[[1.307]]
[[0.95]]
[[-0.941]]
[[0.95]]
[[-1.483]]
[[-1.483]]
[[-0.236]]
[[0.008]]
[[1.306]]
[[0.008]]
[[0.225]]
[[-1.144]]
[[-0.942]]
[[1.305]]
[[1.255]]
[[0.225]]
[[-0.163]]
[[1.305]]
[[-1.483]]
[[-0.668]]
[[1.304]]
[[1.304]]
[[0.224]]
[[-1.144]]
[[-0.237]]
[[0.949]]
[[0.136]]
[[-1.484]]
[[1.303]]
[[-0.164]]
[[1.303]]
[[1.303]]
[[1.302]]
[[0.948]]
[[0.948]]
[[0.135]]
[[-0.238]]
[[1.301]]
[[-0.238]]
[[1.301]]
[[-1.145]]
[[-1.485]]
[[-0.165]]
[[-1.485]]
[[-0.239]]
[[1.3]]
[[-0.165]]
[[-0.239]]
[[1.299]]
[[-0.24]]
[[0.007]]
[[1.299]]
[[1.299]]
[[1.299]]
[[-0.241]]
[[-1.487]]
[[0.946]]
[[1.298]]
[[1.298]]
[[1.255]]
[[-0.943]]
[[1.297]]
[[1.255]]
[[0.006]]
[[1.255]]
[[1.255]]
[[0.223]]
[[0.134]]
[[-0.242]]
[[0.134]]
[[-0.243]]
[[-0.668]]
[[-1.146]]
[[-1.488]]
[[1.295]]
[[-1.489]]
[[1.295]]
[[-0.668]]
[[1.295]]
[[0.005

[[-1.01]]
[[0.799]]
[[0.259]]
[[-1.01]]
[[-0.306]]
[[0.589]]
[[-0.164]]
[[-0.164]]
[[0.798]]
[[-0.165]]
[[0.798]]
[[0.798]]
[[0.329]]
[[-1.511]]
[[0.329]]
[[-0.613]]
[[-0.307]]
[[0.797]]
[[-0.166]]
[[0.797]]
[[0.257]]
[[-1.185]]
[[0.797]]
[[-1.186]]
[[-1.186]]
[[0.352]]
[[-0.307]]
[[0.257]]
[[-1.187]]
[[-0.614]]
[[0.796]]
[[-0.167]]
[[-0.614]]
[[0.796]]
[[-0.614]]
[[0.256]]
[[0.796]]
[[0.256]]
[[0.351]]
[[-1.189]]
[[0.328]]
[[0.255]]
[[0.795]]
[[0.795]]
[[-0.684]]
[[-0.616]]
[[-1.512]]
[[0.794]]
[[-0.685]]
[[0.35]]
[[-0.685]]
[[0.327]]
[[-1.513]]
[[-1.011]]
[[0.794]]
[[0.588]]
[[-0.686]]
[[-1.012]]
[[-1.19]]
[[0.793]]
[[0.793]]
[[0.254]]
[[0.793]]
[[-0.308]]
[[0.349]]
[[-1.012]]
[[0.793]]
[[-0.168]]
[[-1.191]]
[[-1.013]]
[[0.348]]
[[0.792]]
[[-0.687]]
[[-1.192]]
[[-1.013]]
[[0.792]]
[[0.587]]
[[-1.192]]
[[0.791]]
[[-0.309]]
[[-1.193]]
[[0.587]]
[[0.326]]
[[-1.014]]
[[0.347]]
[[0.347]]
[[-1.515]]
[[-0.618]]
[[0.791]]
[[-1.516]]
[[0.79]]
[[0.79]]
[[-1.516]]
[[0.586]]
[[0.79]]
[[0.586]]
[

[[1.823]]
[[1.823]]
[[0.115]]
[[0.148]]
[[-1.655]]
[[-1.136]]
[[-1.137]]
[[1.333]]
[[1.822]]
[[-0.815]]
[[0.114]]
[[0.359]]
[[0.257]]
[[0.114]]
[[1.332]]
[[-0.816]]
[[1.821]]
[[-1.656]]
[[0.906]]
[[1.331]]
[[1.821]]
[[1.483]]
[[-0.817]]
[[1.821]]
[[0.146]]
[[1.168]]
[[1.168]]
[[-0.817]]
[[1.82]]
[[-1.138]]
[[1.167]]
[[-0.818]]
[[1.82]]
[[0.146]]
[[0.146]]
[[-0.237]]
[[0.532]]
[[0.724]]
[[0.828]]
[[0.724]]
[[0.828]]
[[-0.237]]
[[-0.237]]
[[-0.902]]
[[0.828]]
[[-1.099]]
[[0.827]]
[[0.721]]
[[0.72]]
[[-0.902]]
[[-0.118]]
[[-1.272]]
[[0.532]]
[[-1.272]]
[[-0.687]]
[[-1.083]]
[[0.823]]
[[0.532]]
[[-0.083]]
[[-0.237]]
[[0.821]]
[[0.82]]
[[-1.272]]
[[0.532]]
[[-0.906]]
[[0.716]]
[[-1.272]]
[[-0.687]]
[[0.817]]
[[-1.272]]
[[-1.103]]
[[0.714]]
[[-1.104]]
[[-0.908]]
[[-1.087]]
[[-1.272]]
[[0.532]]
[[-1.088]]
[[-0.237]]
[[0.814]]
[[-0.221]]
[[0.813]]
[[-0.118]]
[[-1.091]]
[[-1.091]]
[[-0.237]]
[[-0.911]]
[[-0.911]]
[[-0.237]]
[[-0.912]]
[[0.811]]
[[0.811]]
[[-0.913]]
[[-1.095]]
[[-1.095]]
[[-1.09

[[-0.644]]
[[-0.194]]
[[0.379]]
[[1.134]]
[[-0.694]]
[[-0.694]]
[[-0.694]]
[[-0.479]]
[[-0.56]]
[[0.07]]
[[1.132]]
[[1.132]]
[[-0.697]]
[[-0.644]]
[[1.131]]
[[0.379]]
[[-0.276]]
[[0.943]]
[[1.13]]
[[0.379]]
[[-0.562]]
[[1.129]]
[[1.128]]
[[1.128]]
[[0.942]]
[[1.127]]
[[-0.481]]
[[-0.198]]
[[-0.481]]
[[-0.481]]
[[-0.199]]
[[-0.199]]
[[-0.701]]
[[-0.702]]
[[1.123]]
[[-0.276]]
[[1.122]]
[[-0.277]]
[[-0.703]]
[[0.065]]
[[1.121]]
[[-0.564]]
[[-0.034]]
[[1.12]]
[[1.12]]
[[1.119]]
[[-1.282]]
[[-0.036]]
[[-0.705]]
[[-0.705]]
[[1.118]]
[[-0.565]]
[[-0.038]]
[[0.938]]
[[-0.644]]
[[1.116]]
[[-0.203]]
[[1.116]]
[[1.115]]
[[-0.203]]
[[-0.041]]
[[-0.204]]
[[-0.644]]
[[0.379]]
[[-0.566]]
[[0.936]]
[[1.113]]
[[-0.485]]
[[-0.707]]
[[0.935]]
[[-0.485]]
[[-0.206]]
[[-1.282]]
[[1.112]]
[[-0.045]]
[[-0.207]]
[[1.111]]
[[-0.644]]
[[1.111]]
[[1.11]]
[[-0.568]]
[[-0.487]]
[[-0.708]]
[[-0.047]]
[[1.109]]
[[-0.488]]
[[0.379]]
[[-0.488]]
[[-0.048]]
[[-0.644]]
[[-0.569]]
[[1.108]]
[[1.107]]
[[1.107]]
[[0.379]]
[[

[[-2.296]]
[[-0.064]]
[[-0.379]]
[[-2.409]]
[[-0.289]]
[[-1.027]]
[[-1.238]]
[[-1.027]]
[[-0.289]]
[[-0.064]]
[[-0.064]]
[[-0.065]]
[[-0.29]]
[[-1.283]]
[[-2.41]]
[[-0.065]]
[[-0.065]]
[[-1.239]]
[[-1.239]]
[[-0.291]]
[[-0.291]]
[[-1.283]]
[[-0.118]]
[[-2.297]]
[[-1.24]]
[[-0.065]]
[[-1.028]]
[[-0.118]]
[[-0.292]]
[[-0.118]]
[[-0.801]]
[[-0.801]]
[[-0.293]]
[[-0.119]]
[[-0.066]]
[[-1.241]]
[[-0.12]]
[[-1.285]]
[[-0.066]]
[[-0.38]]
[[-0.746]]
[[-0.746]]
[[-0.067]]
[[-1.242]]
[[-0.067]]
[[-0.067]]
[[-0.121]]
[[-0.067]]
[[-0.067]]
[[-1.242]]
[[-1.243]]
[[-1.243]]
[[-0.068]]
[[-0.295]]
[[-1.244]]
[[-0.068]]
[[-1.244]]
[[-0.295]]
[[-1.029]]
[[-1.029]]
[[-0.381]]
[[-1.029]]
[[-2.298]]
[[-0.069]]
[[-2.298]]
[[-0.069]]
[[-0.803]]
[[-0.382]]
[[-0.746]]
[[-0.069]]
[[-2.411]]
[[-1.286]]
[[-2.299]]
[[-1.286]]
[[-0.07]]
[[-2.411]]
[[-0.07]]
[[-0.804]]
[[-2.299]]
[[-0.069]]
[[-2.3]]
[[-1.03]]
[[-1.247]]
[[-2.3]]
[[-0.071]]
[[-0.07]]
[[-0.07]]
[[-0.07]]
[[-0.07]]
[[-1.287]]
[[-0.383]]
[[-0.072]]
[[-1

[[-0.083]]
[[1.616]]
[[-0.621]]
[[1.615]]
[[-0.335]]
[[0.746]]
[[1.615]]
[[-0.403]]
[[-0.622]]
[[-0.755]]
[[0.96]]
[[-0.335]]
[[0.96]]
[[1.614]]
[[0.825]]
[[-0.336]]
[[0.745]]
[[-0.062]]
[[1.614]]
[[-0.062]]
[[1.614]]
[[1.613]]
[[0.744]]
[[0.619]]
[[-0.756]]
[[0.744]]
[[-0.063]]
[[-0.084]]
[[-0.063]]
[[0.743]]
[[1.31]]
[[-0.622]]
[[0.825]]
[[-0.337]]
[[1.612]]
[[-0.064]]
[[1.612]]
[[-0.085]]
[[-0.065]]
[[1.612]]
[[-0.756]]
[[-0.622]]
[[0.741]]
[[-0.623]]
[[1.309]]
[[-0.404]]
[[1.611]]
[[-0.757]]
[[0.618]]
[[-0.757]]
[[0.958]]
[[-0.757]]
[[0.618]]
[[1.611]]
[[-0.085]]
[[0.824]]
[[-0.438]]
[[-1.147]]
[[-0.161]]
[[-0.797]]
[[2.072]]
[[2.072]]
[[-2.066]]
[[-0.996]]
[[-1.078]]
[[2.072]]
[[-0.996]]
[[-1.078]]
[[-0.996]]
[[2.072]]
[[2.072]]
[[-0.293]]
[[-0.165]]
[[2.072]]
[[-0.165]]
[[2.072]]
[[-0.166]]
[[-1.861]]
[[-0.167]]
[[-0.239]]
[[2.07]]
[[-1.078]]
[[2.068]]
[[2.068]]
[[-0.996]]
[[2.066]]
[[-0.297]]
[[-1.078]]
[[-0.297]]
[[2.064]]
[[-1.152]]
[[-0.239]]
[[-0.299]]
[[-0.438]]
[[-0.797]]


KeyboardInterrupt: 

In [145]:
%%notify 
print('done')

done


<IPython.core.display.Javascript object>

# Reference 
1. numpy.set_printoptions — NumPy v1.14 Manual. (2019). Docs.scipy.org. Retrieved 13 January 2019, from https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.set_printoptions.html
2. [ Archived Post ] Random Note about Multi-Arm Bandit Problem 2. (2019). Medium. Retrieved 13 January 2019, from https://medium.com/@SeoJaeDuk/archived-post-random-note-about-multi-arm-bandit-problem-2-5c522d1dfbdc
