# $k$-armed Bandit

## Notation
$k$: number of actions <br>
$t$: discrete time step <br>
$a$: action <br>
$A_t$: action selected at time $t$ <br>
$q_*(a)$: true value (expected reward) of action $a$ <br>
$Q_t(a)$: estimate at time $t$ of $q_*(a)$ <br>
$N_t(a)$: number of times action $a$ has been selected up to time $t$ <br>
$R_t(a)$: reward received at time $t$

In [325]:
# Imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [316]:
# Inits
def k_arms_init(k):
    """Build the zero-initialised tables for a bandit with ``k`` arms.

    Arms are labelled with the strings ``'1'`` .. ``'k'`` (1-based), in
    ascending order.

    Returns:
        A ``(Q_t, N_t)`` tuple of two independent dicts keyed by arm label:
        ``Q_t`` holds the value estimates and ``N_t`` the pull counts, all
        starting at ``0``.
    """
    arm_labels = [str(arm_number) for arm_number in range(1, k + 1)]
    value_estimates = dict.fromkeys(arm_labels, 0)
    pull_counts = dict.fromkeys(arm_labels, 0)
    return value_estimates, pull_counts

def random_action(epsilon):
    """Decide whether to explore on this step.

    Draws one uniform sample from ``[0, 1)`` via ``np.random.random`` and
    returns whether it is strictly below ``epsilon``.
    """
    draw = np.random.random()
    return draw < epsilon
def sucess(probability):
    """Sample a single success/failure outcome.

    Draws one uniform sample from ``[0, 1)`` via ``np.random.random`` and
    reports success when that sample is strictly below ``probability``.
    """
    return probability > np.random.random()
    

In [327]:
# Interactive k-armed bandit plot
@interact(k = (1,10,1), N = (1,10000,100), epsilon = (0.01,0.5,0.05))
def k_armed_bandit(k,N,epsilon):
    """Run an epsilon-greedy agent on a k-armed Bernoulli bandit and plot it.

    Parameters
    ----------
    k : int
        Number of arms to use (the first ``k`` entries of the hard-coded
        success-probability table; the table has 10 entries).
    N : int
        Number of time steps to simulate.
    epsilon : float
        Threshold passed to ``random_action``; when that returns true a
        uniformly random arm is pulled instead of the greedy one.

    The function returns nothing; it draws a scatter plot comparing the
    final value estimates with the true success probabilities.
    """
    # Fixed seed: every slider setting produces a reproducible run.
    np.random.seed(123)
    Q_t, N_t = k_arms_init(k)
    sucess_prob = [0.10, 0.50, 0.60, 0.80, 0.10, 0.25, 0.60, 0.45, 0.75, 0.65]

    for _ in range(N):
        # Truthiness test instead of `is True`, so the branch also works if
        # the helper ever returns a NumPy bool rather than a Python bool.
        if random_action(epsilon):
            # Explore: uniform arm in 1..k. Index the 1-element array
            # explicitly; int() on a non-0-d array is deprecated in NumPy.
            idx = int(np.random.randint(low=1, high=k+1, size=1)[0])
        else:
            # Exploit: arm with the highest current estimate (ties resolve
            # to the first such arm in insertion order).
            idx = int(max(Q_t, key=Q_t.get))

        arm = '{}'.format(idx)
        N_t[arm] += 1
        # Bernoulli reward as a float so the update below never falls back
        # to integer division.
        reward = float(sucess(sucess_prob[idx-1]))
        # Incremental sample-average update: Q <- Q + (R - Q) / N.
        Q_t[arm] += (reward - Q_t[arm]) / N_t[arm]

    # Plot both series against the same arm labels so estimates and true
    # probabilities line up, without mixing numeric x values into a
    # categorical (string) axis.
    arms = list(Q_t)
    plt.scatter(arms, [Q_t[arm] for arm in arms], c='k', label='Estimated value')
    plt.scatter(arms, sucess_prob[:k], marker='x', label='True success Pr')
    plt.xlabel('Actions')
    plt.ylabel('Reward')
    plt.ylim(0,1)
    plt.legend()
    plt.show()


interactive(children=(IntSlider(value=5, description='k', max=10, min=1), IntSlider(value=4901, description='N…