In [None]:
# Imports
%matplotlib inline

import sys
import numpy as np
import scipy
import scipy.stats as stats
from scipy.stats import binom
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Global notebook configuration.
# Seed NumPy's legacy global RNG (the one np.random.binomial draws from),
# then select seaborn's 'darkgrid' style for the figures.
np.random.seed(4)
sns.set_style('darkgrid')

In [None]:
class IndependentAgent:
    """UCB1-style agent playing a Bernoulli multi-armed bandit.

    Each arm ``i`` pays 1 with probability ``R[i]`` and 0 otherwise. The agent
    pulls every arm once, then always pulls the arm with the largest upper
    confidence index ``min(1, hat_R[i] + sqrt(2 * ln(n) / N[i]))`` where ``n``
    is the current round (1-based) and ``N[i]`` the number of pulls of arm ``i``.

    Alongside the run it records the pseudo-regret ``sum_i delta_i * N_i`` and
    the UCB1 logarithmic upper bound so that both can be plotted together.
    """

    def __init__(self, T, R):
        """Create an agent.

        Parameters
        ----------
        T : int
            Total number of iterations (pulls) performed by ``start``.
        R : sequence of float
            True success probability of each arm; its length fixes the number
            of arms.
        """
        self.T = T  # total amount of iterations
        self.R = R  # true Bernoulli success probability of each arm
        self.n_arms = len(R)
        # Sub-optimality gap of each arm w.r.t. the best arm (0 for the best arm).
        # Computed once here because it only depends on R.
        means = np.asarray(R, dtype=float)
        self.delta = means.max() - means if means.size else means
        self.N = np.zeros(self.n_arms)  # number of times an arm has been pulled
        self.U = np.ones(self.n_arms)  # upper bound for each arm
        self.cum_r = np.zeros(self.n_arms)  # summation of all the reward retrieved
        self.hat_R = np.zeros(self.n_arms)  # estimation of the reward, simple average
        self.B = np.zeros(self.n_arms)  # confidence bound (radius) of each arm
        self.ind = np.zeros(self.T)  # index of the selected arm for each iteration
        self.rewards = np.zeros(self.T)  # reward obtained at each iteration
        self.R_T = np.zeros(self.T)  # cumulative pseudo-regret after each iteration
        self.arms_label = ['arm_'+str(i) for i in range(1, self.n_arms+1)]  # arms label used to plot charts
        self.UB = []  # UCB1 upper bound, Auer and Cesa-Bianchi

    def _refresh_estimates(self, n_plays):
        """Recompute ``hat_R`` and ``B`` for every arm pulled at least once.

        ``n_plays`` is the 1-based round number used inside the logarithm.
        Arms with ``N == 0`` are skipped so that no 0/0 or x/0 is evaluated;
        they keep their previous (initially zero) values.
        """
        pulled = self.N > 0
        self.hat_R[pulled] = self.cum_r[pulled] / self.N[pulled]
        self.B[pulled] = np.sqrt(2 * np.log(n_plays) / self.N[pulled])

    def compute_upper_bound(self, t):
        """Append the UCB1 regret bound for 0-based iteration ``t`` to ``self.UB``.

        The value is ``8 * sum_i(1 / delta_i) * ln(t + 1) + (1 + pi^2 / 3) * sum_i(delta_i)``
        with both sums taken over the arms whose gap ``delta_i`` is non-zero.
        When all arms are equally good the bound is 0.
        """
        gaps = self.delta[self.delta > 0]
        self.UB.append(8 * np.sum(1 / gaps) * np.log(t+1) + (1 + np.pi**2/3) * np.sum(gaps))

    def start(self):
        """Run the bandit for ``T`` iterations, filling every tracking array.

        One Bernoulli reward is drawn from NumPy's global RNG per iteration.
        """
        for t in range(self.T):
            self._refresh_estimates(t + 1)

            if t < self.n_arms:
                # Initialisation phase: pull every arm once, in order.
                pulled_arm = t
            else:
                # Rewards live in [0, 1], so the index is clipped at 1.
                # np.argmax resolves ties in favour of the lowest arm index.
                np.minimum(1, self.hat_R + self.B, out=self.U)
                pulled_arm = np.argmax(self.U)

            outcome = np.random.binomial(1, self.R[pulled_arm])
            self.rewards[t] = outcome

            # update of the statistics
            self.ind[t] = pulled_arm
            self.N[pulled_arm] += 1
            self.cum_r[pulled_arm] += outcome
            # Pseudo-regret uses the true gaps and the pull counts, not the
            # noisy realised rewards, so it is non-negative and non-decreasing.
            self.R_T[t] = np.dot(self.delta, self.N)
            self.compute_upper_bound(t)

        if self.T > 0:
            # Fold the very last pull into the estimates shown by plot_arm_view.
            self._refresh_estimates(self.T)

    def plot_regret(self):
        """Plot the cumulative pseudo-regret together with the UCB1 upper bound."""
        plt.plot(self.R_T, 'r', label='Pseudo Regret')
        plt.plot(self.UB, 'b', label='UCB1 Upper Bound')
        plt.ylabel('Cumulated Regret')
        plt.xlabel('iterations')
        plt.legend()
        plt.show()

    def plot_arm_view(self):
        """Plot, per arm, the estimated reward (error bar of ``B / 2``) and the true reward."""
        plt.ylim([-0.2, 1.2])
        # Categorical labels sit at x = 0 .. n_arms - 1; leave one unit of margin on each side.
        plt.xlim([-1, self.n_arms])
        plt.scatter(self.arms_label, self.hat_R)
        plt.errorbar(self.arms_label, self.hat_R, yerr=self.B/2, fmt='o', label='estimated reward')
        plt.scatter(self.arms_label, self.R, label='true reward')
        plt.title('Estimated Reward')
        plt.legend()
        plt.show()

In [None]:
# Experiment: 1000 iterations on four Bernoulli arms with the success
# probabilities listed below, followed by the regret curve and the
# per-arm estimated-vs-true reward chart.
agent1 = IndependentAgent(T=1000, R=[0.2, 0.3, 0.7, 0.5])
agent1.start()
agent1.plot_regret()
agent1.plot_arm_view()