In [None]:
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Base class for the bandit
class Bandit(object):
    """Epsilon-greedy agent acting on a multi-armed bandit.

    The agent keeps one value estimate (``Q``) and one pull counter (``N``)
    per arm. Each call to :meth:`pull` picks an arm and returns a noisy
    reward; :meth:`updateMean` then folds that reward into the estimate of
    the arm that was just pulled.

    Parameters
    ----------
    numArms : int
        Number of arms.
    trueRewards : sequence of float
        Mean reward of each arm, indexed by arm number.
    epsilon : float
        Probability threshold for taking a uniformly random arm instead of
        the greedy one.
    mode : str
        ``'sample-average'`` uses a step size of ``1/N[arm]``;
        ``'constant'`` uses the fixed step size ``alpha``. Any other value
        leaves the estimates untouched (only the counters advance).
    alpha : float, optional
        Step size used when ``mode == 'constant'``. Defaults to 0.1.
    """

    def __init__(self, numArms, trueRewards, epsilon, mode, alpha=0.1):
        self.Q = [0 for i in range(numArms)]   # current reward estimate per arm
        self.N = [0 for i in range(numArms)]   # number of updates per arm
        self.numArms = numArms
        self.epsilon = epsilon
        self.trueRewards = trueRewards
        self.lastAction = None                 # arm chosen by the latest pull()
        self.mode = mode
        self.alpha = alpha

    def pull(self):
        """Choose an arm epsilon-greedily and return a sampled reward.

        The chosen arm is stored in ``self.lastAction``. The reward is a
        standard-normal draw shifted by the arm's true mean reward.
        """
        rand = np.random.random()              # uniform draw in [0, 1)
        if rand <= self.epsilon:
            # Explore: pick any arm uniformly at random.
            whichArm = np.random.choice(self.numArms)
        else:
            # Exploit: pick the arm with the highest estimate. Using a plain
            # `else` guarantees an arm is always selected, even when the
            # comparison above is False for a reason other than
            # rand > epsilon (e.g. a NaN epsilon).
            a = np.array(self.Q)
            # Collect every arm tied for the maximum and break ties randomly.
            whichArm = np.random.choice(np.where(a == a.max())[0])

        self.lastAction = whichArm             # remembered for updateMean()
        # Normally distributed reward centred on the arm's true reward.
        return np.random.randn() + self.trueRewards[whichArm]

    def updateMean(self, sample):
        """Update the estimate of the last pulled arm with ``sample``.

        Must be called after :meth:`pull`, since it indexes the estimates
        with ``self.lastAction`` (which starts out as ``None``).
        """
        whichArm = self.lastAction
        self.N[whichArm] += 1

        if self.mode == 'sample-average':
            # Incremental form of the running mean: step size 1/N.
            self.Q[whichArm] = self.Q[whichArm] + 1.0/self.N[whichArm] * (sample - self.Q[whichArm])
        elif self.mode == 'constant':
            # Constant step size: exponential recency-weighted average.
            self.Q[whichArm] = self.Q[whichArm] + self.alpha * (sample - self.Q[whichArm])
            
            
def simulate(numArms, epsilon, numPulls, mode, numRuns=2000):
    """Average the per-step reward of an epsilon-greedy bandit over many runs.

    Each run creates a fresh bandit whose true arm rewards are drawn from a
    standard normal distribution, then pulls it ``numPulls`` times, updating
    the bandit's estimates after every pull.

    Parameters
    ----------
    numArms : int
        Number of arms of each bandit.
    epsilon : float
        Exploration parameter forwarded to ``Bandit``.
    numPulls : int
        Number of pulls per run.
    mode : str
        Update rule forwarded to ``Bandit``.
    numRuns : int, optional
        Number of independent runs to average over. Defaults to 2000.

    Returns
    -------
    numpy.ndarray
        Array of length ``numPulls``; entry ``i`` is the reward obtained on
        pull ``i`` averaged over all runs.

    Raises
    ------
    ValueError
        If ``numRuns`` is not positive (the average would be undefined).
    """
    if numRuns <= 0:
        raise ValueError("numRuns must be a positive integer")

    rewardHistory = np.zeros(numPulls)
    for j in range(numRuns):
        if j % 200 == 0:
            print(j)   # coarse progress indicator
        # Size the reward vector from the numArms argument rather than from
        # a notebook-level global, so the function is self-contained.
        rewards = [np.random.randn() for _ in range(numArms)]
        bandit = Bandit(numArms, rewards, epsilon, mode)
        for i in range(numPulls):
            reward = bandit.pull()
            bandit.updateMean(reward)
            rewardHistory[i] += reward   # accumulate per-step reward across runs
    average = rewardHistory / numRuns
    return average

In [None]:
# Compare the two value-update rules on 10-armed bandits with the same
# exploration rate, then plot the average reward obtained at each pull.
numActions = 10   # number of arms in every bandit problem
run1 = simulate(numActions, epsilon=0.1, numPulls=1000, mode='sample-average')
run2 = simulate(numActions, epsilon=0.1, numPulls=1000, mode='constant')

# Explicit figure/axes with labelled lines: the legend entries are tied to
# their curves by label rather than by the order of the plot arguments, and
# the figure carries its own axis labels and title.
fig, ax = plt.subplots()
ax.plot(run1, 'b--', label='sample-average')
ax.plot(run2, 'r--', label='constant alpha')
ax.set_xlabel('Pull')
ax.set_ylabel('Average reward')
ax.set_title('Epsilon-greedy bandit, epsilon = 0.1')
ax.legend()
plt.show()