# Assignment 2: The Exploration-Exploitation Dilemma

## 1 - Stochastic Multi-Armed Bandits on Simulated Data

### 1.1 Bernoulli bandit models

In [7]:
# Imports 
import numpy as np 
import arms

In [8]:
# Build our own Bernoulli bandit model with K = 4 arms of means p1, ..., pK.

# A single seed is drawn at random and handed to every arm.
# NOTE(review): all four arms receive the same `random_state`; if
# ArmBernoulli seeds a private generator from it, the arms' sample streams
# may be correlated -- confirm against the `arms` module.
rs = np.random.randint(1, 312414)

# The arms are created in the order arm1..arm4; the tuple lists their means.
arm1, arm2, arm3, arm4 = (
    arms.ArmBernoulli(p, random_state=rs) for p in (0.50, 0.35, 0.40, 0.55)
)

# The bandit model is the ordered list of its arms (list index = arm index).
MAB = [arm1, arm2, arm3, arm4]


#### Question 1:

In [22]:
def sampled_reward(arm):
    """Draw one reward from ``arm`` and return it as an ``int``.

    Parameters
    ----------
    arm : arms.ArmBernoulli or other
        An ``arms.ArmBernoulli`` instance is sampled directly.  Any other
        value is passed to ``arms.ArmBernoulli`` as its first argument and
        the resulting temporary arm is sampled instead.
        NOTE(review): this assumes ``arm`` is then a success probability,
        like the first argument used when the bandit's arms are built --
        confirm against the callers.

    Returns
    -------
    int
        The sampled reward, converted with ``int``.
    """
    if isinstance(arm, arms.ArmBernoulli):
        return int(arm.sample())
    # The reward is a draw from the temporary arm, not the arm object itself,
    # so it has to be sampled before the conversion to int.
    # NOTE(review): no random_state is passed here, so the temporary arm uses
    # whatever default seeding the `arms` module applies -- confirm that
    # repeated calls do not always yield the same draw.
    return int(arms.ArmBernoulli(arm).sample())
    

In [27]:
def UCB1(T, MAB, ro=0.25):
    """Simulate a bandit game of length ``T`` with the UCB1 strategy.

    Parameters
    ----------
    T : int
        Horizon, i.e. total number of rounds to play.  A value ``<= 0``
        plays no round at all.
    MAB : sequence
        The bandit model: one arm per entry, each accepted by
        ``sampled_reward``.
    ro : float, optional
        Multiplier applied to the exploration bonus
        ``sqrt(log(t) / (2 * n_a))`` added to each arm's empirical mean.

    Returns
    -------
    rew : list
        The rewards obtained, one per round played.
    draws : list of int
        The index (into ``MAB``) of the arm drawn at each round.

    Raises
    ------
    ValueError
        If ``MAB`` is empty while at least one round has to be played.
    """
    # nbrA : number of arms
    nbrA = len(MAB)
    if nbrA == 0 and T > 0:
        raise ValueError("MAB must contain at least one arm")

    # Rewards obtained and arms drawn, in playing order
    rew = []
    draws = []
    # Per-arm cumulated reward and number of draws
    sum_rew = [0] * nbrA
    n_draws = [0] * nbrA

    def pull(index):
        """Draw arm ``index`` once and record the outcome."""
        reward = sampled_reward(MAB[index])
        n_draws[index] += 1
        sum_rew[index] += reward
        draws.append(index)
        rew.append(reward)

    # Initialisation phase: play each arm once so every empirical mean is
    # defined.  It is capped at T rounds so that no more than T results are
    # returned when the horizon is shorter than the number of arms.
    for i in range(min(nbrA, T)):
        pull(i)

    # Remaining rounds up to the horizon
    for t in range(nbrA, T):
        # Optimistic score of each arm: empirical mean + exploration bonus
        optimistic_scores = [
            sum_rew[a] / n_draws[a]
            + ro * np.sqrt(np.log(t) / (2 * n_draws[a]))
            for a in range(nbrA)
        ]
        # Play the arm with the highest score (first one in case of a tie);
        # cast to a plain int so `draws` holds a single index type.
        pull(int(np.argmax(optimistic_scores)))
    return rew, draws

In [28]:
def TS(T,MAB):
    """Simulate a bandit game of length ``T`` with Thompson Sampling.

    At every round one value is drawn per arm from a
    ``Beta(S + 1, N - S + 1)`` distribution, where ``S`` is the arm's
    cumulated reward and ``N`` its number of draws so far; the arm with
    the largest drawn value is played.

    T : horizon (number of rounds)
    MAB : the bandit model, a sequence of arms accepted by ``sampled_reward``

    Returns ``(rew, draws)``: the list of rewards obtained and the list of
    the indices of the arms drawn, one entry per round.
    """
    n_arms = len(MAB)
    # Per-arm statistics: cumulated reward and number of pulls
    successes = [0] * n_arms
    pulls = [0] * n_arms
    # History of the game
    rew, draws = [], []

    for _ in range(T):
        # One posterior sample per arm, drawn in arm order
        posterior_samples = []
        for k in range(n_arms):
            failures = pulls[k] - successes[k]
            posterior_samples.append(
                np.random.beta(successes[k] + 1, failures + 1)
            )

        # Play the arm whose sample is the largest
        chosen = np.argmax(posterior_samples)
        outcome = sampled_reward(MAB[chosen])

        # Update the statistics of the played arm and the history
        pulls[chosen] = pulls[chosen] + 1
        successes[chosen] = successes[chosen] + outcome
        draws.append(chosen)
        rew.append(outcome)

    return rew, draws

In [29]:
"""Simulating a bandit game of length T with the UCB1 and Thompson Sampling
strategy on the bandit model MAB: rew and draws are the sequence of the
T rewards obtained and of the T the arms drawn."""
# Number of rounds played by each strategy.
T = 5000  # horizon

# One UCB1 game of length T on MAB (default `ro`), then its regret curve:
# after t rounds the regret is mu_max * t minus the cumulated reward.
# NOTE(review): `mu_max` is not defined in the visible part of this file --
# presumably the largest arm mean; confirm it is set before this cell runs.
rew1, draws1 = UCB1(T, MAB)
reg1 = mu_max * np.arange(1, T + 1) - np.cumsum(rew1)
# One Thompson Sampling game of the same length, with the same regret formula.
rew2, draws2 = TS(T, MAB)
reg2 = mu_max * np.arange(1, T + 1) - np.cumsum(rew2) 

[2,
 2,
 1,
 1,
 0,
 1,
 0,
 3,
 2,
 2,
 0,
 1,
 3,
 3,
 1,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 3,
 0,
 0,
 2,
 2,
 3,
 0,
 3,
 2,
 3,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 0,
 3,
 3,
 3,
 3,
 3,
 2,
 0,
 2,
 3,
 3,
 0,
 0,
 3,
 3,
 0,
 3,
 3,
 0,
 0,
 3,
 3,
 3,
 3,
 3,
 2,
 3,
 0,
 0,
 3,
 3,
 0,
 3,
 1,
 3,
 3,
 2,
 3,
 3,
 2,
 3,
 3,
 2,
 3,
 2,
 3,
 0,
 3,
 3,
 3,
 3,
 3,
 2,
 3,
 1,
 0,
 3,
 3,
 3,
 0,
 0,
 2,
 0,
 1,
 0,
 3,
 3,
 3,
 1,
 0,
 0,
 0,
 3,
 2,
 3,
 3,
 3,
 3,
 0,
 3,
 0,
 3,
 2,
 0,
 3,
 0,
 3,
 0,
 3,
 0,
 3,
 0,
 2,
 3,
 0,
 3,
 3,
 3,
 0,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 0,
 0,
 3,
 0,
 3,
 0,
 0,
 0,
 2,
 3,
 0,
 0,
 3,
 0,
 3,
 0,
 3,
 3,
 3,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 2,
 0,
 0,
 3,
 0,
 3,
 0,
 0,
 0,
 3,
 0,
 3,
 3,
 0,
 0,
 0,
 3,
 3,
 0,
 3,
 0,
 0,
 3,
 3,
 3,
 3,
 2,
 3,
 3,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 3,
 2,
 3,
