# Contextual Multi-armed bandit algorithms

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math

# Seed Python's built-in `random` module, which `ind_max` uses to break ties.
# NOTE(review): this does not seed NumPy's global RNG (`np.random`), which the
# simulations use for contexts and reward noise -- confirm whether runs are
# meant to be reproducible.
random.seed(0)

In [None]:
def ind_max(temp):
    """Return the position of the largest value in ``temp``.

    When several positions share the maximum, one of them is picked
    uniformly at random with ``random.choice``.

    Args:
        temp: Indexable collection of comparable values supporting ``len``.

    Returns:
        An integer position ``i`` such that ``temp[i] == max(temp)``.
    """
    best = max(temp)
    # Collect every position that ties for the maximum so the tie-break is fair.
    candidates = [idx for idx in range(len(temp)) if temp[idx] == best]
    return random.choice(candidates)

In [None]:
def generate_reward(arm, x, true_theta):
    """Draw a noisy linear reward for context ``x``.

    The reward is ``true_theta . x`` plus a single Gaussian noise draw with
    standard deviation 0.01 taken from NumPy's global RNG.

    Args:
        arm: Unused; the reward depends only on ``x`` and ``true_theta``.
        x: Context vector to score.
        true_theta: Parameter array; the callers in this file pass a
            ``(1, n_features)`` array so the product has one element.

    Returns:
        The first element of the noisy product.
    """
    expected = true_theta.dot(x)
    noise = np.random.normal(scale=0.01)
    noisy = expected + noise
    return noisy[0]

# Algorithms
## Lin UCB

In [None]:
def lin_UCB(alpha, n_simulation, n_trials, n_arms, n_features):
    """Simulate a LinUCB-style policy and log every step.

    A single linear model (matrix ``A``, vector ``b``) is shared by all arms
    and reset at the start of each simulation. At every trial each arm's
    context is scored as ``theta . x + alpha * sqrt(x^T A^-1 x)`` and the
    highest-scoring arm is played (ties broken by ``ind_max``).

    Args:
        alpha: Multiplier on the ``sqrt(x^T A^-1 x)`` exploration term.
        n_simulation: Number of independent simulations.
        n_trials: Number of trials per simulation.
        n_arms: Number of arms (contexts) offered per trial.
        n_features: Context dimension. The hard-coded ``true_theta`` has 10
            entries, so this presumably has to be 10 for the reward's dot
            product to be well-formed.

    Returns:
        A ``pandas.DataFrame`` with one row per (simulation, trial) and
        columns ``Sim`` (1-based), ``T`` (1-based), ``ChosenArm``, ``Reward``
        and ``CumulativeReward`` (running sum within the simulation).
    """
    total_steps = n_simulation * n_trials

    # Flat, pre-sized logs; the slot for (sim, t) is (sim - 1) * n_trials + t.
    sim_nums = [0.0] * total_steps
    times = [0.0] * total_steps
    chosen_arms = [0.0] * total_steps
    rewards = [0.0] * total_steps
    cumulative_rewards = [0.0] * total_steps

    true_theta = np.array([[ 0.34, -0.12, -0.16,  0.32,  0.03,  0.19, -0.31, -0.70, 0.62,  0.19]])

    for sim in range(1, n_simulation + 1):
        offset = (sim - 1) * n_trials

        # Fresh uniform contexts, indexed as X[trial, arm].
        X = np.array([
            [np.random.uniform(size=n_features) for _ in range(n_arms)]
            for _ in range(n_trials)
        ])
        # Model state: A starts at the identity, b at zero.
        A = np.identity(n_features)
        b = np.zeros(shape=(n_features, 1))

        for t in range(n_trials):
            index = offset + t
            sim_nums[index] = sim
            times[index] = t + 1

            # Current point estimate of the parameter vector.
            inv_A = np.linalg.inv(A)
            theta = inv_A.dot(b.reshape(n_features,))

            if t % 100 == 99:
                print('simulation number : ', sim)
                print(t,' : theta : ', np.round(theta,3))

            # Upper-confidence score for each arm's context.
            scores = [
                theta.dot(X[t, a])
                + alpha * np.sqrt(X[t, a].dot(inv_A).dot(X[t, a]))
                for a in range(n_arms)
            ]
            chosen_arm = ind_max(scores)
            context_t = X[t, chosen_arm]

            # The reward depends only on the chosen context; `arm` is ignored.
            reward = generate_reward(arm=0, x=context_t, true_theta=true_theta)
            rewards[index] = reward
            cumulative_rewards[index] = (
                reward if t == 0 else cumulative_rewards[index - 1] + reward
            )
            chosen_arms[index] = chosen_arm

            # Update the model with the observed (context, reward) pair.
            column = np.reshape(context_t, (-1, 1))
            A += column.dot(column.T)
            b += reward * column

    # Five row-lists transposed into five columns.
    result = pd.DataFrame(
        [sim_nums, times, chosen_arms, rewards, cumulative_rewards]
    ).transpose()
    result.columns = ["Sim", "T", "ChosenArm", "Reward", "CumulativeReward"]
    return result

## Thompson Sampling with linear payoffs

In [None]:
def lin_TS(delta, R, EPSILON, n_simulation, n_trials, n_arms, n_features):
    """Simulate Thompson Sampling with a linear payoff model and log every step.

    A single linear model (matrix ``A``, vector ``b``) is shared by all arms
    and reset at the start of each simulation. At every trial a parameter
    vector is sampled from ``N(A^-1 b, v^2 A^-1)``, each arm's context is
    scored by its dot product with that sample, and the highest-scoring arm
    is played (ties broken by ``ind_max``).

    Args:
        delta: Enters the sampling scale through ``log(1 / delta)``.
        R: Multiplicative factor of the sampling scale ``v``.
            NOTE(review): presumably the noise-scale constant of the
            Agrawal & Goyal analysis -- confirm.
        EPSILON: Enters the sampling scale through ``24 / EPSILON``.
        n_simulation: Number of independent simulations.
        n_trials: Number of trials per simulation.
        n_arms: Number of arms (contexts) offered per trial.
        n_features: Context dimension. The hard-coded ``true_theta`` has 10
            entries, so this presumably has to be 10 for the reward's dot
            product to be well-formed.

    Returns:
        A ``pandas.DataFrame`` with one row per (simulation, trial) and
        columns ``Sim`` (1-based), ``T`` (1-based), ``ChosenArm``, ``Reward``
        and ``CumulativeReward`` (running sum within the simulation).
    """
    # Sampling scale v = R * sqrt(24 / eps * d * ln(1 / delta)). The dimension
    # d is the number of features; the original code referenced an undefined
    # name `d` here.
    v = R * np.sqrt(24 / EPSILON * n_features * np.log(1 / delta))

    total_steps = n_simulation * n_trials

    # Flat, pre-sized logs; the slot for (sim, t) is (sim - 1) * n_trials + t.
    sim_nums = [0.0] * total_steps
    times = [0.0] * total_steps
    chosen_arms = [0.0] * total_steps
    rewards = [0.0] * total_steps
    cumulative_rewards = [0.0] * total_steps

    true_theta = np.array([[ 0.34, -0.12, -0.16,  0.32,  0.03,  0.19, -0.31, -0.70, 0.62,  0.19]])

    for sim in range(1, n_simulation + 1):
        # Fresh uniform contexts, indexed as X[trial, arm].
        X = np.array([[np.random.uniform(size=n_features) for _ in np.arange(n_arms)] for _ in np.arange(n_trials)])
        # Model state: A starts at the identity, b at zero.
        A = np.identity(n_features)
        b = np.zeros(shape=(n_features, 1))

        for t in range(n_trials):
            index = (sim - 1) * n_trials + t
            sim_nums[index] = sim
            times[index] = t + 1

            # Sample a parameter vector around the current estimate A^-1 b.
            inv_A = np.linalg.inv(A)
            mean = np.dot(inv_A, b).flatten()
            cov = v**2 * inv_A
            theta = np.random.multivariate_normal(mean, cov).reshape((n_features,))

            if t % 100 == 99:
                print('simulation number : ', sim)
                print(t,': theta : ',np.round(theta,3))

            # Score every arm's context under the sampled parameters.
            p = [theta.dot(X[t, a]) for a in range(n_arms)]
            chosen_arm = ind_max(p)
            context_t = X[t, chosen_arm]

            # The reward depends only on the chosen context; `arm` is ignored.
            reward = generate_reward(arm=0, x=context_t, true_theta=true_theta)
            rewards[index] = reward

            if t == 0:
                cumulative_rewards[index] = reward
            else:
                cumulative_rewards[index] = cumulative_rewards[index - 1] + reward
            chosen_arms[index] = chosen_arm

            # Update the model with the observed (context, reward) pair.
            context_t = np.reshape(context_t, (-1, 1))
            A += context_t.dot(context_t.T)
            b += reward * context_t

    # Five row-lists transposed into five columns.
    temp_result = [sim_nums, times, chosen_arms, rewards, cumulative_rewards]
    result = pd.DataFrame(temp_result).transpose()
    result.columns = ["Sim", "T", "ChosenArm", "Reward", "CumulativeReward"]
    return result