In [2]:
import numpy as np
import matplotlib as plt

In [3]:
class MABEnvironment:
    
    # Initializes the reward distribution, number of arms and list of parameters
    def __init__(self, reward_dist:str ="Bernoulli", n_arms:int =5,
                 param_list:np.ndarray[int]=np.array([0.1, 0.2, 0.3, 0.4, 0.5])):
        self.reward_dist = reward_dist
        self.n_arms = 5
        self.param_list = param_list
    
    # Takes one step by pulling the specified arm, returns the stochastic reward
    def step(self, arm):
        assert(arm in list(range(self.n_arms)), "Invalid arm {}, only {} arms".format(arm, self.n_arms))
        
        # Bernoulli rewards implemented
        if(self.reward_dist == "Bernoulli"):
            reward = np.random.binomial(1, self.param_list[arm])
            
        return reward
    
    # Returns an array of regret values 
    def run_alg_regret(alg, steps = 30):
        
        if(reward_dist == 'Bernoulli'):
            best_arm = np.argmax(self.param_list)
            best_mean_reward = self.param_list[best_arm]
            
        reward_array = np.array([])
        arm = np.random.choice(np.arange(0,self.n_arms))
        arm_array = np.array([arm])
            
        for i in range(steps):
            reward = self.step(arm)
            reward_array = np.append(reward_array, reward)
            next_arm = alg(reward_array, arm_array, steps, n_arms)
            arm_array = np.append(arm_array, next_arm)
        
        regret_array = np.array([(steps*best_mean_reward - np.sum(reward_array[:i])) for i in range(1,steps+1)])
        
        return regret_array

  assert(arm in list(range(self.n_arms)), "Invalid arm {}, only {} arms".format(arm, self.n_arms))


In [8]:
class OnlineMABAlgorithm:
    
    def __init__(steps:int, n_arms:int):
        self.steps = steps
        self.n_arms = n_arms
        
    def choose_arm(step_num):
        raise NotImplementedError

    def update(arm, reward, step_num):
        raise NotImplementedError

In [None]:
class ETC(OnlineMABAlgorithm):
    
    def __init__(steps:int, n_arms:int, explore_ct:int):
        super().__init__(steps,n_arms)
        self.explore_ct = explore_ct
        self.etc_reward_list = np.zeros(n_arms)
        self.arm_count = np.zeros(n_arms)
        
    def choose_arm(step_num):
        if(step_num <= self.n_arms*self.explore_ct):
            arm = step_num%self.n_arms
        else:
            arm = np.argmax(self.etc_reward_list)
        return arm
        
    def update(arm, reward, step_num):
        
            

In [None]:
class UCB(OnlineMABAlgorithm):
    
    def __init__(self, steps:int, n_arms:int):
        super().__init__(steps, n_arms)
        self.ucb_reward_list = np.ones(n_arms)*np.inf
        self.arm_count = np.zeros(n_arms)
    
    def update(arm, reward, step_num):
        self.arm_count[arm] +=1
        if(self.ucb_reward_list[arm] == np.inf):
            
        

In [5]:
a = np.inf

In [7]:
np.inf +1

inf