# 1) Import packages

In [11]:
import random as rd
import numpy as np
import os 
import json

# 2) Class Bandit

In [12]:
class bandit:
    """Pre-sampled reward sequences for a set of Gaussian bandit arms.

    One list of ``N`` normally distributed rewards is drawn per arm and
    stored in ``self.bandits`` (a list of lists, one inner list per arm).

    Parameters
    ----------
    mu : sequence of float
        Mean reward of each arm; its length determines the number of arms.
    sigma : sequence of float
        Standard deviation of each arm, aligned with ``mu``.
    N : int
        Number of rewards to draw per arm.
    seed : int or None
        When not None, the random generators are seeded before sampling so
        that the drawn rewards are reproducible.
    """

    def __init__( self, mu = [0,0] , sigma = [1,1], N = 10 , seed = None ):
        self.mu = mu
        self.sigma = sigma
        self.N = N
        self.seed = seed
        if self.seed is not None:
            # The rewards are drawn with numpy, so numpy's generator is the
            # one that must be seeded; seeding only `random` had no effect
            # on the samples. `random` is still seeded as before.
            rd.seed( self.seed )
            np.random.seed( self.seed )
        self.bandits = self.create_bandits(self.mu, self.sigma, self.N)

    # Function to create bandits
    def create_bandits(self, mu ,sigma, N ):
        """Return one list of ``N`` sampled rewards for every entry of ``mu``."""
        return [ self.rnorm( mu[i] , sigma[i] , N ) for i in range( len( mu ) ) ]

    # Auxiliary functions
    def rnorm(self, mu, sigma, N ):
        """Draw ``N`` samples from Normal(mu, sigma) and return them as a list."""
        return np.random.normal(mu, sigma, N).tolist()

# 3) Class Agent

In [14]:
class agent:
    """Learning agent for a multi-armed bandit task.

    The agent keeps one value estimate per arm, picks an arm on every step
    with either a softmax or an epsilon-greedy rule, and updates the chosen
    arm's estimate with the delta rule ``v <- v + alpha * (reward - v)``.

    Parameters
    ----------
    alpha : float
        Learning rate of the value update.
    epsilon : float
        Exploration probability used by the "epsgreedy" decision function.
    tau : float
        Temperature used by the "softmax" decision function (must not be 0).
    decision_function : str
        Either "softmax" or "epsgreedy".
    reward_input : sequence of sequences or None
        Pre-defined rewards, one sequence per arm. When None, rewards are
        sampled through the ``bandit`` class from ``mu`` / ``sigma``.
    mu, sigma : sequence of float
        Mean and standard deviation per arm, used when ``reward_input`` is None.
    seed : int or None
        Seed forwarded to ``bandit`` when rewards are sampled.
    init : sequence or None
        Initial value per arm; None starts every arm at 0.
    time : int
        Default number of learning steps for ``learn``.
    save : bool
        Stored on the instance as ``self.save``; not read by any method of
        this class.

    Attributes
    ----------
    value_function : list of list
        Per arm, the history of value estimates (first entry = initial value).
    choices : list
        Arm index chosen on every step so far.
    rewards : list
        Reward received on every step so far.
    """

    def __init__( self, alpha = 0.5, epsilon = 0.01, tau = 0.1 , decision_function = "softmax", \
                  reward_input = None, mu = [0,0] , sigma = [1,1], seed = None, init = None, time = 10, save = False ):

        # PARAMETERS
        self.alpha = alpha
        self.epsilon = epsilon
        self.tau = tau

        # DECISION FUNCTION
        self.decision_function = decision_function

        # BANDITS
        # Copies are stored so the shared default lists in the signature are
        # never aliased by (and mutated through) an instance.
        self.mu = list( mu )
        self.sigma = list( sigma )
        self.seed = seed
        # `is None` instead of `== None`: the latter is evaluated elementwise
        # when the rewards are passed as a numpy array.
        if reward_input is None:
            self.reward_set = None
            self.n_bandits = len( self.mu )
        else:
            self.reward_set = reward_input
            self.n_bandits = len( self.reward_set )

        # VALUE FUNCTION, REWARDS AND CHOICE LISTS
        self.init = init
        self.value_function = self.init_value_container( self.n_bandits, init )
        self.choices = []
        self.rewards = []

        # OTHER
        self.time = time
        self.save = save

    # ------------------------------------------------------------------
    # DECISION FUNCTIONS
    # ------------------------------------------------------------------

    def softmax(self, value_array, tau):
        """Sample an arm index from the Boltzmann distribution over values.

        The probability of arm ``i`` is ``exp(v_i / tau) / sum_j exp(v_j / tau)``.
        Prints an error and returns None when ``tau`` is 0.
        """
        if tau == 0:
            print("ERROR: Parameter tau can't be zero!")
            return None
        # The temperature has to scale the values *inside* the exponential;
        # dividing exp(v) by tau would cancel out during normalisation.
        scaled = np.asarray( value_array, dtype = float ) / float( tau )
        # Shifting by the maximum leaves the distribution unchanged and
        # keeps exp() from overflowing for large values or small tau.
        scaled = scaled - scaled.max()
        weights = np.exp( scaled )
        boltzman_distribution = weights / weights.sum()
        arms = list( range( len( boltzman_distribution ) ) )
        return self.weighted_sample( arms, probability = boltzman_distribution )

    def epsilon_greedy(self, value_array, probability ):
        """Return a uniformly random arm with the given probability, else the greedy arm.

        Ties in the greedy case resolve to the lowest arm index.
        """
        choice_count = len( value_array )
        if self.random_bool( p = probability ) == 1:
            # int() turns the numpy integer into a plain, JSON-serialisable int
            return int( np.random.choice( choice_count ) )
        return int( np.argmax( value_array ) )

    # ------------------------------------------------------------------
    # LEARNING FUNCTION
    # ------------------------------------------------------------------

    def learn(self, run_time = None ):
        """Run the learning procedure and record choices, rewards and values.

        Parameters
        ----------
        run_time : int or None
            Number of steps; None falls back to ``self.time``. When the agent
            was built with ``reward_input`` the run time is always set to the
            length of the first reward sequence.

        Raises
        ------
        ValueError
            If ``decision_function`` is unknown, or if it is "softmax" and
            ``tau`` is 0.
        """
        # Resolve the decision rule first so invalid settings fail early
        if self.decision_function == "softmax":
            if self.tau == 0:
                raise ValueError( "Parameter tau can't be zero!" )
            def choose( values ):
                return self.softmax( value_array = values, tau = self.tau )
        elif self.decision_function == "epsgreedy":
            def choose( values ):
                return self.epsilon_greedy( value_array = values, probability = self.epsilon )
        else:
            raise ValueError( "Unknown decision_function: %r" % ( self.decision_function, ) )

        # Control run time and initialize bandits
        if run_time is None:
            run_time = self.time
        if self.reward_set is None:
            raw = bandit( mu = self.mu, sigma = self.sigma, N = run_time, seed = self.seed )
            bandits = raw.bandits
        else:
            bandits = self.reward_set
            run_time = len( self.reward_set[0] )
            print("WARNING: Note that run time was fixed to length of bandits")

        for step in range( run_time ):

            # Choose next action and reward
            states = self.current_values_lookup( self.value_function )
            current_decision = choose( states )
            current_reward = bandits[current_decision][step]

            # Update value function and store decision and rewards
            self.choices.append( current_decision )
            self.rewards.append( current_reward )
            self.update_value_functions( current_decision, current_reward, self.alpha )

    # ------------------------------------------------------------------
    # AUXILIARY FUNCTIONS
    # ------------------------------------------------------------------

    def random_bool(self, p = 0.1 ):
        """Return 1 with probability ``p`` and 0 otherwise."""
        return np.random.binomial( 1, p, 1 ).tolist()[0]

    def weighted_sample(self, items, probability ):
        """Draw one element of ``items`` according to ``probability``."""
        return np.random.choice( items, 1, p = probability ).tolist()[0]

    def init_value_container(self, vfcount , init_val = None ):
        """Return ``vfcount`` single-element lists holding the initial values.

        ``init_val`` supplies one starting value per arm; None starts every
        arm at 0.
        """
        if init_val is None:
            init_val = [0] * vfcount
        return [ [ init_val[i] ] for i in range( vfcount ) ]

    def current_values_lookup(self, listoflist ):
        """Return the most recent (last) entry of every inner list."""
        return [ row[-1] for row in listoflist ]

    def update_value_functions(self, choice, reward, alpha ):
        """Append the next value to every arm's history.

        The chosen arm moves towards ``reward`` by a fraction ``alpha``; all
        other arms repeat their previous value so the histories stay aligned.
        """
        for i in range( self.n_bandits ):
            history = self.value_function[i]
            old_value = history[-1]
            if i == choice:
                history.append( old_value + alpha * ( reward - old_value ) )
            else:
                history.append( old_value )

    # ------------------------------------------------------------------
    # BACK UP AND RESET
    # ------------------------------------------------------------------

    def save_history( self , path = None ):
        """Write value function, choices and rewards as JSON text files.

        File names encode alpha and the parameter of the active decision
        function (tau or epsilon). ``path`` is prepended to the file names
        as-is, so it should end with a path separator; None uses the current
        working directory.

        Raises
        ------
        ValueError
            If ``decision_function`` is neither "softmax" nor "epsgreedy".
        """
        if self.decision_function == "softmax":
            str_param = "_tau_" + str( self.tau )
        elif self.decision_function == "epsgreedy":
            str_param = "_epsilon_" + str( self.epsilon )
        else:
            raise ValueError( "Unknown decision_function: %r" % ( self.decision_function, ) )
        sufix = "alpha_" + str( self.alpha ) + str_param + ".txt"

        # Combine file names and path
        if path is None:
            path = os.getcwd() + "/"
        targets = [
            ( path + "valuefunction_" + sufix, self.value_function ),
            ( path + "choices" + sufix, self.choices ),
            ( path + "reward" + sufix, self.rewards ),
        ]

        # `with` guarantees every file is flushed and closed
        for file_name, payload in targets:
            with open( file_name, 'w' ) as handle:
                json.dump( payload, handle )

    def re_init( self, init = None ):
        """Reset value function, choices and rewards.

        ``init`` optionally supplies one starting value per arm; None resets
        every arm to 0. The value is also stored in ``self.init``.
        """
        self.init = init
        self.value_function = self.init_value_container( self.n_bandits, init )
        self.choices = []
        self.rewards = []

# 4) DEMO

In [15]:
# Initialize a softmax agent and let it learn for 100 steps
new_guy = agent(  decision_function = "softmax" )
new_guy.learn(100)
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(new_guy.value_function)
print(new_guy.choices)
print(new_guy.rewards)

# Show the development of the value function for each bandit


[[0, 0.2725659852109284, 0.2725659852109284, 0.28511803014825665, 0.28511803014825665, 0.28511803014825665, 0.28511803014825665, 0.28511803014825665, 0.5138335114506427, -0.2352500977245917, -0.12899260995130937, 0.7137962911531474, 0.8602297329900761, 0.8602297329900761, 0.6164659811320252, 1.1000530023903852, 0.7065962699071017, 0.7065962699071017, -0.11022342984748901, -0.6454599038651646, -0.6454599038651646, -0.0996430892908945, -0.0996430892908945, -0.15660115668436042, 0.5289751347684943, -0.062684240834189, -0.08071177071732905, -0.08071177071732905, -0.08071177071732905, -0.08071177071732905, -0.08071177071732905, -0.08071177071732905, -0.11554055090322486, 0.06344601177447638, -0.7181860312616086, -0.7181860312616086, 0.013400220682818986, 0.14242605730291744, -0.7759899033231988, -0.7759899033231988, -0.9803408255336372, -0.9803408255336372, -0.9803408255336372, -0.7778941118977442, -0.7778941118977442, -0.7778941118977442, -0.7778941118977442, -1.0148801613196172, -1.014880