# 1) Import packages

In [1]:
import random as rd
import numpy as np
import os 
import json

# 2) Initialize class agent

In [2]:
class agent:
    """Value-learning agent for an N-armed bandit task.

    The agent keeps one value-function history (a list) per bandit, picks a
    bandit at every step with either a softmax or an epsilon-greedy rule and
    moves the value of the chosen bandit towards the observed reward with
    learning rate ``alpha``.

    Attributes set by ``__init__``:
        alpha, epsilon, tau  -- learning rate, exploration probability,
                                softmax temperature
        decision_function    -- "softmax" or "epsgreedy"
        reward_set           -- user supplied rewards (one sequence per
                                bandit) or None
        mu, sigma            -- per-bandit mean / spread used to draw normal
                                rewards when no reward_set is given
        n_bandits            -- number of bandits
        init                 -- initial values handed to the constructor
        value_function       -- list of per-bandit value histories
        choices, rewards     -- per-step history of decisions and rewards
        time                 -- default number of steps used by learn()
        save                 -- stored flag; not read anywhere in this class
    """

    # ------------------------------------------------------------------
    # SET INITIAL CONDITIONS
    # ------------------------------------------------------------------

    def __init__( self, alpha = 0.5, epsilon = 0.01, tau = 0.1 , decision_function = "softmax", \
                  reward_input = None, mu = [0,0] , sigma = [1,1], init = None, time = 10, save = False ):
        """Store the parameters and create empty histories.

        reward_input -- optional list with one reward sequence per bandit;
                        when given, mu and sigma are ignored.
        init         -- optional list with one starting value per bandit;
                        defaults to 0 for every bandit.
        """
        # PARAMETERS
        self.alpha = alpha
        self.epsilon = epsilon
        self.tau = tau

        # DECISION FUNCTION
        self.decision_function = decision_function

        # BANDITS
        if reward_input is None:
            # Copy so the shared default lists (and caller lists) are never aliased
            self.mu = list( mu )
            self.sigma = list( sigma )
            self.reward_set = None
            self.n_bandits = len( self.mu )
        else:
            self.mu = None
            self.sigma = None
            self.reward_set = reward_input
            self.n_bandits = len( self.reward_set )

        # VALUE FUNCTION, REWARDS AND CHOICE LISTS
        self.init = init
        self.value_function = self.init_value_container( self.n_bandits, init )
        self.choices = []
        self.rewards = []

        # OTHER
        self.time = time
        self.save = save

    # ------------------------------------------------------------------
    # DECISION FUNCTIONS
    # ------------------------------------------------------------------

    def _boltzmann_probabilities(self, value_array, tau):
        """Return the probabilities exp(v / tau) / sum(exp(v / tau))."""
        scaled = np.array( value_array, dtype = float ) / float( tau )
        # Shifting by the maximum leaves the ratios unchanged but keeps
        # exp() from overflowing for large values or small temperatures.
        weights = np.exp( scaled - scaled.max() )
        return weights / weights.sum()

    # Softmax-Decision - Returns the next choice based on boltzman distribution
    def softmax(self, value_array, tau):
        """Sample a bandit index from the Boltzmann distribution of the values.

        Prints a message and returns None when tau is zero.
        """
        # Check for correct input specification
        if tau == 0:
            print("Parameter can't be zero")
            return None
        bandits = list( range( self.n_bandits ) )
        boltzman_distribution = self._boltzmann_probabilities( value_array, tau )
        return self.weighted_sample( bandits, probability = boltzman_distribution )

    # Epsilon-Greedy-Decision - Returns greedy vs. random choice
    def epsilon_greedy(self, value_array, probability ):
        """Return a uniformly random index with the given probability,
        otherwise the index of the largest value."""
        choice_count = len( value_array )
        if self.random_bool( p = probability ) == 1:
            choice = np.random.choice( choice_count )
        else:
            choice = np.argmax( value_array )
        # Plain int so the choice history stays JSON serialisable
        return int( choice )

    # ------------------------------------------------------------------
    # LEARNING FUNCTION
    # ------------------------------------------------------------------

    def learn(self, run_time = None ):
        """Run the agent for run_time steps (default: self.time).

        Every step looks up the current values, picks a bandit with the
        configured decision function, records choice and reward and updates
        the value functions. Nothing happens for an unknown decision function.

        Raises ValueError when the softmax rule is used with tau == 0 or when
        mu and sigma differ in length.
        """
        if run_time is None:
            run_time = self.time

        use_softmax = self.decision_function == "softmax"
        if not use_softmax and self.decision_function != "epsgreedy":
            return
        if use_softmax and self.tau == 0:
            raise ValueError( "tau can't be zero for the softmax decision function" )

        if self.reward_set is None:
            bandits = self.create_bandits( mu = self.mu, sigma = self.sigma, N = run_time )
            if bandits is None:
                raise ValueError( "mu and sigma must have the same length" )
        else:
            bandits = self.reward_set

        for step in range( run_time ):
            states = self.current_values_lookup( self.value_function )
            if use_softmax:
                current_decision = self.softmax( value_array = states, tau = self.tau )
            else:
                current_decision = self.epsilon_greedy( value_array = states, probability = self.epsilon )
            current_reward = bandits[current_decision][step]
            self.choices.append( current_decision )
            self.rewards.append( current_reward )

            self.update_value_functions( current_decision, current_reward, self.alpha )

    # ------------------------------------------------------------------
    # AUXILIARY FUNCTIONS
    # ------------------------------------------------------------------

    # Wrapper function to compute randomly TRUE or FALSE with prob. p
    def random_bool(self, p = 0.1 ):
        """Return 1 with probability p and 0 otherwise."""
        return np.random.binomial( 1, p, 1 ).tolist()[0]

    # Wrapper for N normal random numbers
    def rnorm(self, mu, sigma, N ):
        """Return a list of N draws from np.random.normal(mu, sigma)."""
        return np.random.normal( mu, sigma, N ).tolist()

    # Sample a random item with specified distribution
    def weighted_sample(self, items, probability ):
        """Return one element of items, drawn with the given probabilities."""
        return np.random.choice( items, 1, p = probability ).tolist()[0]

    # Init the value-function container with 0 or specified initial values
    def init_value_container(self, vfcount , init_val = None ):
        """Return vfcount one-element lists holding the initial values."""
        if init_val is None:
            init_val = [0] * vfcount
        return [ [ init_val[i] ] for i in range( vfcount ) ]

    # Creates normally distributed N-arm bandits
    def create_bandits(self, mu ,sigma, N ):
        """Return one list of N normal rewards per (mu, sigma) pair.

        Prints a message and returns None when mu and sigma differ in length.
        """
        # Check input parameters
        if len( mu ) != len( sigma ):
            print("Unsufficient parameter input")
            return None
        return [ self.rnorm( m, s, N ) for m, s in zip( mu, sigma ) ]

    # Looks up the last value in each value_function
    def current_values_lookup(self, listoflist ):
        """Return the most recent entry of every value-function history."""
        return [ row[-1] for row in listoflist ]

    # Update the value functions according to choices rewards and alpha
    def update_value_functions(self, choice, reward, alpha ):
        """Append the next value to every history.

        The chosen bandit moves towards the reward by the fraction alpha;
        all other bandits repeat their previous value.
        """
        for i in range( self.n_bandits ):
            history = self.value_function[i]
            old_value = history[-1]
            if i == choice:
                new_value = old_value + alpha * ( reward - old_value )
            else:
                new_value = old_value
            history.append( new_value )

    # ------------------------------------------------------------------
    # BACK UP AND RESET
    # ------------------------------------------------------------------

    # Save current state of the value function
    def save_history( self , path = None ):
        """Dump value functions, choices and rewards as JSON text files.

        path -- target directory (default: current working directory).
        The file names encode alpha and the exploration parameter of the
        active decision function. Raises ValueError for an unknown
        decision function.
        """
        if self.decision_function == "softmax":
            str_param = "_tau_" + str( self.tau )
        elif self.decision_function == "epsgreedy":
            str_param = "_epsilon_" + str( self.epsilon )
        else:
            raise ValueError( "Unknown decision function: " + str( self.decision_function ) )

        sufix = "alpha_" + str( self.alpha ) + str_param + ".txt"
        if path is None:
            path = os.getcwd()

        outputs = [ ( "valuefunction_" + sufix, self.value_function ),
                    ( "choices" + sufix, self.choices ),
                    ( "reward" + sufix, self.rewards ) ]
        for file_name, content in outputs:
            # Context manager guarantees the file is flushed and closed
            with open( os.path.join( path, file_name ), 'w' ) as handle:
                json.dump( content, handle )

    # Clear value function of the agent
    def re_init( self, init = None ):
        """Reset value functions (to init, or zeros) and clear the histories."""
        self.value_function = self.init_value_container( self.n_bandits, init )
        self.choices = []
        self.rewards = []

# 4) DEMO

In [3]:
# Initialize a softmax agent facing two bandits with mean rewards 2 and 3
new_guy = agent( mu = [2,3], decision_function = "softmax" )
# Run five learning steps
new_guy.learn(5)
# Value-function history per bandit, followed by the choice and the reward
# of every step. Single-argument print(...) runs on Python 2 and Python 3.
print(new_guy.value_function)
print(new_guy.choices)
print(new_guy.rewards)

# Show the development of the value function for each bandit


[[0, 0, 0, 0, 0, 0], [0, 1.5422577116687188, 2.069483382248861, 2.6905132101623215, 3.5471384246828794, 3.624777787982098]]
[1, 1, 1, 1, 1]
[3.0845154233374377, 2.5967090528290027, 3.3115430380757815, 4.403763639203437, 3.7024171512813173]


In [176]:
# Write the agent's value functions, choices and rewards as JSON text files
# into the current working directory (no path argument is given).
new_guy.save_history()