# 1) Import packages

In [2]:
import random as rd
import numpy as np
import os 
import json

# 2) Load auxiliary functions

In [19]:
# Wrapper for drawing a single Bernoulli sample: 1 with probability p, otherwise 0
def random_bool( p = 0.1 ):
    """Draw one Bernoulli sample.

    p -- probability of returning 1 (default 0.1).

    Returns the plain Python integer 1 with probability p, otherwise 0.
    """
    # A binomial with a single trial is a Bernoulli draw; size=1 yields a
    # one-element array that is unpacked into a native int.
    outcome, = np.random.binomial(n=1, p=p, size=1).tolist()
    return outcome

# Wrapper for drawing N normal random numbers with mean mu and standard deviation sigma
def rnorm( mu, sigma, N ):
    """Draw N samples from a normal distribution and return them as a list.

    mu    -- mean of the distribution.
    sigma -- spread of the distribution; it is passed to np.random.normal
             as `scale`, i.e. the standard deviation.
    N     -- number of samples to draw.

    Returns a Python list of N floats.
    """
    return np.random.normal(loc=mu, scale=sigma, size=N).tolist()

# 3) Initialize class agent

In [3]:
class agent:
    """Epsilon-greedy value learner for a set of Gaussian bandits.

    The agent keeps, for every bandit, the full history of its value
    estimate in ``value_function`` (a dict mapping the bandit index to a
    list of estimates, one entry per time step plus the initial value).
    """

    # Set initial condition and default values
    def __init__( self, alpha = 0.5, epsilon = 0.01, tau = 0.1 , mu = (0, 0) , sigma = (1, 1),
                  decision_function = "softmax", init = None , time = 10, save = False ):
        """Store the learning parameters and build the initial value function.

        alpha             -- learning rate used in the value update.
        epsilon           -- probability of picking a random bandit in a step.
        tau               -- temperature parameter (stored only).
        mu, sigma         -- per-bandit mean and spread handed to rnorm; both
                             must have the same length.
        decision_function -- name of the decision rule (stored only).
        init              -- optional initial value per bandit; zeros if None.
        time              -- default number of steps used by learn().
        save              -- flag stored on the instance.
        """
        self.alpha = alpha
        self.epsilon = epsilon
        # Copy so that instances never share (or mutate) the caller's sequence.
        self.mu = list(mu)
        self.tau = tau
        self.n_bandits = len(self.mu)
        self.sigma = list(sigma)
        self.decision_function = decision_function
        # Keep the requested initial values; the original read self.init
        # without ever assigning it, which raised AttributeError.
        self.init = init
        self.value_function = self.init_dict(range(self.n_bandits), init)
        self.choices = []
        self.rewards = []
        self.time = time
        self.save = save

    # Init an empty dictionary with 0 or specified initial values
    def init_dict(self, names , init_val = None ):
        """Return ``{name: [initial value]}`` for every entry of names.

        names    -- indexable sequence of keys.
        init_val -- sequence of start values, one per name; all zeros if None.

        Raises IndexError if init_val is shorter than names.
        """
        # `is None` (not `== None`) so array-like init values are accepted.
        if init_val is None:
            init_val = [0] * len(names)
        return {name: [init_val[i]] for i, name in enumerate(names)}

    # Create N random rewards per bandit according to a Normal distribution
    def create_bandits(self, mu ,sigma, N ):
        """Sample N rewards for every bandit.

        Returns a list with one list of N samples per bandit, drawn with
        rnorm(mu[i], sigma[i], N).  If mu and sigma differ in length a
        message is printed and None is returned.
        """
        if len(mu) != len(sigma):
            print("Unsufficient parameter input")
            return None
        return [rnorm(m, s, N) for m, s in zip(mu, sigma)]

    # Finds the current maximum value in a dictionary
    def decide(self,value_dict ):
        """Return the index of the bandit with the highest latest estimate.

        value_dict must be keyed by 0 .. len(value_dict) - 1.  Ties are
        resolved in favour of the lowest index.
        """
        latest = [value_dict[key][-1] for key in range(len(value_dict))]
        return latest.index(max(latest))

    # Updates the dictionary
    def update_dict(self, dic, next_choice, value, alpha ):
        """Append one new estimate to every history in dic (in place).

        The chosen bandit moves towards the observed reward:
        new = old + alpha * (value - old).  All other bandits repeat their
        last estimate so that every history keeps the same length.
        """
        for key in range(len(dic)):
            history = dic[key]
            last = history[-1]
            if key == next_choice:
                history.append(last + alpha * (value - last))
            else:
                history.append(last)

    # Start learning procedure
    def learn(self, run_time = None ):
        """Run the epsilon-greedy learning loop for run_time steps.

        run_time -- number of steps; defaults to self.time.

        Raises ValueError if self.mu and self.sigma differ in length.
        """
        if run_time is None:
            run_time = self.time

        bandits = self.create_bandits( mu = self.mu, sigma = self.sigma, N = run_time )
        if bandits is None:
            # Fail here with a clear message instead of a TypeError on the
            # first reward lookup below.
            raise ValueError("mu and sigma must have the same length")
        choice_count = len( self.mu )

        for step in range( run_time ):
            # Explore with probability epsilon, otherwise exploit the
            # bandit that currently looks best.
            if random_bool( p = self.epsilon ) == 1:
                choice = np.random.choice(choice_count)
            else:
                choice = self.decide( self.value_function )
            reward = bandits[choice][step]
            self.update_dict( self.value_function, choice, reward, self.alpha )

    # Save current state of the value function
    def save_history( self, value_dict = None , name = None , path = None ):
        """Write a value function to disk as JSON.

        value_dict -- dictionary to store; defaults to self.value_function.
        name       -- file name; defaults to
                      "_alpha_<alpha>_epsilon_<epsilon>.txt".
        path       -- target directory; defaults to the current directory.
        """
        if value_dict is None:
            value_dict = self.value_function
        if name is None:
            # Prepare File name - Parameters
            name = "_alpha_" + str(self.alpha) + "_epsilon_" + str(self.epsilon) + ".txt"
        if path is None:
            path = os.getcwd()
        # Join with a separator: plain concatenation glued the directory
        # name onto the file name and wrote into the parent directory.
        value_file = os.path.join(path, name)
        # The context manager closes the handle even if json.dump raises.
        with open(value_file, 'w') as handle:
            json.dump(value_dict, handle)

    # Clear value function of the agent
    def re_init( self, init = None ):
        """Reset the value function to zeros, or to init if it is given."""
        self.value_function = self.init_dict(range(self.n_bandits), init)
     

# 4) DEMO

In [4]:
# Initialize an agent facing two bandits with mean rewards 2 and 10
g = agent(mu=[2, 10])

# Let the agent learn for 20 time steps
g.learn(run_time=20)

# Show the development of the value function for each bandit
g.value_function

{0: [0,
  0.20529088533038675,
  1.0897459783812962,
  1.5633481341889626,
  1.2375730852676958,
  1.2783302559596021,
  2.595261904135506,
  2.518821032720984,
  1.8356021073765385,
  2.317913822017384,
  2.511202381936765,
  3.0981224424722544,
  2.209067481191311,
  2.0855810243815585,
  2.3722756543582184,
  2.8293922762612835,
  2.576157006509429,
  2.0089682292613076,
  2.0089682292613076,
  2.0089682292613076,
  2.0089682292613076],
 1: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5.490751577067243,
  7.036034023359788,
  8.436989095067638]}

In [None]:
# Save current history
g.save_history()

In [None]:
# Re-initialize the value function
g.re_init()
g.value_function

In [2]:
import math
math.exp(1)

2.718281828459045

In [4]:
np.exp(1)

2.7182818284590451

In [8]:
vec=rnorm(3,4,10)

In [4]:
# Element-wise division of an integer vector by a float
a = np.array([1, 2, 3]) / 3.0

In [13]:
a

array([ 0.33333333,  0.66666667,  1.        ])

In [12]:
def softmax( value_array, tau):
    """Return the index with the highest Boltzmann (softmax) probability.

    value_array -- one-dimensional sequence of value estimates.
    tau         -- temperature; each value is divided by it before
                   exponentiation, i.e. p_i is proportional to
                   exp(value_i / tau).  A negative tau therefore favours
                   the smallest value.

    Returns the index of the most probable entry, or None (after printing
    a message) when tau is zero.
    """
    if tau == 0:
        print("Parameter can't be zero")
        return None
    # The temperature belongs inside the exponent; dividing exp(value) by
    # tau cancels out again when the weights are normalised.
    scaled = np.asarray( value_array, dtype = float ) / float( tau )
    # Shifting by the maximum leaves the distribution unchanged but keeps
    # exp() from overflowing to inf (which turned every entry into nan).
    weights = np.exp( scaled - scaled.max() )
    boltzman_distribution = weights / weights.sum()
    return boltzman_distribution.argmax()

In [28]:
def epsilon_greedy( value_array, probability ):
    """Pick an index with the epsilon-greedy rule.

    value_array -- sequence (list or numpy array) of value estimates.
    probability -- chance of exploring, i.e. of returning a uniformly
                   random index instead of the best one.

    Returns a random index in range(len(value_array)) when exploring,
    otherwise the index of the largest value.
    """
    choice_count = len( value_array )
    if random_bool( p = probability ) == 1:
        return np.random.choice(choice_count)
    # np.asarray lets plain lists through; calling .argmax() directly on
    # the argument only worked for numpy arrays.
    return np.asarray( value_array ).argmax()

1

In [None]:
# Appears to be a draft of an extended agent.learn that also accepts a
# precomputed reward set and a decision rule.
# NOTE(review): the draft is unfinished -- the loop at the bottom has no body,
# and `decision_rule` and `choice_count` are not used yet.
def learn(self, run_time = None, reward_set = None, decision_rule = None ):
        
        # Fall back to the instance's default number of steps
        if run_time == None:
            run_time = self.time
            
        # Sample rewards from the bandits unless the caller supplied them
        if reward_set == None:
            bandits = self.create_bandits( mu = self.mu, sigma = self.sigma, N = run_time )
        else:
            bandits = reward_set
        
        
        choice_count = len( self.mu )
        
        
        for step in range( run_time ):
            

In [30]:
lol = [[1,23],[5,5],[23,5]]

In [31]:
lol

[[1, 23], [5, 5], [23, 5]]

TypeError: list indices must be integers, not tuple