In [1]:
"""
Author:       Erfan Azad (erfan@dartmouth.edu)
Date:         02 February 2017
Description:  Simulation for SARSA-Lambda algorithm for a
              random walk in n states  
"""

'\nAuthor:       Erfan Azad (erfan@dartmouth.edu)\nDate:         02 February 2017\nDescription:  Simulation for SARSA-Lambda algorithm for a\n              random walk in n states  \n'

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [33]:
def choose_eps_greedy_action(state_actions, currentState, eps):
    '''
    Pick an action for ``currentState`` using an epsilon-greedy rule.

    Args:
        state_actions: 2-D array with one row per state. Column 0 is
                    skipped by this function; the remaining columns are
                    read as the action-values of the state.

        currentState: Row index of the state to act from. It must lie
                    strictly between the first and the last row.

        eps: Exploration threshold. A uniform draw from [0, 1) is
             compared with it: if the draw is greater, the greedy action
             is returned, otherwise action 1 or 2 is drawn uniformly.

    Returns:
        The column index in state_actions of the selected action-value
        (column 0 is never returned).

    Raises:
        AssertionError: if currentState is the first row, the last row,
                    or outside the table.
    '''
    last_row = state_actions.shape[0] - 1
    assert 0 < currentState < last_row

    if np.random.random() > eps:
        # Exploit: argmax runs over the Q columns only, so +1 maps the
        # result back onto a column index of state_actions.
        q_row = state_actions[currentState, 1:]
        return np.argmax(q_row) + 1

    # Explore: uniformly random ACTION 1 or ACTION 2
    return np.random.choice([1, 2])

In [34]:
def calculate_reward(st, numStates):
    '''
    Reward obtained on entering state ``st`` of the 1-D random walk.

    Args:
        st: Index of the state the agent is moving into.

        numStates: Total number of states, terminal states included.

    Returns:
        -1 if st is state 0,
        +1 if st is the right-most state (index numStates - 1),
         0 for every other state.
    '''
    if st == 0:
        return -1
    if st == numStates - 1:
        return 1
    return 0

In [35]:
def takeStep(state_actions, currentState, numStates, epsilon):
    '''
    Perform one step of an episode from ``currentState``.

    Args:
        state_actions: a (numStates x 3) size array representing
                    each state s and its action-values:  Q(s,a)

        currentState: the state the step is taken from.

        numStates: total number of states, used to compute the reward
                    of the state that is reached.

        epsilon: epsilon variable used in the epsilon-greedy action selection.

    Returns:
        A list [a, r, s_next]: the action taken, the observed reward
        and the state that was reached.
    '''
    # Select the action epsilon-greedily in the current state
    chosen = choose_eps_greedy_action(state_actions, currentState, epsilon)
    # Action 2 moves one state to the right, any other action one to the left
    offset = 1 if chosen == 2 else -1
    landed = currentState + offset
    # The reward depends only on the state that was reached
    return [chosen, calculate_reward(landed, numStates), landed]

In [36]:
def Q_Lambda_Watkins(lambda_factor, gamma, alpha, epsilon, numStates, numEpisodes):
    """
    Runs Watkins's Q(lambda) learning algorithm for the 1-D random
    walk problem. This is the online version: the action-values
    Q(s,a) are updated after every step of an episode.

    Every episode starts in the middle state (numStates // 2) and ends
    when state 0 or state numStates - 1 is reached. Action 1 moves one
    state to the left, action 2 one state to the right.

    Args:
        lambda_factor: trace-decay factor ("lambda" is a keyword in python!)
        gamma:       discount factor
        alpha:       learning rate
        epsilon:     exploration rate of the epsilon-greedy policy
        numStates:   number of states (including the terminal states)
        numEpisodes: number of times/ episodes to repeat the learning

    Returns:
        A (numStates x 3) array rounded to 3 decimals. Column 0 holds
        the state index, columns 1 and 2 hold the learned Q(s, action 1)
        and Q(s, action 2).
    """
    # e.g. row0 ==> [S0,Q1,Q2] for numStates rows
    state_actions = np.vstack((np.arange(numStates), np.zeros((2, numStates)))).T
    last_state = numStates - 1
    # Floor division keeps the start state an integer; with "/" it becomes a
    # float under Python 3 and cannot be used as an array index.
    start_state = numStates // 2

    for _ in range(numEpisodes):
        currentState = start_state
        if not 0 < currentState < last_state:
            break  # fewer than 3 states: the start is terminal, nothing to learn

        # Eligibility traces start from zero in every episode
        e_trace = np.zeros((numStates, 2))
        action = choose_eps_greedy_action(state_actions, currentState, epsilon)

        while True:
            # Take the already selected action, observe next state and reward.
            # takeStep() is not used here because it would sample a fresh
            # action instead of executing the one the trace logic was based on.
            nextState = currentState + 1 if action == 2 else currentState - 1
            reward = calculate_reward(nextState, numStates)
            is_terminal = not 0 < nextState < last_state

            if is_terminal:
                # Terminal states have no actions and a value of 0
                Q_next = 0.0
                nextAction = None
                next_is_greedy = False
            else:
                # Choose nextAction from nextState; it will really be executed
                nextAction = choose_eps_greedy_action(state_actions, nextState, epsilon)
                Q_next = state_actions[nextState, 1:].max()
                # A sampled action that ties for the maximum counts as greedy
                next_is_greedy = state_actions[nextState, nextAction] == Q_next

            err = reward + gamma * Q_next - state_actions[currentState, action]
            e_trace[currentState, action - 1] += 1

            # For all s, a update Q(s,a) in proportion to its trace
            state_actions[:, 1:] += alpha * err * e_trace

            if is_terminal:
                break

            if next_is_greedy:
                e_trace *= gamma * lambda_factor
            else:
                # An exploratory action follows: earlier pairs must not get
                # credit for what happens after it, so the traces are cut.
                e_trace[:] = 0

            # Move on, executing exactly the action that was just selected
            currentState, action = nextState, nextAction

    return np.round(state_actions, 3)

In [None]:
# Seed NumPy's global RNG so that the epsilon-greedy exploration, and with it
# the printed Q-table, is the same on every Restart & Run All.
np.random.seed(0)

# Time a single training run and show the learned state/action-value table.
t1 = time.time()
result = Q_Lambda_Watkins(lambda_factor=0.5,gamma=0.9, alpha=0.1, epsilon=0.2, numStates=45, numEpisodes=500)
t2 = time.time()
print(result)
print("Finished in {} seconds.".format(t2-t1))