In [2]:
"""
Author:       Erfan Azad (erfan@dartmouth.edu)
Date:         25 January 2017
Description:  Simulation for TD-Lambda algorithm for a
              random walk in n states  
"""

'\nAuthor:       Erfan Azad (erfan@dartmouth.edu)\nDate:         25 January 2017\nDescription:  Simulation for TD-Lambda algorithm for a\n              random walk in n states  \n'

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import time
import math

In [4]:
def buildEpisode(numStates):
    """
    Builds one episode of the random-walk simulation used by the
    TD-Lambda learners, starting from the middle state.

    States are the integers 0 .. numStates-1; states 0 and numStates-1
    are terminal. Each step moves one state left or right with equal
    probability. Every transition yields reward 0, except the transition
    into state 0 (reward -1) and into state numStates-1 (reward +1).

    Args:
        numStates: Number of states (including both terminal states).
                   Must be greater than 2.

    Returns:
        A list [visitedStates, visitedRewards]:
            visitedStates:  the integer states visited, starting with the
                            middle state and ending with a terminal state.
            visitedRewards: the reward received on each transition, so
                            len(visitedRewards) == len(visitedStates) - 1.

    Raises:
        AssertionError: if numStates <= 2.
    """
    assert(numStates > 2)
    # Integer (floor) division: the state is used as an array index by the
    # callers, so it must stay an int. Plain "/" yields a float on Python 3.
    st = numStates // 2     # start from the middle state
    lastState = numStates - 1
    visitedStates = [st]
    visitedRewards = []
    while(st > 0 and st < lastState):
        # Unbiased coin flip: step right or left.
        if(np.random.random() > 0.5):
            st += 1
        else:
            st -= 1
        # Only the transitions into a terminal state carry a reward.
        if(st == 0):
            rew = -1
        elif(st == lastState):
            rew = 1
        else:
            rew = 0
        visitedStates.append(st)
        visitedRewards.append(rew)
    return [visitedStates, visitedRewards]

In [64]:
def TDL_offline_learn_forward(lambda_factor, gamma, alpha, numStates, numEpisodes):
    """
    Off-line TD-Lambda (forward view) for the random walk problem.

    For every time step t of an episode the n-step returns for
    n = 1 .. T-t are computed, blended into the lambda-return, and the
    resulting value increment is accumulated. The accumulated increments
    are applied to V only once the episode is over (off-line update).

    Args:
        lambda_factor: decay factor ("lambda" is a keyword in python!)
        gamma:         discount factor
        alpha:         learning rate
        numStates:     number of states (including the terminal states)
        numEpisodes:   number of episodes to learn from

    Returns:
        The learned values of each state, V(s), as an array of
        shape (1, numStates).
    """
    V = np.zeros((1, numStates))
    for _ in range(numEpisodes):
        states, rewards = buildEpisode(numStates)
        horizon = len(rewards)  # final time index T
        pending = np.zeros((1, numStates))
        for t in range(horizon):
            stepsLeft = horizon - t
            # nStepReturns[0, n-1] holds the n-step return from time t.
            nStepReturns = np.zeros((1, stepsLeft))
            for n in range(1, stepsLeft + 1):
                discounts = np.power(gamma, np.arange(0, n))
                window = np.array(rewards[t:t + n])  # rewards r_{t+1} .. r_{t+n}
                nStepReturns[0, n - 1] = np.dot(discounts, window) + pow(gamma, n) * V[0, states[t + n]]
            exponents = np.arange(0, nStepReturns.shape[1])
            weights = np.power(lambda_factor, exponents)
            # Geometric mix of the truncated returns, with the remaining
            # weight placed on the complete (last) return.
            lambdaReturn = (1 - lambda_factor) * np.dot(weights[0:-1], nStepReturns[0, 0:-1]) + pow(lambda_factor, exponents[-1]) * nStepReturns[0, -1]
            # Off-line: store the increment, apply it after the episode.
            here = states[t]
            pending[0, here] += alpha * (lambdaReturn - V[0, here])
        V = V + pending
    return V

In [66]:
# Time a forward-view run on the 5-state random walk.
t1 = time.time()
stateValues = TDL_offline_learn_forward(
    lambda_factor=0.5,
    gamma=0.9,
    alpha=0.1,
    numStates=5,
    numEpisodes=10000,
)
t2 = time.time()
print(stateValues)
print("Performance Time: {} seconds".format(t2 - t1))

[[ 0.         -0.54813698 -0.06276626  0.37311259  0.        ]]
Performance Time: 3.95213389397 seconds


In [55]:
def TDL_offline_learn_backward(lambda_factor, gamma, alpha, numStates, numEpisodes):
    """
    Runs the TD-Lambda learning algorithm for
    the random walk problem. (Backward View)

    Each step computes the one-step TD error and distributes it over all
    states in proportion to their (accumulating) eligibility traces. The
    increments are accumulated during the episode and applied to V only
    at its end (off-line update).

    Args:
        lambda_factor:      decay factor ("lambda" is a keyword in python!)
        gamma:       discount factor
        alpha:       learning rate
        numStates:   number of states (including the terminal states)
        numEpisodes: number of times/ episodes to repeat the learning

    Returns:
        The learned values of each state: V(s), as an array of
        shape (1, numStates).
    """
    V = np.zeros((1, numStates))
    for i in range(numEpisodes):
        visitedStates, visitedRewards = buildEpisode(numStates)
        # Eligibility traces must start from zero in every episode;
        # otherwise decayed traces left over from the previous episode
        # leak into this one and inflate the updates.
        e = np.zeros((1, numStates))
        deltaV = np.zeros((1, numStates))
        T = len(visitedRewards)
        for t in range(0, T):
            # One-step TD error for the transition s_t -> s_{t+1}.
            err = visitedRewards[t] + gamma*V[0, visitedStates[t+1]] - V[0, visitedStates[t]]
            # Accumulating trace: bump the state that was just visited.
            e[0, visitedStates[t]] += 1

            deltaV = deltaV + alpha*err*e
            # Decay all traces before moving to the next time step.
            e = gamma*lambda_factor*e
        V = V + deltaV
    return V
        
        

In [70]:
# Time a backward-view run on the 5-state random walk.
t1 = time.time()
stateValues = TDL_offline_learn_backward(
    lambda_factor=0.5,
    gamma=0.9,
    alpha=0.1,
    numStates=5,
    numEpisodes=10000,
)
t2 = time.time()
print(stateValues)
print("Performance Time: {} seconds".format(t2 - t1))

[[ 0.         -0.46342452 -0.00082637  0.56701469  0.        ]]
Performance Time: 0.68082690239 seconds
