In [2]:
import random
from matplotlib import pyplot as plt
import numpy as np
from copy import deepcopy
from math import sqrt

In [3]:
# Number of positions on the walk; the first and last index are terminal.
STATES = 7
# Episodes (walks) performed in each learning run.
EPISODES = 100
# Independent runs over which the RMS error curves are averaged.
RUNS = 100
# Discount factor applied to the next-state value / return.
GAMMA = 1
# Values plotted as "True Value" for the five nonterminal states: 1/6 .. 5/6.
VPI = tuple(k / 6 for k in range(1, 6))


In [4]:
def walk(start):
    """Simulate one random walk and return its transitions.

    Starting from ``start``, the walker moves one position left or right
    (chosen with ``random.randint``) until it lands on index 0 or on
    index ``STATES - 1``. At least one move is always made.

    Returns:
        A list of ``[state, action, reward, nextState]`` lists, one per move.
        ``action`` is +1 or -1; ``reward`` is 1 only for the move that
        reaches index ``STATES - 1`` and 0 otherwise.
    """
    rightTerminal = STATES - 1
    transitions = []
    position = start
    while True:
        # map the coin flip {0, 1} onto a step of {-1, +1}
        move = random.randint(0, 1) * 2 - 1
        nextPosition = position + move
        reward = int(nextPosition == rightTerminal)
        transitions.append([position, move, reward, nextPosition])
        position = nextPosition
        if position in (0, rightTerminal):
            return transitions

In [5]:
def estimatedValue(V, alpha, recordedEpisodes):
    """Learn state values over ``EPISODES`` walks and snapshot them on request.

    Each episode is a walk from state 3; every transition applies the update
    ``V[s] = V[s] + alpha * (r + GAMMA * V[s'] - V[s])``. ``V`` is modified
    in place.

    Args:
        V: value table indexed by state, including both terminal entries.
        alpha: step size of the update.
        recordedEpisodes: collection of 1-based episode counts after which a
            snapshot is taken.

    Returns:
        A list of copies of ``V`` without its first and last entry: the
        initial table first, then one snapshot per recorded episode.
    """
    snapshots = [deepcopy(V[1:-1])]
    for episodeNumber in range(1, EPISODES + 1):
        for s, _action, r, sPrime in walk(3):
            tdError = r + (GAMMA * V[sPrime]) - V[s]
            V[s] = V[s] + alpha * tdError
        if episodeNumber in recordedEpisodes:
            snapshots.append(deepcopy(V[1:-1]))
    return snapshots

In [6]:
def figure1():
    """Plot the value estimates recorded by ``estimatedValue`` and save the figure.

    One curve is drawn per recorded snapshot, labelled in order with
    0, 1, 10 and 100, together with the ``VPI`` curve labelled "True Value".
    The figure is written to
    ``../figures/example6-2/RandomWalkEstimatedValue.png`` and then closed.
    """
    stateNames = ['A','B','C','D','E']
    checkpoints = [0,1,10,100]
    initialValues = [0,0.5,0.5,0.5,0.5,0.5,0]

    plt.figure(figsize=(10,10))
    snapshots = estimatedValue(initialValues, .1, checkpoints)
    for episodeCount, snapshot in zip(checkpoints, snapshots):
        plt.plot(snapshot, label=episodeCount, marker='o')
    plt.plot(VPI, label="True Value", marker='o')

    plt.xlabel('Nonterminal State')
    plt.ylabel('Estimated Value')
    # label the five x positions with the state letters
    plt.xticks(np.arange(len(stateNames)), stateNames)

    plt.legend()
    plt.savefig('../figures/example6-2/RandomWalkEstimatedValue.png')
    plt.close()

In [7]:
def rms(numbers,expecteds):
    """Return the root-mean-square difference between two sequences.

    Values are paired with ``zip``, so pairing stops at the shorter input,
    while the mean always divides by ``len(numbers)``.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    squaredErrorSum = 0
    for pair in zip(numbers, expecteds):
        difference = pair[0] - pair[1]
        squaredErrorSum += difference ** 2
    meanWeight = 1 / len(numbers)
    return sqrt(meanWeight * squaredErrorSum)

In [8]:
def td(episodes,alpha,V):
    """Run one-step TD value prediction on the random walk and track its error.

    For each episode a walk from state 3 is generated, then every transition
    applies ``V[s] += alpha * (r + GAMMA * V[s'] - V[s])``. ``V`` is updated
    in place.

    Args:
        episodes: number of walks to perform.
        alpha: step size of the update.
        V: value table indexed by state, including both terminal entries.

    Returns:
        A list with one entry per episode: the RMS error between ``V``
        (without its first and last entry) and ``VPI`` after that episode.
    """
    Vts = []
    for _ in range(episodes):
        for s, _action, r, sPrime in walk(3):
            V[s] = V[s] + alpha * (r + (GAMMA*V[sPrime]) - V[s])
        # compare against the config-cell constant so this function does not
        # depend on a global that is only defined in a later cell
        Vts.append(rms(V[1:len(V)-1],VPI))
    return Vts


In [9]:
# Alias of VPI (same values), kept so that code referring to the true values
# under this spelling keeps working without a second copy of the literal.
VPi = VPI
def mcEvery(episodes, alpha, V):
    """Run constant-alpha Monte Carlo prediction on the random walk and track its error.

    For each episode a walk from state 3 is generated and replayed backwards;
    at every step the return ``G`` is updated and ``V[s]`` is moved toward it
    by ``alpha`` (every visit to a state triggers an update). ``V`` is
    updated in place.

    Args:
        episodes: number of walks to perform.
        alpha: step size of the update.
        V: value table indexed by state, including both terminal entries.

    Returns:
        A list with one entry per episode: the RMS error between ``V``
        (without its first and last entry) and ``VPI`` after that episode.
    """
    Vts = []
    for _ in range(episodes):
        trajectory = walk(3)
        G = 0
        # iterate from the last transition so G is the return following each step
        for s, _action, r, _sPrime in reversed(trajectory):
            G = G * GAMMA + r
            V[s] = V[s] + alpha*(G - V[s])
        Vts.append(rms((V[1:len(V)-1]),VPI))
    return Vts



In [10]:
def figure2():
    """Plot RMS error per episode, averaged over ``RUNS`` runs, for TD and MC.

    For every (alpha, method) pair, ``RUNS`` independent learning runs of
    ``EPISODES`` episodes are performed, each from a fresh value table with
    0.5 in the nonterminal entries. The per-episode RMS errors are averaged
    across runs and drawn as one curve. The figure is written to
    ``../figures/example6-2/RandomWalkRMSE.png`` and then closed.
    """
    plt.figure(figsize=(10, 10))
    alphas = [.15,.1,.05, .01,.02,.03,.04]
    evaluations = ['TD','TD','TD','MC','MC','MC','MC']
    evaluators = {'TD': td, 'MC': mcEvery}

    lines = []
    for alpha,evaluation in zip(alphas,evaluations):
        evaluate = evaluators[evaluation]
        # The evaluators update the table in place, so each run gets its own.
        runs = [evaluate(EPISODES, alpha, [0,0.5,0.5,0.5,0.5,0.5,0])
                for _ in range(RUNS)]
        # Average across runs (axis 0), giving one mean error per episode.
        # The step size of a running mean must depend on the run count, not
        # on the episode index, so the mean is taken directly here.
        lines.append(np.mean(runs, axis=0))

    for line,alpha,evaluation in zip(lines,alphas,evaluations):
        plt.plot(line,label=evaluation + ": " + str(alpha))
    plt.xlabel("Walks / Episodes")
    plt.ylabel("RMS Error Averaged Over {} Runs".format(RUNS))
    plt.legend()
    plt.savefig('../figures/example6-2/RandomWalkRMSE.png')
    plt.close()



    

In [11]:
# Generate and save the value-estimate figure and the RMS-error figure.
figure1()
figure2()

In [33]:
#Below is for exercise 6-5
def figure3():
    """Plot averaged RMS error curves for several initial value estimates.

    Draws a 4x2 grid with one panel per initial value. In each panel, every
    (alpha, method) pair is run ``RUNS`` times for ``EPISODES`` episodes from
    a fresh value table whose nonterminal entries equal that initial value;
    the per-episode RMS errors are averaged across runs and drawn as one
    curve. The figure is written to
    ``../figures/exercise6-5/RandomWalkRMSEInitials.png`` and then closed.
    """
    fig,axes = plt.subplots(4,2)
    fig.set_figheight(40)
    fig.set_figwidth(20)
    initialValues = [.25,.4,.45,.5,.55,.6,.75,.9]
    alphas = [.15,.1,.05, .01,.02,.03,.04]
    evaluations = ['TD','TD','TD','MC','MC','MC','MC']
    evaluators = {'TD': td, 'MC': mcEvery}

    for iVindex,iV in enumerate(initialValues):
        ax = axes[iVindex//2, iVindex%2]
        for alpha,evaluation in zip(alphas,evaluations):
            evaluate = evaluators[evaluation]
            # The evaluators update the table in place, so each run gets its own.
            runs = [evaluate(EPISODES, alpha, [0,iV,iV,iV,iV,iV,0])
                    for _ in range(RUNS)]
            # Average across runs (axis 0), giving one mean error per episode.
            # The step size of a running mean must depend on the run count,
            # not on the episode index, so the mean is taken directly here.
            averageRun = np.mean(runs, axis=0)
            ax.plot(averageRun,label=evaluation + ": " + str(alpha))
        # Panel decorations only need to be set once, after all curves exist.
        ax.legend()
        ax.set_title("Random Walk RMSE Inflection Initial " + str(iV))
        ax.set_xlabel("Walks / Episodes")
        ax.set_ylabel("RMSE Averaged Over {} Runs".format(RUNS))
    fig.savefig('../figures/exercise6-5/RandomWalkRMSEInitials.png')
    plt.close(fig)

In [34]:
# Generate and save the grid of RMS-error plots, one panel per initial value.
figure3()