## Temporal Difference ( TD(0) )
#### Incremental MC
    * Recurrence formula for computing a running average
$$\mu_{k} = \mu_{k-1}+{{1}\over{k}}(x_{k}-\mu_{k-1}) $$
    * MC(stationary)
$$ V(S_{t})\: \leftarrow\:V(S_{t})\:+\:{1\over{N(S_{t})}}\:*\:(G_{t}-V(S_{t}))$$
    * MC(non-stationary)
$$V(S_{t})\: \leftarrow\:V(S_{t})\:+\:\alpha\:*\:(G_{t}-V(S_{t}))$$
#### TD(0)
    * Instead of waiting until the simulation finishes, update the value function step by step

$$V(S_{t})\: \leftarrow\:V(S_{t})\:+\:\alpha\:*\:(R_{t+1}\:+\:\gamma V(S_{t+1})-V(S_{t}))$$
    * The TD target is a biased estimate of the true return $G_{t}$, but it has low variance  

## Source Code 
#### Grid World Environment setting
* states, actions, transition probability

In [2]:
# set state
import numpy as np
# Grid dimensions; one wall cell is excluded from the state space.
nCols, nRows, nWalls = 3, 4, 1

# One integer id per free (non-wall) cell.
states = list(range(nCols * nRows - nWalls))
N_STATES = len(states)
#print(N_STATES)
#print(states)

# set map: -1 marks a blocked cell (outer border or wall), 0 marks a free cell
map = np.full((nCols + 2, nRows + 2), -1.0)
map[1:nCols + 1, 1:nRows + 1] = 0  # open up the interior, leaving a one-cell border
map[2, 2] = -1  # add wall
#print(map)

# set action: integer action ids
actions = list(range(4))
N_ACTIONS = len(actions)

# states -> location: locations[k] is the map index pair of state k.
# Free cells are collected with the first map index as the outer loop,
# so the list order matches the state ids.
locations = [
    (i, j)
    for i in range(1, nCols + 1)
    for j in range(1, nRows + 1)
    if map[i, j] == 0
]
index = len(locations)  # number of free cells found
# action -> move: move[k] is the map-index offset applied by action k
move = [(0, -1), (-1, 0), (0, 1), (1, 0)]
#print(move)

# set transition probability
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # P[S,A,S']

# (offset added to the chosen action index, probability of that outcome):
# heading correctly, slipping to the previous action ("left error"),
# slipping to the next action ("right error").
outcomes = ((0, 0.8), (-1, 0.1), (1, 0.1))

for s in range(N_STATES):
    current_location = locations[s]
    for a in range(N_ACTIONS):
        for offset, prob in outcomes:
            # Wrap around the move table so the neighbours of the first and
            # last actions are each other.
            step = move[(a + offset) % len(move)]
            next_location = (current_location[0] + step[0],
                             current_location[1] + step[1])
            if map[next_location[0], next_location[1]] == -1:  # there is barrier or wall
                next_location = current_location  # blocked: stay in place
            next_s = states[locations.index(next_location)]
            # Accumulate: several outcomes can land in the same state.
            P[s, a, next_s] += prob
        
# rewards s,a ---  R(s,a)  ---> s'
# Every (state, action) pair gets a step reward of -0.02 (the notebook also
# kept -0.5 as a disabled alternative); the rows of states 3 and 6 are then
# overwritten with +1 and -1.
R = np.full((N_STATES, N_ACTIONS), -0.02)
R[3, :] = 1
R[6, :] = -1
# discount factor
gamma = 0.99

# policy : given state which action would u choose
# assume that we know the policy
# Each policy is an (N_STATES, N_ACTIONS) table of action probabilities.

# bad_policy is deterministic: entry k is the action taken in state k.
bad_actions = [2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1]
bad_policy = np.zeros((N_STATES, N_ACTIONS))
bad_policy[np.arange(len(bad_actions)), bad_actions] = 1

# random_policy picks each of the four actions with equal probability.
random_policy = np.full((N_STATES, N_ACTIONS), 0.25)

# Deterministic policy: entry k is the action taken in state k.
optimal_actions = [2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0]
optimal_policy = np.zeros((N_STATES, N_ACTIONS))
optimal_policy[np.arange(len(optimal_actions)), optimal_actions] = 1
#print(optimal_policy)

# Stochastic policy: start from a one-hot table (entry k of the list is the
# preferred action in state k), add ep/4 to every entry, then rescale each
# row so it sums to 1.
ep = 0.1
preferred_actions = [2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0]
optimalWithNoise_policy = np.zeros((N_STATES, N_ACTIONS))
optimalWithNoise_policy[np.arange(len(preferred_actions)), preferred_actions] = 1
optimalWithNoise_policy += ep / 4
optimalWithNoise_policy /= optimalWithNoise_policy.sum(axis=1, keepdims=True)

In [3]:
## TD(0) for V

## set Hyper parameters
epoch = 10000
alpha = 0.01

## set boundary condition
# The values of states 3 and 6 are fixed here; the loop below only ever
# updates the state it is leaving, which is never one of these two.
V = np.zeros(N_STATES)
V[3] = 1.0  # goal
V[6] = -1.0  # fail
## states
terminal_states = [3, 6]
start_states = [x for x in states if x not in terminal_states]
## set policy
policy = optimalWithNoise_policy

for _ in range(epoch):
    s = np.random.choice(start_states)  # random initial state
    while True:
        # s,a,r,s'
        a = np.random.choice(actions, p=policy[s, :])
        reward = R[s, a]
        s1 = np.random.choice(states, p=P[s, a, :])
        # Move V[s] a fraction alpha toward the one-step bootstrapped target.
        V[s] += alpha * (reward + gamma * V[s1] - V[s])
        if s1 in terminal_states:
            break  # episode ends once a terminal state is reached
        s = s1


print(V)

[ 0.82866665  0.87919271  0.92157334  1.          0.79087506  0.57785063
 -1.          0.75080509  0.71718853  0.66761782  0.39857905]


In [4]:
## TD(0) for Q
## set Hyper parameters
epoch = 10000
alpha = 0.01

## set boundary condition
# The rows of states 3 and 6 are fixed here; the loop below only ever updates
# the (state, action) pair it is leaving, which never belongs to these rows.
Q = np.zeros((N_STATES, N_ACTIONS))
Q[3, :] = 1.0  # goal
Q[6, :] = -1.0  # fail
## states
terminal_states = [3, 6]
start_states = [x for x in states if x not in terminal_states]
## set policy
policy = optimalWithNoise_policy

for _ in range(epoch):
    s = np.random.choice(start_states)  # random initial state
    a = np.random.choice(actions, p=policy[s, :])  # random initial action
    while True:
        # s,a,r,s',a'
        reward = R[s, a]
        s1 = np.random.choice(states, p=P[s, a, :])
        a1 = np.random.choice(actions, p=policy[s1, :])
        # Bootstrap from the action actually sampled for the next state.
        Q[s, a] += alpha * (reward + gamma * Q[s1, a1] - Q[s, a])
        if s1 in terminal_states:
            break  # episode ends once a terminal state is reached
        s, a = s1, a1


print(Q)

[[ 0.60470404  0.65359558  0.81632922  0.58195965]
 [ 0.668624    0.71808405  0.87274739  0.73524075]
 [ 0.74604539  0.82272036  0.93766611  0.6181517 ]
 [ 1.          1.          1.          1.        ]
 [ 0.54817606  0.75965256  0.57324682  0.50477549]
 [ 0.30544591  0.69986116 -0.36032707  0.19435391]
 [-1.         -1.         -1.         -1.        ]
 [ 0.42755831  0.69230416  0.39395507  0.35777611]
 [ 0.62701724  0.30697542  0.29923145  0.33514213]
 [ 0.56696604  0.25695275  0.11085974  0.13837313]
 [ 0.36797657 -0.25446626  0.03098838  0.05335194]]


## TD($\lambda$)
#### Forward View(offline)
    * Geometrically weighted sum of the n-step returns (expected cumulative reward)
$$G_{t}^{\lambda}    = (1-\lambda)
    \sum_{n=1}^{\infty} \lambda^{n-1}G_{t}^{(n)}$$
$$V(S_{t})\: \leftarrow\:V(S_{t})\:+\:\alpha\:*\:(G_{t}^{\lambda}-V(S_{t}))$$
    * This form is hard to implement directly (high computational complexity and memory use)
#### Backward View(online)
    * Eligibility traces
$$ E_{t}(s) = \gamma \lambda E_{t-1}(s)+\mathbf{1}(S_{t}=s)$$
    
    * TD lambda
$$V(S) \leftarrow V(S)+ \alpha \delta_{t} E_{t}(S) $$  
<center>  </center>
<center>where $\delta_{t} = R_{t+1}\:+\:\gamma V(S_{t+1})-V(S_{t})$</center>

In [5]:
## TD(lambda) for V

## set Hyper parameters
epoch = 1000
alpha = 0.01
lam = 0.5

## set boundary condition
V = np.zeros(N_STATES)
V[3] = 1.0  # goal
V[6] = -1.0  # fail

## states
terminal_states = [3, 6]
start_states = [x for x in states if x not in terminal_states]
## set policy
policy = optimalWithNoise_policy

# E_history[t] holds the traces (after decay) at step t of the most recent
# episode.  Row 0 is never written and stays zero.  The buffer has a fixed
# number of rows; steps beyond it are simply not recorded.
E_history = np.zeros((100, N_STATES))
for _ in range(epoch):
    done = False

    # set Eligibility traces
    E = np.zeros(N_STATES)
    # Clear the log so rows written by a longer earlier episode do not leak
    # into this episode's history.
    E_history[:] = 0

    s = np.random.choice(start_states)  # random initial state
    t = 1
    while not done:
        # s,a,r,s'
        a = np.random.choice(actions, p=policy[s, :])
        reward = R[s, a]
        s1 = np.random.choice(states, p=P[s, a, :])

        TD_target = reward + gamma * V[s1]
        TD_error = TD_target - V[s]
        E[s] += 1  # bump the trace of the state being left

        # Credit the TD error to every non-terminal state in proportion to
        # its trace, then decay that trace by gamma * lam.
        for state in start_states:
            V[state] += alpha * TD_error * E[state]
            E[state] = gamma * lam * E[state]

        # Guard the write: an episode with as many steps as the buffer has
        # rows would otherwise index past the end and raise IndexError.
        if t < E_history.shape[0]:
            E_history[t, :] = E
        t += 1

        if s1 in terminal_states:
            done = True
        else:
            s = s1


print(V)

[ 0.83401321  0.89166407  0.92472041  1.          0.78134167  0.57062885
 -1.          0.67997237  0.56763827  0.3895605   0.03745909]


In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20, 20))

# One panel per state on a 4x3 grid; subplot positions are 1-based.
for i in states:
    ax = plt.subplot(4, 3, i + 1)
    ax.set_title("Eligibility Traces for state {}".format(i))
    ax.plot(E_history[:, i])
    ax.legend(["state {}".format(i)])

plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [7]:
## TD(lambda) for Q

## set Hyper parameters
epoch = 5000
alpha = 0.01
lam = 0.5

## set boundary condition
Q = np.zeros((N_STATES, N_ACTIONS))
Q[3, :] = 1.0  # goal
Q[6, :] = -1.0  # fail

## states
terminal_states = [3, 6]
start_states = [x for x in states if x not in terminal_states]
## set policy
policy = optimalWithNoise_policy

for _ in range(epoch):
    # set Eligibility traces (one per state-action pair, reset every episode)
    E = np.zeros((N_STATES, N_ACTIONS))

    s = np.random.choice(start_states)  # random initial state
    a = np.random.choice(actions, p=policy[s, :])  # random initial action
    while True:
        # s,a,r,s',a'
        reward = R[s, a]
        s1 = np.random.choice(states, p=P[s, a, :])
        a1 = np.random.choice(actions, p=policy[s1, :])

        TD_error = reward + gamma * Q[s1, a1] - Q[s, a]
        E[s, a] += 1  # bump the trace of the pair being left

        # Rows of the non-terminal states only: apply the trace-weighted
        # update to every action at once, then decay those traces.
        Q[start_states] += alpha * TD_error * E[start_states]
        E[start_states] *= gamma * lam

        if s1 in terminal_states:
            break  # episode ends once a terminal state is reached
        s, a = s1, a1


print(Q)

[[ 0.42257604  0.50048387  0.80205974  0.41270311]
 [ 0.52474496  0.57978962  0.87128712  0.54896926]
 [ 0.60896887  0.68688252  0.90578188  0.46352214]
 [ 1.          1.          1.          1.        ]
 [ 0.37803751  0.75240565  0.42872495  0.38886904]
 [ 0.15089564  0.63712353 -0.16239544  0.02602443]
 [-1.         -1.         -1.         -1.        ]
 [ 0.28029129  0.69267805  0.21952781  0.28367253]
 [ 0.63503676  0.2378022   0.21004956  0.2195161 ]
 [ 0.56939818  0.17418369  0.07737392  0.11467944]
 [ 0.3869438  -0.10811413 -0.00533715  0.04231542]]
