In [0]:
"""
N-step targets or N-step Q-values estimation.

The following two functions compute truncated N-step Q-value estimates:

A) n_step_targets_missing

    treats missing terms as 0.

B) n_step_targets_max

    uses as many terms as are available, then bootstraps from the next-state value.
"""

import numpy as np
 
# If fewer than N steps remain, the missing terms are treated as 0.
def n_step_targets_missing(epr, baselines, gamma, N, *, verbose=True):
  """Compute truncated N-step targets, treating unavailable terms as 0.

  For every index t the target accumulates gamma**n * epr[t+n] for
  n = 0 .. N-1 and then a final term gamma**N * baselines[t+N]. If t+n
  reaches epr.size before that final term, accumulation stops and nothing
  further is added for that t.

  Args:
    epr: NumPy array indexed by step along its first axis (presumably the
      per-step episode rewards -- confirm against the caller). epr.size is
      used as the number of steps.
    baselines: values indexed like epr, used only for the final term.
    gamma: factor raised to the power n for the n-th term.
    N: number of epr terms accumulated before the baselines term.
    verbose: when True (the default, matching the original behaviour) a
      trace of the loop indices and of the branch taken is printed.

  Returns:
    Array with the shape of epr. Its dtype is the NumPy promotion of epr,
    baselines and gamma, so a fractional gamma is no longer truncated (or
    rejected) when epr holds integers; all-integer inputs still give
    integers.
  """
  # NOTE(review): when N + 1 exceeds epr.size the horizon is clamped, so for
  # t = 0 the last available index is taken from baselines instead of epr.
  # Kept exactly as in the original for compatibility -- confirm intent.
  horizon = min(N + 1, epr.size)

  # Accumulate in a dtype wide enough for every operand; zeros_like(epr)
  # would inherit an integer dtype from integer rewards.
  out_dtype = np.result_type(epr, np.asarray(baselines), gamma)
  targets = np.zeros(epr.shape, dtype=out_dtype)

  for t in range(epr.size):
    if verbose:
      print("t=", t)
    for n in range(horizon):
      if verbose:
        print("n=", n)
      if t + n == epr.size:
        # Ran off the end of the data: remaining terms count as 0.
        if verbose:
          print('missing terms treated as 0, break')
        break
      is_last = n == horizon - 1
      source = baselines if is_last else epr
      targets[t] += (gamma ** n) * source[t + n]
      if is_last and verbose:
        print('last term for those with sufficient steps, end inner n loop')
  return targets
  
# N-step return.
# If fewer than N steps remain, use as many steps as are available.
def n_step_targets_max(epr, baselines, v_s_, gamma, N, *, verbose=True):
  """Compute truncated N-step targets, bootstrapping from v_s_ at the end.

  For every index t the target accumulates gamma**n * epr[t+n] for
  n = 0 .. N-1 and then a final term gamma**N * baselines[t+N]. If t+n
  reaches epr.size before that final term, gamma**n * v_s_ is added
  instead and accumulation stops for that t.

  Args:
    epr: NumPy array indexed by step along its first axis (presumably the
      per-step episode rewards -- confirm against the caller). epr.size is
      used as the number of steps.
    baselines: values indexed like epr, used only for the final term when
      enough steps are available.
    v_s_: value used as the final term when the data ends early.
    gamma: factor raised to the power n for the n-th term.
    N: number of epr terms accumulated before the baselines term.
    verbose: when True (the default, matching the original behaviour) a
      trace of the loop indices and of the branch taken is printed.

  Returns:
    Array with the shape of epr. Its dtype is the NumPy promotion of epr,
    baselines, gamma and v_s_, so fractional gamma or v_s_ values are no
    longer truncated (or rejected) when epr holds integers; all-integer
    inputs still give integers.
  """
  # NOTE(review): when N + 1 exceeds epr.size the horizon is clamped, so for
  # t = 0 the last available index is taken from baselines and v_s_ is not
  # used. Kept exactly as in the original for compatibility -- confirm intent.
  horizon = min(N + 1, epr.size)

  # Accumulate in a dtype wide enough for every operand; zeros_like(epr)
  # would inherit an integer dtype from integer rewards.
  out_dtype = np.result_type(epr, np.asarray(baselines), gamma, v_s_)
  targets = np.zeros(epr.shape, dtype=out_dtype)

  for t in range(epr.size):
    if verbose:
      print("t=", t)
    for n in range(horizon):
      if verbose:
        print("n=", n)
      if t + n == epr.size:
        # Ran off the end of the data: close the sum with the next-state value.
        targets[t] += (gamma ** n) * v_s_
        if verbose:
          print('last term for those with INSUFFICIENT steps, break')
        break
      is_last = n == horizon - 1
      source = baselines if is_last else epr
      targets[t] += (gamma ** n) * source[t + n]
      if is_last and verbose:
        print('last term for those with sufficient steps, end inner n loop')
  return targets
  
# Demo: run both estimators on a tiny 5-step episode and print the results.
# Integer inputs (including gamma) keep every printed target an integer.
N = 2        # N steps
gamma = 2
t = 5
v_s_ = 10    # value of next state

# Column vector [[0], [1], ..., [t-1]]; baselines holds the same values.
epr = np.arange(t).reshape(-1, 1)
print("epr=", epr)
baselines = epr.copy()
print("baselines=", baselines)

print('n_step_targets_missing:')
T = n_step_targets_missing(epr, baselines, gamma, N)
print(T)

print('n_step_targets_max:')
T = n_step_targets_max(epr, baselines, v_s_, gamma, N)
print(T)

epr= [[0]
 [1]
 [2]
 [3]
 [4]]
baselines= [[0]
 [1]
 [2]
 [3]
 [4]]
n_step_targets_missing:
t= 0
n= 0
n= 1
n= 2
last term for those with sufficient steps, end inner n loop
t= 1
n= 0
n= 1
n= 2
last term for those with sufficient steps, end inner n loop
t= 2
n= 0
n= 1
n= 2
last term for those with sufficient steps, end inner n loop
t= 3
n= 0
n= 1
n= 2
missing terms treated as 0, break
t= 4
n= 0
n= 1
missing terms treated as 0, break
[[10]
 [17]
 [24]
 [11]
 [ 4]]
n_step_targets_max:
t= 0
n= 0
n= 1
n= 2
last term for those with sufficient steps, end inner n loop
t= 1
n= 0
n= 1
n= 2
last term for those with sufficient steps, end inner n loop
t= 2
n= 0
n= 1
n= 2
last term for those with sufficient steps, end inner n loop
t= 3
n= 0
n= 1
n= 2
last term for those with INSUFFICIENT steps, break
t= 4
n= 0
n= 1
last term for those with INSUFFICIENT steps, break
[[10]
 [17]
 [24]
 [51]
 [24]]
