In [95]:
import numpy as np
import matplotlib.pyplot as plt

# Slicing

In [78]:
x = np.array([[1, 2], [3, 4], [5, 6]])
a = np.array([0, 1, 2]), 
b = np.array([0, 1, 0])
x[a,b]

array([[1, 4, 5]])

In [81]:
q = np.arange(19)
p = np.ones((19,2))
a = np.array([0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1])
l = [0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1]
n = np.arange(19)
na = np.vstack((n,a)).reshape(19,2)
print(q)
print(q[5:])
print(q[::5])
print(q[::-1])
print(p.shape)
print(p[:,0].shape)
print(p[:,1].shape)
print(na.shape)
print(p[n,a].shape)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
[ 5  6  7  8  9 10 11 12 13 14 15 16 17 18]
[ 0  5 10 15]
[18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]
(19, 2)
(19,)
(19,)
(19, 2)
(19,)


# GAE

Generalized Advantage Estimate

In [88]:
rewards = np.ones(10)
n_step = 0.95
GA = np.sum([sum(rewards[:i+1])*((1-n_step)*n_step**i) for i in range(rewards.shape[0])])
print(type(GA))

<class 'numpy.float64'>


# Cumsum

In the event of using future rewards. Sequentially start summing closer and closer to the end

In [4]:
discount = 0.995
m_rewards = [1 for i in range(10)]

# Shaping rewards

1. Isolate the future rewards (created by cumsum)
2. Create a new array so we can sequentially discount the Rewards_future, which will be come 10! ? because at each step, we start again with the discounting. 
3. The future rewards array reflects all the future rewards for the current trajectory.

## For single reward trajectories

In [50]:
# This is what makes sense to me
rewards = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
discounts = discount**np.arange(len(rewards))
print('discounts',discounts)
# m_cum_rewards = np.asarray(m_rewards).cumsum(axis=0)[::-1]
# print('m_rewards',m_rewards)
# print('m_cum_rewards',m_cum_rewards)
# discounted_rewards = np.asarray(m_cum_rewards)*discounts
# print('discounted_rewards',discounted_rewards)

# a_rewards = np.array(rewards)
future_r = [rewards[i:]*discounts[:-i] if i>0 else rewards*discounts for i in range(len(rewards))]
print('future_r',future_r)

rewards_future = [sum(future_r[i]) for i in range(len(future_r))]
print('rewards_future',rewards_future)

mean = np.mean(rewards_future)
std = np.std(rewards_future) + 1.0e-10

rewards_normalized = (rewards_future - mean)/std
print('rewards_normalized',rewards_normalized)

discounts [1.         0.995      0.990025   0.98507488 0.9801495  0.97524875
 0.97037251 0.96552065 0.96069304 0.95588958 0.95111013 0.94635458]
future_r [array([1.        , 0.995     , 0.990025  , 0.98507488, 0.9801495 ,
       0.97524875, 0.97037251, 0.96552065, 0.96069304, 0.95588958,
       0.95111013, 0.94635458]), array([1.        , 0.995     , 0.990025  , 0.98507488, 0.9801495 ,
       0.97524875, 0.97037251, 0.96552065, 0.96069304, 0.95588958,
       0.95111013]), array([1.        , 0.995     , 0.990025  , 0.98507488, 0.9801495 ,
       0.97524875, 0.97037251, 0.96552065, 0.96069304, 0.95588958]), array([1.        , 0.995     , 0.990025  , 0.98507488, 0.9801495 ,
       0.97524875, 0.97037251, 0.96552065, 0.96069304]), array([1.        , 0.995     , 0.990025  , 0.98507488, 0.9801495 ,
       0.97524875, 0.97037251, 0.96552065]), array([1.        , 0.995     , 0.990025  , 0.98507488, 0.9801495 ,
       0.97524875, 0.97037251]), array([1.        , 0.995     , 0.990025  , 0.985074

## For multiple trajectories with N agents

This seems slightly off, as for individual agents the final reward should be 1 not .95

In [40]:
rewards = np.full((10,10),1)
print(rewards)
discounted = discount**np.arange(rewards.shape[0])
print(discounted)
discounted_rewards = np.asarray(rewards)*discounted[:,np.newaxis]
print('discounted_rewards',discounted_rewards)
# convert rewards to future rewards
future_rewards = discounted_rewards[::-1].cumsum(axis=0)[::-1]

mean = np.mean(future_rewards, axis=1)
std = np.std(future_rewards, axis=1) + 1.0e-10

rewards_normalized = (future_rewards - mean[:,np.newaxis])/std[:,np.newaxis]
print('future_rewards',future_rewards)
print('rewards_normalized',rewards_normalized)

[[1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1]]
[1.         0.995      0.990025   0.98507488 0.9801495  0.97524875
 0.97037251 0.96552065 0.96069304 0.95588958]
discounted_rewards [[1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [0.995      0.995      0.995      0.995      0.995      0.995
  0.995      0.995      0.995      0.995     ]
 [0.990025   0.990025   0.990025   0.990025   0.990025   0.990025
  0.990025   0.990025   0.990025   0.990025  ]
 [0.98507488 0.98507488 0.98507488 0.98507488 0.98507488 0.98507488
  0.98507488 0.98507488 0.98507488 0.98507488]
 [0.9801495  0.9801495  0.9801495  0.9801495  0.9801495  0.9801495
  0.9801495  0.9801495  0.9801495  0.9801495 ]
 [0.97524875 0.97524875 0.97524875 0.97524875 0.97524875 0.97524875
  0.975248

In [37]:
a = np.arange(10)
b = a[::-1].cumsum(axis=0)[::-1]
print(a)
print(b)

[0 1 2 3 4 5 6 7 8 9]
[45 45 44 42 39 35 30 24 17  9]


In [18]:

# rewards = [1 for i in range(10)]
rewards = list(range(10))

discounted = discount**np.arange(len(rewards))
discounted_rewards = np.asarray(rewards)*discounted[:,np.newaxis]
print('discounted',discounted)
print('np.asarray(rewards)',np.asarray(rewards))
print('discounted[:,np.newaxis]',discounted[:,np.newaxis])
print('discounted_rewards.shape',discounted_rewards.shape)
print('discounted_rewards',discounted_rewards)

discounted [ 1.          0.995       0.990025    0.98507488  0.9801495   0.97524875
  0.97037251  0.96552065  0.96069304  0.95588958]
np.asarray(rewards) [0 1 2 3 4 5 6 7 8 9]
discounted[:,np.newaxis] [[ 1.        ]
 [ 0.995     ]
 [ 0.990025  ]
 [ 0.98507488]
 [ 0.9801495 ]
 [ 0.97524875]
 [ 0.97037251]
 [ 0.96552065]
 [ 0.96069304]
 [ 0.95588958]]
discounted_rewards.shape (10, 10)
discounted_rewards [[ 0.          1.          2.          3.          4.          5.          6.
   7.          8.          9.        ]
 [ 0.          0.995       1.99        2.985       3.98        4.975       5.97
   6.965       7.96        8.955     ]
 [ 0.          0.990025    1.98005     2.970075    3.9601      4.950125
   5.94015     6.930175    7.9202      8.910225  ]
 [ 0.          0.98507488  1.97014975  2.95522463  3.9402995   4.92537438
   5.91044925  6.89552412  7.880599    8.86567388]
 [ 0.          0.9801495   1.960299    2.9404485   3.920598    4.9007475
   5.880897    6.8610465   7.841196   

In [10]:
# convert rewards to future rewards
future_rewards = discounted_rewards[::-1].cumsum(axis=0)[::-1]

mean = np.mean(future_rewards, axis=1)
std = np.std(future_rewards, axis=1) + 1.0e-10

rewards_normalized = (future_rewards - mean[:,np.newaxis])/std[:,np.newaxis]

In [11]:
print(future_rewards)
print(rewards_normalized)

[[  0.           9.77797391  19.55594781  29.33392172  39.11189563
   48.88986953  58.66784344  68.44581735  78.22379125  88.00176516]
 [  0.           8.77797391  17.55594781  26.33392172  35.11189563
   43.88986953  52.66784344  61.44581735  70.22379125  79.00176516]
 [  0.           7.78297391  15.56594781  23.34892172  31.13189563
   38.91486953  46.69784344  54.48081735  62.26379125  70.04676516]
 [  0.           6.79294891  13.58589781  20.37884672  27.17179563
   33.96474453  40.75769344  47.55064235  54.34359125  61.13654016]
 [  0.           5.80787403  11.61574806  17.4236221   23.23149613
   29.03937016  34.84724419  40.65511822  46.46299225  52.27086629]
 [  0.           4.82772453   9.65544906  14.48317359  19.31089812
   24.13862266  28.96634719  33.79407172  38.62179625  43.44952078]
 [  0.           3.85247578   7.70495156  11.55742733  15.40990311
   19.26237889  23.11485467  26.96733045  30.81980622  34.672282  ]
 [  0.           2.88210327   5.76420654   8.64630981  

In [27]:
mean = np.mean(future_rewards,axis=1)
std = np.std(future_rewards,axis=1) + 1e-10
rewards_normalized = (future_rewards - mean[:,np.newaxis])/std[:,np.newaxis]

In [32]:
print(mean.shape)
print(std.shape)
print(rewards_normalized.shape)
print(rewards_normalized)

(10,)
(10,)
(10, 10)
[[-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.21854359  1.5666989 ]
 [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.21854359  1.5666989 ]
 [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.21854359  1.5666989 ]
 [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.21854359  1.5666989 ]
 [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.21854359  1.5666989 ]
 [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.21854359  1.5666989 ]
 [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.21854359  1.5666989 ]
 [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
   0.52223297  0.87038828  1.2

# Newaxis

Creates a new axis to fill out.
*For future rewards*

In [15]:
rewards = list(range(10))
discount = np.full(10,0.995)
print(discount)
print(rewards)
discounted_rewards = np.asarray(rewards)*discount[:,np.newaxis]
print(discounted_rewards)

[0.995 0.995 0.995 0.995 0.995 0.995 0.995 0.995 0.995 0.995]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[[0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]
 [0.    0.995 1.99  2.985 3.98  4.975 5.97  6.965 7.96  8.955]]
