A) Utility: compute discounted returns (Monte Carlo-style)

In [1]:
import numpy as np

def discounted_return(rewards, gamma=0.99):
    """Return the discounted return G_0 of a whole trajectory.

    Computes ``sum(gamma**k * rewards[k])`` over the trajectory, building the
    discount factor incrementally instead of exponentiating at each step.

    rewards: iterable of per-step rewards r_0, r_1, ... in time order.
    gamma:   discount factor applied once per elapsed step.

    An empty trajectory yields 0.0.
    """
    total = 0.0
    discount = 1.0  # equals gamma**k at step k
    for reward in rewards:
        total += discount * reward
        # Advance the discount for the next step.
        discount *= gamma
    return total

In [2]:
def discounted_returns_per_timestep(rewards, gamma=0.99):
    """Return the discounted return G_t for every timestep of one episode.

    Uses the backward recursion ``G_t = r_t + gamma * G_{t+1}`` (with the
    return after the final step taken as 0), so the whole episode is
    processed in a single pass from the end. Useful for Monte Carlo policy
    evaluation.

    rewards: iterable of per-step rewards r_0, r_1, ... in time order.
             Any iterable is accepted, including one-shot generators.
    gamma:   discount factor applied once per elapsed step.

    Returns a list ``[G_0, G_1, ..., G_{T-1}]`` with one entry per reward;
    an empty episode yields ``[]``.
    """
    # Materialize first: reversed() only works on sequences, so passing a
    # generator/iterator directly would raise TypeError.
    episode = list(rewards)

    running = 0.0
    returns = []
    for reward in reversed(episode):
        running = reward + gamma * running
        returns.append(running)

    # Values were collected last-to-first; flip them back into time order.
    returns.reverse()
    return returns

# Demo: print the trajectory return and the per-timestep returns for a
# small hand-written episode.
if __name__ == "__main__":
    demo_gamma = 0.9
    traj_rewards = [0, 0, 1, 0, 2]

    whole_trajectory_return = discounted_return(traj_rewards, gamma=demo_gamma)
    print("G0:", whole_trajectory_return)

    per_step_returns = discounted_returns_per_timestep(
        traj_rewards, gamma=demo_gamma
    )
    print("G_t:", per_step_returns)


G0: 2.1222000000000003
G_t: [2.1222000000000003, 2.358, 2.62, 1.8, 2.0]
