A) Utility: compute discounted returns (Monte Carlo-style)

In [1]:
import numpy as np

def discounted_return(rewards, gamma=0.99):
    """
    Discounted return G_0 of one trajectory.

    rewards: iterable of rewards in time order (r_0, r_1, ...).
    gamma:   discount factor; the reward at step t is weighted by gamma**t.

    Returns sum_t gamma**t * r_t, which is 0.0 when `rewards` is empty.
    """
    total = 0.0
    weight = 1.0  # current discount gamma**t, built up multiplicatively
    for reward in rewards:
        total += weight * reward
        weight *= gamma
    return total

In [2]:
def discounted_returns_per_timestep(rewards, gamma=0.99):
    """
    Discounted return G_t for every timestep t of a single episode.

    Walks the rewards backwards and applies G_t = r_t + gamma * G_{t+1},
    starting from a return of 0.0 after the final step. Useful for
    Monte Carlo policy evaluation.

    Returns a list [G_0, G_1, ...] with one entry per reward.
    """
    returns = []
    tail = 0.0  # return of everything after the step currently being processed
    for reward in reversed(rewards):
        tail = reward + gamma * tail
        returns.append(tail)
    # Values were collected last-step-first; flip them into time order.
    returns.reverse()
    return returns

# demo: run both helpers on one short reward sequence with gamma = 0.9
if __name__ == "__main__":
    traj_rewards = [0, 0, 1, 0, 2]
    # Return of the whole trajectory, measured from t = 0.
    print("G0:", discounted_return(traj_rewards, 0.9))
    # Return from every timestep; the first entry equals G0 above.
    print("G_t:", discounted_returns_per_timestep(traj_rewards, 0.9))


G0: 2.1222000000000003
G_t: [2.1222000000000003, 2.358, 2.62, 1.8, 2.0]


B) TD(0) policy evaluation (shows γ inside Bellman update)

In [5]:
import numpy as np
from collections import defaultdict

def td0_policy_evaluation(transitions, policy, gamma=0.99, alpha=0.1, episodes=5000,
                          max_steps=100, seed=0):
    """
    Estimate the state-value function of a fixed policy with tabular TD(0).

    transitions[s][a] -> list of (prob, next_s, reward, done)
    policy(s) -> action (deterministic for simplicity)
    gamma:     discount factor applied to the bootstrapped value V(s').
    alpha:     step size of each TD update.
    episodes:  number of rollouts to sample.
    max_steps: cap on the length of each rollout (default 100).
    seed:      seed passed to np.random.default_rng (default 0).

    Each rollout starts from a state drawn uniformly from the keys of
    `transitions`. Any state reached with done=False must itself be a key of
    `transitions`, otherwise the next step raises KeyError.

    Returns state-value function V as a defaultdict(float); states that were
    never updated read as 0.0.
    Raises ValueError if `episodes` > 0 and `transitions` is empty.
    """
    V = defaultdict(float)
    states = list(transitions.keys())
    if episodes > 0 and not states:
        raise ValueError("transitions must contain at least one state")

    rng = np.random.default_rng(seed)
    for _episode in range(episodes):
        # Sample an index instead of calling rng.choice(states): choice() first
        # converts the list to an ndarray, which turns tuple states into
        # unhashable array rows and casts mixed-type states to a common dtype.
        s = states[rng.choice(len(states))]
        # simple rollout until termination or length cap
        for _step in range(max_steps):
            a = policy(s)
            # sample next transition according to dynamics
            probs, next_states, rewards, dones = zip(*transitions[s][a])
            idx = rng.choice(len(probs), p=np.array(probs))
            s_next, r, done = next_states[idx], rewards[idx], dones[idx]

            # TD(0) update: V(s) <- V(s) + α [ r + γ V(s') - V(s) ]
            # A terminal successor contributes no bootstrapped value.
            target = r + (0 if done else gamma * V[s_next])
            V[s] += alpha * (target - V[s])

            if done:
                break
            s = s_next
    return V
