A) Utility: compute discounted returns (Monte Carlo-style)

In [1]:
import numpy as np

def discounted_return(rewards, gamma=0.99):
    """
    Discounted return of a whole trajectory, measured from its first step.

    rewards: iterable of rewards r_0, r_1, ... in time order.
    gamma:   discount factor applied once per elapsed step.
    Returns G_0 = r_0 + gamma * r_1 + gamma^2 * r_2 + ...
    (0.0 for an empty trajectory).
    """
    total = 0.0
    weight = 1.0  # gamma**t, built up by repeated multiplication
    for reward in rewards:
        total = total + weight * reward
        weight = weight * gamma
    return total

In [2]:
def discounted_returns_per_timestep(rewards, gamma=0.99):
    """
    Return-to-go G_t for every step t of one episode.

    Walks the rewards backwards with the recursion
    G_t = r_t + gamma * G_{t+1}, starting from 0 after the last step, and
    returns the values as a list in forward time order (index t holds G_t).
    Useful for Monte Carlo policy evaluation.
    """
    tail = 0.0       # return-to-go of the step after the current one
    backward = []    # G_t values collected from last step to first
    for reward in reversed(rewards):
        tail = reward + gamma * tail
        backward.append(tail)
    backward.reverse()
    return backward

# demo: whole-trajectory return next to the per-timestep returns of one short episode
if __name__ == "__main__":
    traj_rewards = [0, 0, 1, 0, 2]
    whole_episode = discounted_return(traj_rewards, gamma=0.9)
    print("G0:", whole_episode)
    per_step = discounted_returns_per_timestep(traj_rewards, gamma=0.9)
    print("G_t:", per_step)


G0: 2.1222000000000003
G_t: [2.1222000000000003, 2.358, 2.62, 1.8, 2.0]


B) TD(0) policy evaluation (shows γ inside Bellman update)

In [5]:
import numpy as np
from collections import defaultdict

def td0_policy_evaluation(transitions, policy, gamma=0.99, alpha=0.1, episodes=5000,
                          max_steps=100, seed=0):
    """
    Estimate the state-value function of a fixed policy with tabular TD(0).

    transitions[s][a] -> list of (prob, next_s, reward, done)
    policy(s) -> action (deterministic for simplicity)
    gamma:     discount factor used in the bootstrapped target.
    alpha:     step size of each TD update.
    episodes:  number of sampled rollouts.
    max_steps: length cap of a single rollout.
    seed:      seed for the NumPy random generator driving all sampling.

    Returns a defaultdict(float) mapping each visited state to its value
    estimate V(s); states never updated read as 0.0.

    Raises ValueError if no state in `transitions` has any action.
    """
    V = defaultdict(float)
    # Rollouts may only start where at least one action is defined; a state
    # with an empty action table cannot be stepped from.
    start_states = [s for s in transitions if len(transitions[s]) > 0]
    if not start_states:
        raise ValueError("transitions must contain at least one state with actions")

    rng = np.random.default_rng(seed)
    for _ in range(episodes):
        # Draw an index instead of calling rng.choice(start_states): choice()
        # converts its argument to a NumPy array, which turns tuple states into
        # unhashable array rows and casts mixed-type states to a common dtype.
        s = start_states[rng.integers(len(start_states))]
        # simple rollout until termination or length cap
        for _step in range(max_steps):
            a = policy(s)
            # sample next transition according to dynamics
            probs, next_states, rewards, dones = zip(*transitions[s][a])
            idx = rng.choice(len(probs), p=np.array(probs))
            s_next, r, done = next_states[idx], rewards[idx], dones[idx]

            # TD(0) update: V(s) <- V(s) + α [ r + γ V(s') - V(s) ]
            # A terminal successor contributes no bootstrapped value.
            target = r + (0 if done else gamma * V[s_next])
            V[s] += alpha * (target - V[s])

            s = s_next
            if done:
                break
    return V


C) Q-learning on a tiny chain (see how γ shapes behavior)

Environment: states 0..4 on a line; states 0 and 4 are terminal. Entering state 4 gives reward +1, entering state 0 gives −1, and every other move gives 0. Each episode starts in the middle (state 2). Actions: left/right.

In [6]:
import numpy as np

class ChainEnv:
    """
    Deterministic chain of `n` states 0..n-1 with terminal states at both ends.

    Entering state n-1 pays +1, entering state 0 pays -1, every other move
    pays 0. Episodes start in the middle of the chain (state n // 2, i.e. 2
    for the default n=5).

    Attributes:
        n:     number of states.
        start: state every episode begins in.
        s:     current state.
    """

    def __init__(self, n=5):
        """Build a chain of `n` states; raises ValueError if n < 3."""
        # Two terminal ends plus at least one interior state are required,
        # otherwise the start state would itself be terminal or out of range.
        if n < 3:
            raise ValueError("ChainEnv needs n >= 3 (two terminal ends and an interior state)")
        self.n = n
        self.start = n // 2  # middle
        self.reset()

    def reset(self):
        """Move back to the start state and return it."""
        self.s = self.start
        return self.s

    def step(self, a):
        """
        Apply action `a` (1 = right, anything else = left).

        Returns (state, reward, done, info). Stepping from a terminal state
        leaves the state unchanged and returns reward 0.0 with done=True.
        """
        last = self.n - 1
        if self.s == 0 or self.s == last:
            return self.s, 0.0, True, {}

        move = 1 if a == 1 else -1
        # Clamp so the state can never leave the chain.
        self.s = max(0, min(last, self.s + move))

        done = (self.s == 0) or (self.s == last)
        if self.s == last:
            reward = 1.0
        elif self.s == 0:
            reward = -1.0
        else:
            reward = 0.0
        return self.s, reward, done, {}

def q_learning_chain(gamma=0.9, alpha=0.1, eps=0.1, episodes=2000, n=5, seed=0, max_steps=50):
    """
    Tabular Q-learning with epsilon-greedy exploration on ChainEnv.

    gamma:     discount factor in the TD target.
    alpha:     learning rate of each update.
    eps:       probability of taking a uniformly random action.
    episodes:  number of training episodes.
    n:         number of states in the chain.
    seed:      seed for the NumPy random generator used for exploration.
    max_steps: cap on the number of steps in one episode.

    Returns (Q, V, Pi):
        Q  - array of shape (n, 2) with action values (column 0 = left, 1 = right),
        V  - per-state maximum of Q over actions,
        Pi - per-state greedy action (argmax of Q; ties resolve to left).
    """
    rng = np.random.default_rng(seed)
    env = ChainEnv(n=n)
    Q = np.zeros((n, 2))  # actions: left(0), right(1)

    def eps_greedy(s):
        # Explore with probability eps, otherwise act greedily on current Q.
        if rng.random() < eps:
            return int(rng.integers(2))
        return int(np.argmax(Q[s]))

    for _ in range(episodes):
        s = env.reset()
        for _step in range(max_steps):
            a = eps_greedy(s)
            s2, r, done, _info = env.step(a)

            # Q-learning update:
            # Q(s,a) ← Q(s,a) + α [ r + γ max_a' Q(s',a') − Q(s,a) ]
            # A terminal successor has no future value to bootstrap from.
            best_next = 0.0 if done else np.max(Q[s2])
            td_target = r + gamma * best_next
            Q[s, a] += alpha * (td_target - Q[s, a])

            s = s2
            if done:
                break

    V = np.max(Q, axis=1)
    Pi = np.argmax(Q, axis=1)
    return Q, V, Pi

if __name__ == "__main__":
    # Train once per discount factor and show the learned values and greedy policy.
    for g in (0.5, 0.9, 0.99):
        Q, V, Pi = q_learning_chain(gamma=g)
        print(f"\nGamma={g}")
        print("State values:", np.round(V, 3))
        print("Greedy policy (0=left,1=right):", Pi)



Gamma=0.5
State values: [0.   0.25 0.5  1.   0.  ]
Greedy policy (0=left,1=right): [0 1 1 1 0]

Gamma=0.9
State values: [0.   0.81 0.9  1.   0.  ]
Greedy policy (0=left,1=right): [0 1 1 1 0]

Gamma=0.99
State values: [0.   0.98 0.99 1.   0.  ]
Greedy policy (0=left,1=right): [0 1 1 1 0]
