In [1]:
import numpy as np
import random

class GridWorld:
    """Deterministic rectangular grid world with a single terminal goal cell.

    States are ``(row, col)`` tuples. Actions are the integers in
    ``self.actions``: 0 decrements the row, 1 increments the column,
    2 increments the row and 3 decrements the column. A move that would
    leave the grid keeps the agent on the boundary cell. Every step is
    rewarded with -1, except the step that lands on the goal, which is
    rewarded with +10 and ends the episode.

    Args:
        rows: Number of grid rows (default 4).
        cols: Number of grid columns (default 4).
        start: ``(row, col)`` cell the agent occupies after ``reset``
            (default ``(0, 0)``).
        goal: ``(row, col)`` terminal cell. ``None`` (default) selects the
            bottom-right corner ``(rows - 1, cols - 1)``.

    Raises:
        ValueError: If the grid has no cells, or ``start``/``goal`` lie
            outside the grid.
    """

    def __init__(self, rows=4, cols=4, start=(0, 0), goal=None):
        if rows < 1 or cols < 1:
            raise ValueError("rows and cols must both be at least 1")
        self.rows = rows
        self.cols = cols
        self.start = tuple(start)
        self.goal = (rows - 1, cols - 1) if goal is None else tuple(goal)
        for label, cell in (("start", self.start), ("goal", self.goal)):
            if not (0 <= cell[0] < rows and 0 <= cell[1] < cols):
                raise ValueError(
                    "{} {} is outside the {}x{} grid".format(label, cell, rows, cols)
                )
        self.state = self.start
        self.actions = [0, 1, 2, 3]

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        An action value outside 0-3 leaves the position unchanged but is
        still rewarded like any other step.
        """
        x, y = self.state
        if action == 0:
            x = max(0, x - 1)
        elif action == 1:
            y = min(self.cols - 1, y + 1)
        elif action == 2:
            x = min(self.rows - 1, x + 1)
        elif action == 3:
            y = max(0, y - 1)
        self.state = (x, y)
        if self.state == self.goal:
            return self.state, 10, True
        return self.state, -1, False

def run_td0(episodes=1000, alpha=0.1, gamma=0.9, seed=None, env=None):
    """Evaluate the uniform-random policy with tabular TD(0) and print the result.

    Args:
        episodes: Number of episodes to sample (default 1000).
        alpha: Step size of the TD update (default 0.1).
        gamma: Discount factor (default 0.9).
        seed: Optional seed for the action sampler. ``None`` (default) draws
            from the global ``random`` module, as the function always did.
        env: Environment to evaluate. It must expose ``rows``, ``cols``,
            ``actions``, ``reset()`` returning a ``(row, col)`` state and
            ``step(action)`` returning ``(next_state, reward, done)``.
            ``None`` (default) builds a fresh ``GridWorld()``.

    Returns:
        The learned value table as a ``(env.rows, env.cols)`` NumPy array.
    """
    if env is None:
        env = GridWorld()
    # Size the table from the environment instead of assuming a 4x4 grid.
    V = np.zeros((env.rows, env.cols))
    # A private generator makes seeded runs reproducible without disturbing
    # the global `random` state; unseeded runs keep using the module itself.
    rng = random if seed is None else random.Random(seed)

    print("Running TD(0)...")
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = rng.choice(env.actions)  # random policy under evaluation
            next_state, reward, done = env.step(action)

            # TD(0) bootstrap update; a finished episode contributes no
            # future value, so the bootstrap term is dropped when done.
            bootstrap = 0.0 if done else gamma * V[next_state]
            V[state] += alpha * (reward + bootstrap - V[state])

            state = next_state

    print("TD(0) Value Function:\n", np.round(V, 2))
    return V

# Run the TD(0) demo when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    run_td0()

Running TD(0)...
TD(0) Value Function:
 [[-8.97 -8.69 -8.05 -7.86]
 [-8.81 -8.27 -7.09 -5.86]
 [-8.38 -7.58 -4.62 -0.06]
 [-7.96 -5.85 -2.15  0.  ]]


In [2]:
# exp5_td0_learning.py

import numpy as np
import random

class GridWorld4x4:
    """4x4 grid world with integer states and a per-episode step limit.

    States are integers ``0 .. 15`` in row-major order
    (``state = row * 4 + col``). Actions are 0 (row - 1), 1 (col + 1),
    2 (row + 1) and 3 (col - 1); a move that would leave the grid keeps the
    agent on the boundary cell. Each step is rewarded with -1, except the
    step that lands on ``goal_state``, which is rewarded with 0.

    Args:
        start_state: State the agent occupies after ``reset`` (default 0).
        goal_state: State that ends the episode when entered (default 15).
        max_steps: Number of steps after which the episode is ended even if
            the goal has not been reached (default 100).
    """

    def __init__(self, start_state=0, goal_state=15, max_steps=100):
        self.n_rows = 4
        self.n_cols = 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4
        self.start_state = start_state
        self.goal_state = goal_state
        self.max_steps = max_steps
        # Initialise the episode state here so that step() is usable even
        # if the caller has not called reset() yet.
        self.state = self.start_state
        self.steps = 0

    def state_to_xy(self, s):
        """Convert a state index into its ``(row, col)`` pair."""
        return (s // self.n_cols, s % self.n_cols)

    def xy_to_state(self, r, c):
        """Convert a ``(row, col)`` pair into its state index."""
        return r * self.n_cols + c

    def reset(self):
        """Start a new episode at ``start_state`` and return that state."""
        self.state = self.start_state
        self.steps = 0
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        ``done`` is True when the goal is entered or when ``max_steps``
        steps have been taken in the current episode. ``info`` is always an
        empty dict. An action value outside 0-3 leaves the position
        unchanged but still counts as a step.
        """
        r, c = self.state_to_xy(self.state)
        if action == 0:
            r = max(0, r - 1)
        elif action == 1:
            c = min(self.n_cols - 1, c + 1)
        elif action == 2:
            r = min(self.n_rows - 1, r + 1)
        elif action == 3:
            c = max(0, c - 1)
        ns = self.xy_to_state(r, c)
        self.state = ns
        self.steps += 1
        done = (ns == self.goal_state) or (self.steps >= self.max_steps)
        reward = 0 if ns == self.goal_state else -1
        return ns, reward, done, {}

def fixed_policy(env, s):
    """Pick an action for state ``s`` that heads right first, then down.

    Returns action 1 (right) while the state's column is left of the goal's
    column, otherwise action 2 (down) while its row is above the goal's row.
    In every remaining case a random action index below ``env.n_actions``
    is drawn.
    """
    row, col = env.state_to_xy(s)
    goal_row, goal_col = env.state_to_xy(env.goal_state)

    if col < goal_col:
        return 1  # right
    if row < goal_row:
        return 2  # down
    # Neither preferred move applies: fall back to a random action.
    return random.randrange(env.n_actions)

def td0_policy_eval(env, policy_fn, alpha=0.1, gamma=1.0, episodes=2000):
    """Estimate the state-value function of a policy with tabular TD(0).

    Args:
        env: Environment exposing ``n_states``, ``reset()`` returning an
            integer state and ``step(action)`` returning
            ``(next_state, reward, done, info)``. If it also exposes
            ``goal_state``, that state is treated as terminal.
        policy_fn: Callable ``policy_fn(env, state) -> action``.
        alpha: Step size of the TD update (default 0.1).
        gamma: Discount factor (default 1.0).
        episodes: Number of episodes to sample (default 2000).

    Returns:
        NumPy array of length ``env.n_states`` holding the value estimates.
    """
    V = np.zeros(env.n_states)
    # Environments without a `goal_state` attribute fall back to always
    # bootstrapping, which is what this function did before.
    goal_state = getattr(env, "goal_state", None)
    for _ in range(episodes):
        s = env.reset()
        done = False
        while not done:
            a = policy_fn(env, s)
            ns, r, done, _ = env.step(a)
            # Entering the goal ends the episode for good, so no future
            # value may be bootstrapped from it. An episode that merely ran
            # out of steps (`done` without the goal) still bootstraps from
            # V[ns], because the task itself would have continued.
            reached_goal = goal_state is not None and ns == goal_state
            next_value = 0.0 if reached_goal else V[ns]
            V[s] += alpha * (r + gamma * next_value - V[s])
            s = ns
    return V

if __name__ == "__main__":
    # Evaluate the fixed right-then-down policy on the default 4x4 grid and
    # print the learned values reshaped to the grid layout.
    env = GridWorld4x4()
    V = td0_policy_eval(env, fixed_policy)
    print("TD(0) state-value function for fixed policy:")
    print(np.round(V.reshape(4, 4), 2))


TD(0) state-value function for fixed policy:
[[-5. -4. -3. -2.]
 [ 0.  0.  0. -1.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
