In [6]:
import numpy as np

class GridWorld:
    def __init__(self):
        self.rows = 4
        self.cols = 4
        self.state = (0, 0)
        self.goal = (3, 3)
        self.actions = [0, 1, 2, 3] # Up, Right, Down, Left

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        x, y = self.state
        if action == 0:   x = max(0, x - 1)
        elif action == 1: y = min(self.cols - 1, y + 1)
        elif action == 2: x = min(self.rows - 1, x + 1)
        elif action == 3: y = max(0, y - 1)
        
        self.state = (x, y)
        if self.state == self.goal: return self.state, 10, True
        else: return self.state, -1, False

def run_rollout():
    env = GridWorld()
    V = np.zeros((4, 4))
    
    # Pre-calculate Value Table
    for _ in range(50):
        for r in range(4):
            for c in range(4):
                if (r,c) == env.goal: continue
                vals = []
                for a in env.actions:
                    env.state = (r,c)
                    ns, rew, _ = env.step(a)
                    vals.append(rew + 0.9 * V[ns])
                V[r,c] = max(vals) if vals else 0

    print("Optimal Value Table (Pre-calculated):\n", np.round(V, 2))
    
    # Rollout
    print("\nRolling out from (0,0)...")
    state = env.reset()
    path = [state]
    done = False
    
    # Added a safety break to prevent infinite loops if something goes wrong
    steps = 0
    max_steps = 20 

    while not done and steps < max_steps:
        best_action = -1
        best_val = -float('inf')
        
        # Greedy Selection
        for action in env.actions:
            env.state = state # Reset environment to current state to test action
            next_s, _, _ = env.step(action) # This moves env.state!
            if V[next_s] > best_val:
                best_val = V[next_s]
                best_action = action
        
        # Reset environment state after evaluating actions
        env.state = state 
        # -------------------

        state, _, done = env.step(best_action)
        path.append(state)
        steps += 1
        
    print("Path taken:", path)

if __name__ == "__main__":
    run_rollout()

Optimal Value Table (Pre-calculated):
 [[ 1.81  3.12  4.58  6.2 ]
 [ 3.12  4.58  6.2   8.  ]
 [ 4.58  6.2   8.   10.  ]
 [ 6.2   8.   10.    0.  ]]

Rolling out from (0,0)...
Path taken: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3)]


In [1]:
# exp3_rollout_optimal_policy.py

import numpy as np

class GridWorld4x4:
    def __init__(self, start_state=0, goal_state=15, max_steps=100):
        self.n_rows = 4
        self.n_cols = 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4
        self.start_state = start_state
        self.goal_state = goal_state
        self.max_steps = max_steps
        self.state = start_state
        self.steps = 0

    def state_to_xy(self, s):
        return (s // self.n_cols, s % self.n_cols)

    def xy_to_state(self, r, c):
        return r * self.n_cols + c

    def reset(self, start_state=None):
        if start_state is not None:
            self.start_state = start_state
        self.state = self.start_state
        self.steps = 0
        return self.state

    def step(self, action):
        r, c = self.state_to_xy(self.state)
        if action == 0:
            r = max(0, r - 1)
        elif action == 1:
            c = min(self.n_cols - 1, c + 1)
        elif action == 2:
            r = min(self.n_rows - 1, r + 1)
        elif action == 3:
            c = max(0, c - 1)
        ns = self.xy_to_state(r, c)
        self.state = ns
        self.steps += 1
        done = (ns == self.goal_state) or (self.steps >= self.max_steps)
        reward = 0 if ns == self.goal_state else -1
        return ns, reward, done, {}

    def render(self):
        grid = np.full((self.n_rows, self.n_cols), '.')
        r, c = self.state_to_xy(self.state)
        gr, gc = self.state_to_xy(self.goal_state)
        grid[gr, gc] = 'G'
        grid[r, c] = 'A'
        print('\n'.join(' '.join(row) for row in grid))
        print()

def value_iteration(env, gamma=1.0, theta=1e-6):
    V = np.zeros(env.n_states)
    while True:
        delta = 0.0
        for s in range(env.n_states):
            if s == env.goal_state:
                continue
            v_old = V[s]
            q_vals = []
            for a in range(env.n_actions):
                # simulate one step from state s
                r0, c0 = env.state_to_xy(s)
                if a == 0:
                    r = max(0, r0 - 1)
                    c = c0
                elif a == 1:
                    r = r0
                    c = min(env.n_cols - 1, c0 + 1)
                elif a == 2:
                    r = min(env.n_rows - 1, r0 + 1)
                    c = c0
                else:
                    r = r0
                    c = max(0, c0 - 1)
                ns = env.xy_to_state(r, c)
                reward = 0 if ns == env.goal_state else -1
                q_vals.append(reward + gamma * V[ns])
            V[s] = max(q_vals)
            delta = max(delta, abs(v_old - V[s]))
        if delta < theta:
            break

    policy = np.zeros(env.n_states, dtype=int)
    for s in range(env.n_states):
        if s == env.goal_state:
            continue
        q_vals = []
        for a in range(env.n_actions):
            r0, c0 = env.state_to_xy(s)
            if a == 0:
                r = max(0, r0 - 1)
                c = c0
            elif a == 1:
                r = r0
                c = min(env.n_cols - 1, c0 + 1)
            elif a == 2:
                r = min(env.n_rows - 1, r0 + 1)
                c = c0
            else:
                r = r0
                c = max(0, c0 - 1)
            ns = env.xy_to_state(r, c)
            reward = 0 if ns == env.goal_state else -1
            q_vals.append(reward + gamma * V[ns])
        policy[s] = int(np.argmax(q_vals))
    return V, policy

def rollout(env, policy, start_state=0, max_steps=50):
    s = env.reset(start_state)
    traj = [s]
    acts = []
    rewards = []
    done = False
    while not done and len(traj) < max_steps:
        a = policy[s]
        s, r, done, _ = env.step(a)
        traj.append(s)
        acts.append(a)
        rewards.append(r)
    return traj, acts, rewards

if __name__ == "__main__":
    env = GridWorld4x4()
    V, pi = value_iteration(env)
    traj, acts, rewards = rollout(env, pi, start_state=0)
    print("Optimal policy rollout from (0,0):")
    print("States:", traj)
    print("Actions:", acts)
    print("Rewards:", rewards)


Optimal policy rollout from (0,0):
States: [0, 1, 2, 3, 7, 11, 15]
Actions: [np.int64(1), np.int64(1), np.int64(1), np.int64(2), np.int64(2), np.int64(2)]
Rewards: [-1, -1, -1, -1, -1, 0]
