In [6]:
import numpy as np

class GridWorld:
    """Deterministic grid world with a single terminal goal cell.

    States are ``(row, col)`` tuples. A move that would leave the grid is
    clamped, so the agent stays where it is. Entering the goal cell yields a
    reward of 10 and ends the episode; every other transition yields -1.

    Args:
        rows: Number of grid rows (default 4).
        cols: Number of grid columns (default 4).
        start: Cell occupied after construction and after ``reset()``
            (default ``(0, 0)``).
        goal: Terminal cell. ``None`` (default) selects the bottom-right
            corner ``(rows - 1, cols - 1)``.

    Raises:
        ValueError: If the grid has no cells, or ``start``/``goal`` is not a
            cell inside the grid.
    """

    def __init__(self, rows=4, cols=4, start=(0, 0), goal=None):
        if rows < 1 or cols < 1:
            raise ValueError(f"grid must be at least 1x1, got {rows}x{cols}")
        self.rows = rows
        self.cols = cols
        # Stored as tuples so equality checks against (row, col) states work
        # even if the caller passed lists.
        self.start = tuple(start)
        self.goal = (rows - 1, cols - 1) if goal is None else tuple(goal)
        for label, cell in (("start", self.start), ("goal", self.goal)):
            if not self._in_bounds(cell):
                raise ValueError(f"{label} cell {cell} is outside the {rows}x{cols} grid")
        self.state = self.start
        self.actions = [0, 1, 2, 3]  # Up, Right, Down, Left

    def _in_bounds(self, cell):
        """Return True if ``cell`` is a (row, col) pair inside the grid."""
        return (
            len(cell) == 2
            and 0 <= cell[0] < self.rows
            and 0 <= cell[1] < self.cols
        )

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` from the current state.

        Args:
            action: 0 = up, 1 = right, 2 = down, 3 = left. Any other value
                leaves the position unchanged but is still charged as a step.

        Returns:
            Tuple ``(next_state, reward, done)``: reward is 10 and ``done`` is
            True when the resulting cell is the goal, otherwise -1 and False.
        """
        row, col = self.state
        if action == 0:
            row = max(0, row - 1)
        elif action == 1:
            col = min(self.cols - 1, col + 1)
        elif action == 2:
            row = min(self.rows - 1, row + 1)
        elif action == 3:
            col = max(0, col - 1)

        self.state = (row, col)
        if self.state == self.goal:
            return self.state, 10, True
        return self.state, -1, False

def run_value_iteration(env=None, gamma=0.9, theta=1e-4):
    """Run in-place value iteration on a grid environment and print the result.

    The environment is used as a model: for every non-goal cell the agent is
    placed on that cell and each action is tried once via ``env.step``.

    Args:
        env: Environment exposing ``rows``, ``cols``, ``goal``, ``actions``,
            a writable ``state`` attribute and ``step(action)`` returning
            ``(next_state, reward, done)`` with ``next_state`` a (row, col)
            tuple. ``None`` (default) creates a fresh ``GridWorld()``.
        gamma: Discount factor (default 0.9).
        theta: Stop once the largest value change in a sweep is below this
            threshold (default 1e-4).

    Returns:
        The value table as a ``(env.rows, env.cols)`` NumPy array. The goal
        cell is never updated and stays at 0.
    """
    if env is None:
        env = GridWorld()

    # Size the table from the environment rather than assuming a 4x4 grid.
    V = np.zeros((env.rows, env.cols))

    print("Running Value Iteration...")
    iteration = 0
    # Planning teleports the agent around; remember where it really was.
    saved_state = env.state
    try:
        while True:
            delta = 0
            for i in range(env.rows):
                for j in range(env.cols):
                    if (i, j) == env.goal:
                        continue

                    previous = V[i, j]
                    backups = []
                    # Check all possible actions (Model Based)
                    for action in env.actions:
                        env.state = (i, j)
                        next_s, reward, _ = env.step(action)
                        backups.append(reward + gamma * V[next_s])

                    V[i, j] = max(backups)
                    delta = max(delta, abs(previous - V[i, j]))

            iteration += 1
            if delta < theta:
                break
    finally:
        env.state = saved_state

    print(f"Converged in {iteration} iterations.")
    print("Optimal Value Table:\n", np.round(V, 2))
    return V

# Entry point: run the value-iteration demo when this code executes as the main module.
if __name__ == "__main__":
    run_value_iteration()

Running Value Iteration...
Converged in 7 iterations.
Optimal Value Table:
 [[ 1.81  3.12  4.58  6.2 ]
 [ 3.12  4.58  6.2   8.  ]
 [ 4.58  6.2   8.   10.  ]
 [ 6.2   8.   10.    0.  ]]


In [1]:
# exp2_value_iteration.py

import numpy as np

class GridWorld4x4:
    """4x4 deterministic grid whose states are integers in row-major order.

    Attributes set by the constructor: ``n_rows``, ``n_cols``, ``n_states``
    (their product), ``n_actions`` (4), plus the ``start_state``,
    ``goal_state`` and ``max_steps`` arguments; ``state`` is initialised to
    ``start_state``.
    """

    def __init__(self, start_state=0, goal_state=15, max_steps=100):
        # Grid geometry.
        self.n_rows, self.n_cols = 4, 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4
        # Episode configuration supplied by the caller.
        self.start_state = start_state
        self.goal_state = goal_state
        self.max_steps = max_steps
        self.state = start_state

    def state_to_xy(self, s):
        """Convert a flat state index into its ``(row, col)`` pair."""
        return divmod(s, self.n_cols)

    def xy_to_state(self, r, c):
        """Convert ``(row, col)`` coordinates into the flat state index."""
        return c + r * self.n_cols

    def step_from(self, s, action):
        """Return ``(next_state, reward, done)`` for taking ``action`` in ``s``.

        Actions: 0 = up, 1 = right, 2 = down, 3 = left; moves off the grid are
        clamped and any other action value leaves the position unchanged.
        The reward is 0 when the resulting state is the goal, -1 otherwise.
        This method does not read or modify ``self.state``.
        """
        row, col = self.state_to_xy(s)
        bottom_row = self.n_rows - 1
        right_col = self.n_cols - 1

        if action == 0:
            row = max(0, row - 1)
        elif action == 1:
            col = min(right_col, col + 1)
        elif action == 2:
            row = min(bottom_row, row + 1)
        elif action == 3:
            col = max(0, col - 1)

        next_state = self.xy_to_state(row, col)
        at_goal = (next_state == self.goal_state)
        return next_state, (0 if at_goal else -1), at_goal

def _action_values(env, V, s, gamma):
    """Return the one-step lookahead value of every action taken from ``s``."""
    values = []
    for a in range(env.n_actions):
        ns, r, done = env.step_from(s, a)
        # A terminal successor has no future return, so do not bootstrap from it.
        values.append(r if done else r + gamma * V[ns])
    return values


def value_iteration(env, gamma=1.0, theta=1e-6, max_iterations=None):
    """Compute state values and a greedy policy by in-place value iteration.

    Args:
        env: Model exposing ``n_states``, ``n_actions``, ``goal_state`` and
            ``step_from(s, a) -> (next_state, reward, done)``.
        gamma: Discount factor (default 1.0, i.e. undiscounted).
        theta: Stop once the largest value change in a sweep is below this
            threshold (default 1e-6).
        max_iterations: Optional cap on the number of sweeps. ``None``
            (default) means no cap. Note that with ``gamma=1.0`` the
            iteration does not converge if some state can never reach a
            terminal transition, so a cap is advisable for untrusted models.

    Returns:
        Tuple ``(V, policy)``: ``V`` is a float array of length
        ``env.n_states`` (the goal entry stays 0); ``policy`` is an int array
        holding the first maximising action per state (the goal entry is left
        at 0 and carries no meaning).

    Raises:
        RuntimeError: If ``max_iterations`` sweeps complete without the value
            change dropping below ``theta``.
    """
    V = np.zeros(env.n_states)
    sweeps = 0
    while True:
        delta = 0.0
        for s in range(env.n_states):
            if s == env.goal_state:
                continue
            v_old = V[s]
            V[s] = max(_action_values(env, V, s, gamma))
            delta = max(delta, abs(v_old - V[s]))
        sweeps += 1
        if delta < theta:
            break
        if max_iterations is not None and sweeps >= max_iterations:
            raise RuntimeError(
                f"value iteration did not converge within {max_iterations} sweeps "
                f"(last delta={delta})"
            )

    # Greedy policy extraction reuses the same lookahead as the sweeps above.
    policy = np.zeros(env.n_states, dtype=int)
    for s in range(env.n_states):
        if s == env.goal_state:
            continue
        policy[s] = int(np.argmax(_action_values(env, V, s, gamma)))
    return V, policy

if __name__ == "__main__":
    # Solve the default environment (start 0, goal 15) with undiscounted returns.
    env = GridWorld4x4()
    V_opt, pi_opt = value_iteration(env, gamma=1.0, theta=1e-6)
    # Both results are flat length-16 vectors; reshape to the 4x4 grid layout for display.
    print("Optimal value function V*:")
    print(np.round(V_opt.reshape(4, 4), 2))
    print("\nOptimal policy (0:U,1:R,2:D,3:L):")
    print(pi_opt.reshape(4, 4))


Optimal value function V*:
[[-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]
 [-2. -1.  0.  0.]]

Optimal policy (0:U,1:R,2:D,3:L):
[[1 1 1 2]
 [1 1 1 2]
 [1 1 1 2]
 [1 1 1 0]]
