In [5]:
import numpy as np

class ValueIteration:
    """Value iteration on an ``n`` x ``n`` grid world with deterministic moves.

    Every cell is a state. From a cell the agent may step one cell Right,
    Down, Left or Up; moves that would leave the grid are not available.
    The value of taking a move from cell ``(i, j)`` is
    ``rewards[i, j] + gamma * V(next cell)``, i.e. the reward is collected
    for the cell being left. No cell is treated as terminal.

    Attributes:
        n: Side length of the grid.
        rewards: ``(n, n)`` array of per-cell rewards.
        gamma: Discount factor applied to the successor cell's value.
        theta: Convergence threshold on the largest per-sweep value change.
        grid: ``(n, n)`` float array holding the current value estimates.
        policy: ``(n, n, 2)`` int array; ``policy[i, j]`` is the chosen
            ``(row delta, column delta)`` move, or ``(0, 0)`` if no move
            has been selected for that cell.
    """

    # Moves as (row delta, column delta). The order is also the tie-breaking
    # order used when several moves share the best value.
    _ACTIONS = ((0, 1), (1, 0), (0, -1), (-1, 0))  # Right, Down, Left, Up

    def __init__(self, n, rewards, gamma=0.9, theta=1e-4):
        """Set up a zero-initialised value table and an empty policy.

        Args:
            n: Side length of the square grid.
            rewards: ``n`` x ``n`` array-like of per-cell rewards. Nested
                lists are accepted; an ndarray is stored without copying.
            gamma: Discount factor.
            theta: Stop sweeping once the largest value change in a sweep
                is below this threshold.

        Raises:
            ValueError: If ``rewards`` does not have shape ``(n, n)``.
        """
        # Coerce so that plain nested lists support the [i, j] indexing
        # used by the backups below.
        rewards = np.asarray(rewards)
        if rewards.shape != (n, n):
            raise ValueError(
                f"rewards must have shape ({n}, {n}), got {rewards.shape}"
            )
        self.n = n
        self.rewards = rewards
        self.gamma = gamma
        self.theta = theta
        self.grid = np.zeros((n, n))
        self.policy = np.zeros((n, n, 2), dtype=int)

    def get_possible_actions(self):
        """Return the four moves as ``(row delta, column delta)`` tuples."""
        return list(self._ACTIONS)

    def is_valid(self, x, y):
        """Return True if ``(x, y)`` lies inside the grid."""
        return 0 <= x < self.n and 0 <= y < self.n

    def _action_values(self, i, j):
        """Yield ``(action, value)`` for every in-bounds move from ``(i, j)``.

        Values are computed from the current ``self.grid``.
        """
        for action in self.get_possible_actions():
            ni, nj = i + action[0], j + action[1]
            if self.is_valid(ni, nj):
                yield action, self.rewards[i, j] + self.gamma * self.grid[ni, nj]

    def value_iteration(self):
        """Sweep the grid until the values converge, then derive the policy.

        Each sweep reads only the previous sweep's values (synchronous
        update). The loop has no iteration cap: it ends only once the
        largest per-cell change in a sweep drops below ``theta``.
        """
        while True:
            delta = 0
            # Write into a copy so every backup in this sweep sees the
            # values from the previous sweep.
            new_grid = np.copy(self.grid)
            for i in range(self.n):
                for j in range(self.n):
                    values = [value for _, value in self._action_values(i, j)]
                    # A cell with no in-bounds move (only when n == 1)
                    # keeps its current value.
                    if values:
                        new_grid[i, j] = max(values)
                    delta = max(delta, abs(new_grid[i, j] - self.grid[i, j]))
            self.grid = new_grid
            if delta < self.theta:
                break
        self.update_policy()

    def update_policy(self):
        """Store, for every cell, the move with the highest value.

        Ties go to the earliest move in Right, Down, Left, Up order. A cell
        with no in-bounds move keeps its existing policy entry.
        """
        for i in range(self.n):
            for j in range(self.n):
                best_action = None
                best_value = -float('inf')
                for action, value in self._action_values(i, j):
                    # Strict comparison keeps the first of equally good moves.
                    if value > best_value:
                        best_value = value
                        best_action = action
                if best_action is not None:
                    self.policy[i, j] = best_action

    def print_results(self):
        """Print the rounded value table and the policy as arrows.

        Cells whose policy entry is not one of the four moves print as '.'.
        """
        print("Optimal Value Function:")
        print(np.round(self.grid, 2))
        print("Optimal Policy:")
        direction_map = {(0, 1): '→', (1, 0): '↓', (0, -1): '←', (-1, 0): '↑'}
        for i in range(self.n):
            for j in range(self.n):
                action = tuple(self.policy[i, j].tolist())
                print(direction_map.get(action, '.'), end=' ')
            print()

if __name__ == "__main__":
    # Demo: 5x5 grid that is zero everywhere except two cells in the last
    # column: a +1 goal at (1, 4) and a -1 penalty at (3, 4).
    n = 5
    rewards = np.zeros(shape=(n, n))
    rewards[[1, 3], [4, 4]] = [1, -1]

    # Solve the grid, then print the converged values and the greedy policy.
    vi = ValueIteration(n, rewards)
    vi.value_iteration()
    vi.print_results()


Optimal Value Function:
[[3.11 3.45 3.84 4.26 4.74]
 [3.45 3.84 4.26 4.74 5.26]
 [3.11 3.45 3.84 4.26 4.74]
 [2.8  3.11 3.45 3.84 3.26]
 [2.52 2.8  3.11 3.45 3.11]]
Optimal Policy:
→ → → → ↓ 
→ → → → ↓ 
→ → → → ↑ 
→ → → ↑ ↑ 
→ → → ↑ ← 
