In [None]:
import numpy as np

class PolicyIteration:
    """Policy iteration on an ``n`` x ``n`` grid with deterministic moves.

    Each cell holds a value estimate (``grid``) and a policy action
    (``policy``) stored as a ``(row_delta, col_delta)`` pair. The reward
    used in every backup is the reward of the cell the move starts from.

    Attributes:
        n: Side length of the square grid.
        rewards: Per-cell rewards, indexed as ``rewards[row, col]``.
        gamma: Discount factor applied to the successor cell's value.
        theta: Evaluation stops once the largest per-sweep change in any
            cell value falls below this threshold.
        grid: ``(n, n)`` float array of value estimates, initially zero.
        policy: ``(n, n, 2)`` int array of actions, initially ``(0, 0)``
            (stay in place) for every cell.
    """

    def __init__(self, n: int, rewards, gamma: float = 0.9,
                 theta: float = 1e-4) -> None:
        """Set up a zero value table and an all-``(0, 0)`` policy.

        Args:
            n: Side length of the square grid.
            rewards: Array-like of per-cell rewards indexed ``[row, col]``.
                Nested lists are accepted; an existing ndarray is stored
                as-is without copying.
            gamma: Discount factor.
            theta: Convergence threshold for policy evaluation.
        """
        self.n = n
        # np.asarray lets callers pass plain nested lists, which would
        # otherwise fail on the ``rewards[i, j]`` indexing used below.
        self.rewards = np.asarray(rewards)
        self.gamma = gamma
        self.theta = theta
        self.grid = np.zeros((n, n))
        self.policy = np.zeros((n, n, 2), dtype=int)

    def get_possible_actions(self) -> list:
        """Return the four unit moves as ``(row_delta, col_delta)`` pairs."""
        return [(0, 1), (1, 0), (0, -1), (-1, 0)]  # Right, Down, Left, Up

    def is_valid(self, x: int, y: int) -> bool:
        """Return True when ``(x, y)`` lies inside the grid."""
        return 0 <= x < self.n and 0 <= y < self.n

    def policy_evaluation(self) -> None:
        """Sweep the grid until values settle under the current policy.

        Every sweep reads the previous sweep's values and writes into a
        fresh copy, so updates within a sweep do not see each other. A
        cell whose policy action would leave the grid keeps its previous
        value. The loop ends when the largest change in a sweep is below
        ``theta``.
        """
        while True:
            delta = 0
            new_grid = np.copy(self.grid)
            for i in range(self.n):
                for j in range(self.n):
                    di, dj = self.policy[i, j]
                    ni, nj = i + di, j + dj
                    if self.is_valid(ni, nj):
                        new_grid[i, j] = (
                            self.rewards[i, j] + self.gamma * self.grid[ni, nj]
                        )
                    delta = max(delta, abs(new_grid[i, j] - self.grid[i, j]))
            self.grid = new_grid
            if delta < self.theta:
                break

    def policy_improvement(self) -> bool:
        """Make the policy greedy with respect to the current values.

        For each cell, every in-grid move is scored as
        ``rewards[i, j] + gamma * grid[next]`` and the best one is stored.
        Ties on value are broken by the larger action tuple. A cell with
        no in-grid move (only possible when ``n == 1``) keeps its current
        action.

        Returns:
            True when no cell's action changed, False otherwise.
        """
        policy_stable = True
        for i in range(self.n):
            for j in range(self.n):
                old_action = tuple(self.policy[i, j])
                candidates = []
                for action in self.get_possible_actions():
                    ni, nj = i + action[0], j + action[1]
                    if self.is_valid(ni, nj):
                        value = self.rewards[i, j] + self.gamma * self.grid[ni, nj]
                        candidates.append((value, action))
                if not candidates:
                    # No neighbour to move to: max() on an empty list
                    # would raise, so leave this cell's action untouched.
                    continue
                _, best_action = max(candidates)
                if best_action != old_action:
                    policy_stable = False
                self.policy[i, j] = best_action
        return policy_stable

    def policy_iteration(self) -> None:
        """Alternate evaluation and improvement until the policy is stable."""
        while True:
            self.policy_evaluation()
            if self.policy_improvement():
                break

    def print_results(self) -> None:
        """Print the rounded value table and the policy drawn as arrows.

        Actions that are not one of the four unit moves are shown as ``.``.
        """
        print("Optimal Value Function:")
        print(np.round(self.grid, 2))
        print("Optimal Policy:")
        direction_map = {(0, 1): '→', (1, 0): '↓', (0, -1): '←', (-1, 0): '↑'}
        for i in range(self.n):
            for j in range(self.n):
                action = tuple(self.policy[i, j])
                print(direction_map.get(action, '.'), end=' ')
            print()

if __name__ == "__main__":
    # Demo run on a 5x5 grid with two non-zero cells in the last column.
    n = 5
    rewards = np.zeros((n, n))
    # (1, 4) carries the goal reward +1, (3, 4) the negative reward -1.
    rewards[[1, 3], [4, 4]] = [1, -1]
    pi = PolicyIteration(n, rewards)
    pi.policy_iteration()
    pi.print_results()
