In [3]:
import numpy as np

# Grid world configuration
GRID_SIZE = 5          # The world is a GRID_SIZE x GRID_SIZE square
REWARD_GOAL = 10       # Reward attached to the goal cell
REWARD_STEP = -1       # Reward attached to every other cell
DISCOUNT_FACTOR = 0.9  # Weight applied to the value of the next state
THRESHOLD = 1e-4       # Convergence threshold

# Each move label is paired with its (row, column) offset; the pairs are then
# split into the two parallel lists (same order: up, down, left, right).
action_names, actions = map(list, zip(
    ('U', (-1, 0)),
    ('D', (1, 0)),
    ('L', (0, -1)),
    ('R', (0, 1)),
))

def is_valid(state):
    """Return True when both coordinates of ``state`` lie inside the grid.

    ``state`` is unpacked into exactly two coordinates; each must satisfy
    ``0 <= coordinate < GRID_SIZE``.
    """
    row, col = state
    # Reject as soon as the first coordinate is out of range.
    if not 0 <= row < GRID_SIZE:
        return False
    return 0 <= col < GRID_SIZE

# Initialize rewards and value function
# Rewards are stored as floats: without an explicit dtype, np.full infers an
# integer array from REWARD_STEP, and assigning a fractional REWARD_GOAL into
# it would be silently truncated.
rewards = np.full((GRID_SIZE, GRID_SIZE), REWARD_STEP, dtype=float)
rewards[GRID_SIZE - 1, GRID_SIZE - 1] = REWARD_GOAL  # Goal state (last row, last column)

values = np.zeros((GRID_SIZE, GRID_SIZE))  # Value function, starts at zero everywhere
policy = np.full((GRID_SIZE, GRID_SIZE), '', dtype='<U1')  # One action letter per cell ('' = none chosen yet)

def value_iteration():
    """Run value iteration over the grid until the value estimates settle.

    Every sweep computes, for each non-goal cell, the best one-step backup
    ``rewards[next] + DISCOUNT_FACTOR * values[next]`` over the moves that stay
    inside the grid. All backups in a sweep read the values from the previous
    sweep. Ties keep the move that appears first in ``actions``.

    Side effects: rebinds the module-level ``values`` to the new estimates and
    writes the chosen move letter for each non-goal cell into the module-level
    ``policy`` in place. The goal cell is never updated. Sweeping stops once
    the largest per-cell change in a sweep is below ``THRESHOLD``.
    """
    global values, policy

    goal = (GRID_SIZE - 1, GRID_SIZE - 1)
    converged = False

    while not converged:
        largest_change = 0  # Biggest per-cell change seen in this sweep
        updated = np.copy(values)

        # np.ndindex walks the cells in row-major order, like nested range loops.
        for row, col in np.ndindex(GRID_SIZE, GRID_SIZE):
            if (row, col) == goal:
                # The goal cell keeps its current value and policy entry.
                continue

            top_score, top_move = float('-inf'), ''

            for (d_row, d_col), label in zip(actions, action_names):
                target = (row + d_row, col + d_col)
                if not is_valid(target):
                    # Moves that would leave the grid are not considered.
                    continue

                score = rewards[target] + DISCOUNT_FACTOR * values[target]
                # Strict comparison: an equal score does not replace the earlier move.
                if score > top_score:
                    top_score, top_move = score, label

            updated[row, col] = top_score
            policy[row, col] = top_move

            change = abs(updated[row, col] - values[row, col])
            largest_change = max(largest_change, change)

        values = updated
        converged = largest_change < THRESHOLD

def print_policy():
    """Write the policy grid to stdout, one row per line.

    The action letters of a row are joined with single spaces.
    """
    for line in map(' '.join, policy):
        print(line)

# Run value iteration and print the results.
# value_iteration() must run first: it fills the module-level `policy` that
# print_policy() reads.
value_iteration()
print("Optimal Policy:")
print_policy()

Optimal Policy:
D D D D D
D D D D D
D D D D D
D D D D D
R R R R 
