#### Problem 3: 5x5 Gridworld

#### Changes in Value Iteration Agent

In [17]:
import numpy as np
from gridworld import GridWorld
from value_iteration_agent import Agent

In [18]:
class Agent():
    def __init__(self, env, theta_threshold=0.01):
        # Initialize the agent with the environment and parameters
        self.env_size = env.get_size()
        self.env = env
        # Value function initialization
        self.V = np.zeros((self.env_size, self.env_size))  
        # Define the terminal state
        self.terminal_state = (4, 4)
        # Value of the terminal state is zero  
        self.V[self.terminal_state] = 0  
        # Threshold for convergence
        self.theta_threshold = theta_threshold 
        # Get the list of possible actions 
        self.actions = env.get_actions() 
        # Discount factor 
        self.gamma = 1.0  
        # Initialize policy
        self.pi_greedy = np.zeros((self.env_size, self.env_size), dtype=int)  

    def calculate_max_value(self, i, j):
        # Calculate the maximum value and best action for state (i, j)
        max_value = float('-inf')
        best_action = None
        best_actions_str = ""
        for action_index in range(len(self.actions)):
            next_i, next_j, reward, _ = self.env.step(action_index, i, j)
            if self.env.is_valid_state(next_i, next_j):
                value = self.get_value(next_i, next_j, reward)
                if value >= max_value:
                    if value > max_value:
                        best_actions_str = self.env.action_description[action_index]
                    else:
                        best_actions_str += "|" + self.env.action_description[action_index]

                    best_action = action_index
                    max_value = value
        return max_value, best_action, best_actions_str

    def get_value(self, i, j, reward):
        # Calculate the value of a state given a reward
        return reward + self.gamma * self.V[i, j]

    def update_value_function(self, V):
        # Update the value function
        self.V = np.copy(V)

    def get_value_function(self):
        # Return the current value function
        return self.V

    def update_greedy_policy(self):
        # Update the policy to be greedy with respect to the value function
        self.pi_str = []
        for i in range(self.env_size):
            pi_row = []
            for j in range(self.env_size):
                if self.env.is_terminal_state(i,j):
                    pi_row.append("X")  # Mark terminal state in the policy
                    continue
                _, self.pi_greedy[i,j], action_str = self.calculate_max_value(i, j)
                pi_row.append(action_str)
            self.pi_str.append(pi_row)

    def is_done(self, new_V):
        # Check if the value function has converged
        delta = abs(self.V - new_V)
        max_delta = delta.max()
        return max_delta <= self.theta_threshold

    def get_policy(self):
        # Return the greedy policy
        return self.pi_greedy

    def print_policy(self):
        # Print the policy
        for row in self.pi_str:
            print(row)

#### Changes in Value Iteration Solved

In [19]:
def main():
    ENV_SIZE = 5
    THETA_THRESHOLD = 0.05
    MAX_ITERATIONS = 1000
    # Initialize the environment and agent
    env = GridWorld(ENV_SIZE)
    agent = Agent(env, THETA_THRESHOLD)
    # Perform value iteration
    done = False
    for iter in range(MAX_ITERATIONS):
        if done: 
            break
        # Copy the current value function to new_V
        new_V = np.copy(agent.get_value_function())
        for i in range(ENV_SIZE):
            for j in range(ENV_SIZE):
                if not env.is_terminal_state(i, j):
                    # Calculate the maximum value for each state
                    new_V[i, j], _, _ = agent.calculate_max_value(i, j)
        # Check if the value function has converged
        done = agent.is_done(new_V)
        # Update the agent's value function
        agent.update_value_function(new_V)
    # optimal value function
    print("Optimal Value Function Found in %d iterations:" % (iter + 1))
    print(agent.get_value_function())
    # Update and print the greedy policy
    agent.update_greedy_policy()
    agent.print_policy()

if __name__ == "__main__":
    main()

Optimal Value Function Found in 9 iterations:
[[-7. -6. -5. -4. -3.]
 [-6. -5. -4. -3. -2.]
 [-5. -4. -3. -2. -1.]
 [-4. -3. -2. -1.  0.]
 [-3. -2. -1.  0.  0.]]
['Right|Down', 'Right|Down', 'Right|Down', 'Right|Down', 'Down']
['Right|Down', 'Right|Down', 'Right|Down', 'Right|Down', 'Down']
['Right|Down', 'Right|Down', 'Right|Down', 'Right|Down', 'Down']
['Right|Down', 'Right|Down', 'Right|Down', 'Right|Down', 'Down']
['Right', 'Right', 'Right', 'Right', 'X']


### Summary

**Value Iteration Agent**
*Updated and implemented methods for calculating the maximum value, checking convergence and updating the policy.*

**Value Iteration Solved**
*Main function to perform value iterartion using the agent in th place updates and convergence check.*


*Hence, these updtes implement the value iteration algorithm, including in-place updates and policy improvement to find the optimal state-value function and its policy for Grid-World environment.*