<a href="https://colab.research.google.com/github/2303a51015/AIML-2025/blob/main/RL_LAB2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import gymnasium as gym
import numpy as np


# Build the FrozenLake-v1 environment with slipping disabled.
# NOTE(review): is_slippery=False presumably makes every move deterministic
# -- confirm against the Gymnasium FrozenLake documentation.
env = gym.make("FrozenLake-v1", is_slippery=False)

# The solvers below read the transition table `P` from the unwrapped
# environment; the sizes of the discrete state and action spaces drive
# their sweeps over states and actions.
unwrapped_env = env.unwrapped
n_states = unwrapped_env.observation_space.n
n_actions = unwrapped_env.action_space.n


# gamma: discount applied to the next state's value in every backup.
# theta: a sweep loop stops once the largest per-state value change in a
#        full sweep falls below this threshold.
gamma = 0.99
theta = 1e-8

def value_iteration(env, *, discount=None, tol=None):
    """
    Performs Value Iteration to find the optimal value function and policy.

    Args:
        env: Environment carrying a tabular model. The environment itself, or
            its ``unwrapped`` attribute when it has one, must expose
            ``P[s][a]`` as an iterable of ``(prob, next_state, reward, done)``
            tuples together with ``observation_space.n`` and
            ``action_space.n``.
        discount: Discount factor applied to the next state's value. Falls
            back to the module-level ``gamma`` when omitted.
        tol: Sweeps stop once the largest per-state value change in a full
            sweep drops below this threshold. Falls back to the module-level
            ``theta`` when omitted.

    Returns:
        A ``(policy, V)`` tuple: ``policy`` is an integer array holding the
        greedy action for every state and ``V`` the converged state values.
    """
    # Solve the environment that was actually passed in instead of silently
    # reading a module-level one; wrappers are peeled off via ``unwrapped``.
    model = getattr(env, "unwrapped", env)
    transitions = model.P
    num_states = model.observation_space.n
    num_actions = model.action_space.n
    if discount is None:
        discount = gamma
    if tol is None:
        tol = theta

    def action_values(state, values):
        """Return the one-step lookahead value of every action in ``state``."""
        return [
            sum(
                # A transition flagged ``done`` ends the episode, so no
                # future value is bootstrapped from its successor state.
                prob * (reward + (0.0 if done else discount * values[next_state]))
                for prob, next_state, reward, done in transitions[state][action]
            )
            for action in range(num_actions)
        ]

    V = np.zeros(num_states)
    while True:
        delta = 0
        for s in range(num_states):
            best = max(action_values(s, V))
            delta = max(delta, abs(V[s] - best))
            # In-place update: later states in this sweep already see it.
            V[s] = best
        if delta < tol:
            break

    # Extract the greedy policy; np.argmax keeps the lowest-index action
    # when several actions tie.
    policy = np.zeros(num_states, dtype=int)
    for s in range(num_states):
        policy[s] = np.argmax(action_values(s, V))

    return policy, V

def policy_iteration(env, *, discount=None, tol=None):
    """
    Performs Policy Iteration to find the optimal policy and value function.

    Args:
        env: Environment carrying a tabular model. The environment itself, or
            its ``unwrapped`` attribute when it has one, must expose
            ``P[s][a]`` as an iterable of ``(prob, next_state, reward, done)``
            tuples together with ``observation_space.n`` and
            ``action_space.n``.
        discount: Discount factor applied to the next state's value. Falls
            back to the module-level ``gamma`` when omitted.
        tol: Policy evaluation stops once the largest per-state value change
            in a full sweep drops below this threshold. Falls back to the
            module-level ``theta`` when omitted.

    Returns:
        A ``(policy, V)`` tuple: ``policy`` is an integer array holding the
        chosen action for every state and ``V`` the values of that policy.
    """
    # Solve the environment that was actually passed in instead of silently
    # reading a module-level one; wrappers are peeled off via ``unwrapped``.
    model = getattr(env, "unwrapped", env)
    transitions = model.P
    num_states = model.observation_space.n
    num_actions = model.action_space.n
    if discount is None:
        discount = gamma
    if tol is None:
        tol = theta

    def backup(state, action, values):
        """Return the expected one-step return of ``action`` in ``state``."""
        return sum(
            # A transition flagged ``done`` ends the episode, so no future
            # value is bootstrapped from its successor state.
            prob * (reward + (0.0 if done else discount * values[next_state]))
            for prob, next_state, reward, done in transitions[state][action]
        )

    # Start from the policy that always picks action 0.
    policy = np.zeros(num_states, dtype=int)
    V = np.zeros(num_states)

    while True:
        # Policy evaluation: sweep in place until V settles for this policy.
        while True:
            delta = 0
            for s in range(num_states):
                v = backup(s, policy[s], V)
                delta = max(delta, abs(V[s] - v))
                V[s] = v
            if delta < tol:
                break

        # Policy improvement: act greedily with respect to the evaluated V.
        # np.argmax keeps the lowest-index action when several actions tie.
        policy_stable = True
        for s in range(num_states):
            old_action = policy[s]
            new_action = np.argmax([backup(s, a, V) for a in range(num_actions)])
            policy[s] = new_action
            if old_action != new_action:
                policy_stable = False

        # No state changed its action, so the policy is greedy for its own V.
        if policy_stable:
            break

    return policy, V

def run_policy(env, policy):
    """
    Plays a single episode, always taking the action the policy assigns to
    the current state, and returns the sum of the rewards received.

    The episode ends as soon as the environment reports either termination
    or truncation.
    """
    observation, _info = env.reset()
    episode_return = 0
    finished = False
    while not finished:
        chosen = policy[observation]
        observation, step_reward, terminated, truncated, _info = env.step(chosen)
        episode_return += step_reward
        finished = terminated or truncated
    return episode_return


if __name__ == "__main__":
    # Solve the environment with both dynamic-programming methods and roll
    # each resulting policy out for 100 episodes. The try/finally guarantees
    # the environment is closed even if a solver or a rollout raises.
    try:
        print("--- Running Value Iteration ---")
        vi_policy, vi_V = value_iteration(unwrapped_env)
        vi_rewards = [run_policy(env, vi_policy) for _ in range(100)]
        print(f"Optimal Policy (Value Iteration): {vi_policy}")
        print(f"Average Reward over 100 episodes: {np.mean(vi_rewards)}\n")

        print("--- Running Policy Iteration ---")
        pi_policy, pi_V = policy_iteration(unwrapped_env)
        pi_rewards = [run_policy(env, pi_policy) for _ in range(100)]
        print(f"Optimal Policy (Policy Iteration): {pi_policy}")
        print(f"Average Reward over 100 episodes: {np.mean(pi_rewards)}")
    finally:
        env.close()

--- Running Value Iteration ---
Optimal Policy (Value Iteration): [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
Average Reward over 100 episodes: 1.0

--- Running Policy Iteration ---
Optimal Policy (Policy Iteration): [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
Average Reward over 100 episodes: 1.0
