In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
import gymnasium as gym
import numpy as np
import warnings
from gymnasium.envs.registration import register
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# Ignore unnecessary warnings (like environment plugins)
warnings.filterwarnings("ignore", category=UserWarning)


# 1. ENVIRONMENT SETUP

# Default 4x4 Frozen Lake (slippery)
original_env = gym.make("FrozenLake-v1", is_slippery=True)

# Custom 4x4 Lake (not slippery, fixed map)
custom_map = [
    "SFFF",
    "FHFH",
    "FFFH",
    "HFFG"
]
register(
    id="FrozenLake-Custom-v0",
    entry_point="gymnasium.envs.toy_text:FrozenLakeEnv",
    kwargs={"desc": custom_map, "is_slippery": False},
)
custom_env = gym.make("FrozenLake-Custom-v0")

# Expanded 8x8 Frozen Lake (slippery, random)
register(
    id="FrozenLake-Expanded-v0",
    entry_point="gymnasium.envs.toy_text:FrozenLakeEnv",
    kwargs={"desc": generate_random_map(size=8), "is_slippery": True},
)
expanded_env = gym.make("FrozenLake-Expanded-v0")


# 2. VALUE ITERATION

def value_iteration(env, theta=1e-8, gamma=0.99):
    """Use Bellman Optimality to directly estimate best value function and policy."""
    V = np.zeros(env.observation_space.n)  # Start with zero value for all states

    while True:
        delta = 0  # Track max change across states
        for s in range(env.observation_space.n):
            action_values = np.zeros(env.action_space.n)

            # Estimate value of each action from state s
            for a in range(env.action_space.n):
                for prob, next_state, reward, done in env.unwrapped.P[s][a]:
                    action_values[a] += prob * (reward + gamma * V[next_state])

            best_action_value = np.max(action_values)
            delta = max(delta, abs(best_action_value - V[s]))
            V[s] = best_action_value  # Update with best value

        if delta < theta:  # Converged
            break

    # Derive optimal policy from final value estimates
    policy = np.zeros(env.observation_space.n, dtype=int)
    for s in range(env.observation_space.n):
        action_values = np.zeros(env.action_space.n)
        for a in range(env.action_space.n):
            for prob, next_state, reward, done in env.unwrapped.P[s][a]:
                action_values[a] += prob * (reward + gamma * V[next_state])
        policy[s] = np.argmax(action_values)

    return policy, V


# 3. POLICY ITERATION

def policy_iteration(env, gamma=0.99):
    """Alternate between evaluating the current policy and improving it."""
    policy = np.random.randint(env.action_space.n, size=env.observation_space.n)
    V = np.zeros(env.observation_space.n)

    while True:
        # Policy Evaluation Step
        while True:
            delta = 0
            for s in range(env.observation_space.n):
                v = V[s]
                a = policy[s]
                V[s] = sum([
                    prob * (reward + gamma * V[next_state])
                    for prob, next_state, reward, done in env.unwrapped.P[s][a]
                ])
                delta = max(delta, abs(v - V[s]))
            if delta < 1e-8:
                break

        # Policy Improvement Step
        stable = True
        for s in range(env.observation_space.n):
            old_action = policy[s]
            action_values = np.zeros(env.action_space.n)
            for a in range(env.action_space.n):
                for prob, next_state, reward, done in env.unwrapped.P[s][a]:
                    action_values[a] += prob * (reward + gamma * V[next_state])
            best_action = np.argmax(action_values)
            policy[s] = best_action
            if old_action != best_action:
                stable = False

        if stable:
            break

    return policy, V


# 4. POLICY EXECUTION

def evaluate_policy(env, policy, episodes=100):
    """Test how good the policy is by running it in the environment."""
    total_rewards = []
    total_steps = []

    for _ in range(episodes):
        state, _ = env.reset()
        done = False
        reward_sum = 0
        steps = 0

        while not done:
            state, reward, done, truncated, _ = env.step(policy[state])
            reward_sum += reward
            steps += 1

        total_rewards.append(reward_sum)
        total_steps.append(steps)

    avg_reward = np.mean(total_rewards)
    avg_steps = np.mean(total_steps)
    return avg_reward, avg_steps


# 5. TESTING ON ALL ENVIRONMENTS

def evaluate_all(envs, names):
    """Compare both algorithms across all environments."""
    for env in envs:
        print(f"\n🧪 Evaluating: {names[env]}")
        for label, algo in [("Value Iteration", value_iteration), ("Policy Iteration", policy_iteration)]:
            policy, _ = algo(env)
            avg_reward, avg_steps = evaluate_policy(env, policy)
            print(f"🔹 {label}: Avg Reward = {avg_reward:.2f}, Avg Steps = {avg_steps:.2f}")

# Run evaluation
env_dict = {
    original_env: "Original Frozen Lake (4x4 Slippery)",
    custom_env: "Custom Frozen Lake (4x4 Non-Slippery)",
    expanded_env: "Expanded Frozen Lake (8x8 Slippery)"
}
evaluate_all([original_env, custom_env, expanded_env], env_dict)



🧪 Evaluating: Original Frozen Lake (4x4 Slippery)
🔹 Value Iteration: Avg Reward = 0.84, Avg Steps = 48.12
🔹 Policy Iteration: Avg Reward = 0.88, Avg Steps = 43.04

🧪 Evaluating: Custom Frozen Lake (4x4 Non-Slippery)
🔹 Value Iteration: Avg Reward = 1.00, Avg Steps = 6.00
🔹 Policy Iteration: Avg Reward = 1.00, Avg Steps = 6.00

🧪 Evaluating: Expanded Frozen Lake (8x8 Slippery)
🔹 Value Iteration: Avg Reward = 0.20, Avg Steps = 32.96
🔹 Policy Iteration: Avg Reward = 0.20, Avg Steps = 30.28
