In [None]:
### Code cell 0 ###
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Print numpy arrays with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)

# Step 1 – Environment setup (FrozenLake)
# Slippery dynamics make transitions stochastic; 'ansi' makes render() return text.
env = gym.make('FrozenLake-v1', is_slippery=True, render_mode='ansi')
obs, info = env.reset()

print("Initial State:", obs)
print("Observation space size:", env.observation_space.n)  # 16 states
print("Action space size:", env.action_space.n)            # 4 actions

# Transition model: P[s][a] is a list of (prob, next_state, reward, done) tuples.
P = env.unwrapped.P
# Gather every distinct reward that appears anywhere in the model, once.
all_rewards = {r for s in P for a in P[s] for (_, _, r, _) in P[s][a]}
reward_min, reward_max = min(all_rewards), max(all_rewards)
print("Reward range:", (reward_min, reward_max))

print(env.render())

# Step 3 – Value Iteration Algorithm
def value_iteration(env, discount_factor=0.99, theta=1e-6, max_iterations=10000):
    """Compute a value function and greedy policy by value iteration.

    The state values are updated in place during each sweep, so states
    visited later in a sweep already see the refreshed values of earlier
    states. Sweeping stops as soon as the largest absolute change in a sweep
    drops below ``theta`` or after ``max_iterations`` sweeps.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and ``unwrapped.P``, where ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        discount_factor: Discount applied to the value of the next state.
        theta: Convergence threshold on the largest per-sweep value change.
        max_iterations: Upper bound on the number of sweeps. A value ``<= 0``
            performs no sweep at all.

    Returns:
        A tuple ``(V, policy, iterations)``: the value array of shape
        ``(nS,)``, a one-hot greedy policy of shape ``(nS, nA)`` and the
        number of sweeps that were actually performed.
    """
    nS = env.observation_space.n
    nA = env.action_space.n
    P = env.unwrapped.P
    V = np.zeros(nS)

    # Pre-bind the counter so the return value is defined even when the loop
    # body never runs (max_iterations <= 0).
    iterations = 0
    for iterations in range(1, max_iterations + 1):
        delta = 0.0
        for s in range(nS):
            new_v = np.max(_action_values(P, V, s, nA, discount_factor))
            delta = max(delta, abs(new_v - V[s]))
            V[s] = new_v

        if delta < theta:
            break

    policy = extract_policy_from_v(env, V, discount_factor)
    return V, policy, iterations


# Step 4 – Extract policy from V
def extract_policy_from_v(env, V, discount_factor=0.99):
    """Return the one-hot greedy policy with respect to the value function ``V``.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and ``unwrapped.P`` (see ``value_iteration``).
        V: Indexable collection of state values, one entry per state.
        discount_factor: Discount applied to the value of the next state.

    Returns:
        Array of shape ``(nS, nA)`` holding ``1.0`` at the greedy action of
        each state and ``0.0`` elsewhere. Ties are resolved in favour of the
        lowest action index (``np.argmax`` behaviour).
    """
    nS = env.observation_space.n
    nA = env.action_space.n
    P = env.unwrapped.P
    policy = np.zeros((nS, nA))

    for s in range(nS):
        best_action = np.argmax(_action_values(P, V, s, nA, discount_factor))
        policy[s, best_action] = 1.0

    return policy


def _action_values(P, V, state, n_actions, discount_factor):
    """Return the one-step lookahead value of every action in ``state``.

    The ``done`` flag of each transition is not consulted: the value of the
    next state is always bootstrapped.
    """
    q_sa = np.zeros(n_actions)
    for a in range(n_actions):
        for prob, next_state, reward, _done in P[state][a]:
            q_sa[a] += prob * (reward + discount_factor * V[next_state])
    return q_sa


# Evaluate a learned policy by running episodes
def evaluate_policy(env, policy, n_episodes=1000):
    """Estimate how often a policy ends an episode with a positive reward.

    Each episode follows the greedy action ``argmax(policy[obs])`` until the
    environment reports ``terminated`` or ``truncated``. An episode counts as
    a success when the reward of its final step is positive (in FrozenLake,
    reaching the goal gives reward 1 at the terminal step).

    Args:
        env: Environment with ``reset() -> (obs, info)`` and
            ``step(action) -> (obs, reward, terminated, truncated, info)``.
        policy: Per-state action preferences, indexable by observation.
        n_episodes: Number of episodes to run; must be positive.

    Returns:
        Fraction of successful episodes, in ``[0, 1]``.

    Raises:
        ValueError: If ``n_episodes`` is not positive (there would be nothing
            to average over).
    """
    if n_episodes <= 0:
        raise ValueError(f"n_episodes must be a positive integer, got {n_episodes}")

    success = 0
    for _ in range(n_episodes):
        obs, _info = env.reset()
        done = False
        while not done:
            # Hand the environment a plain Python int rather than a NumPy scalar.
            action = int(np.argmax(policy[obs]))
            obs, reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated
        # The loop body runs at least once, so ``reward`` is the final step's reward.
        if reward > 0:
            success += 1
    return success / n_episodes


# Step 5 – Visualization: Value Function
def plot_values(env, V, gamma_label=""):
    """Draw the state values ``V`` as a line chart with one marker per state.

    ``env`` is not used by this function. When ``gamma_label`` is non-empty
    it is appended to the figure title.
    """
    suffix = f" (gamma={gamma_label})" if gamma_label else ""

    plt.figure(figsize=(6, 3))
    plt.plot(V, marker='o')
    plt.title("Value Function for FrozenLake-v1" + suffix)
    plt.xlabel("State (0–15)")
    plt.ylabel("Value")
    plt.grid(True)
    plt.show()


# Visualization: Policy
def plot_policy(env, policy, gamma_label=""):
    """Draw a bar chart of the highest-scoring action index for every state.

    The action shown for a state is ``argmax`` over that state's row of
    ``policy``. When ``gamma_label`` is non-empty it is appended to the
    figure title.
    """
    state_ids = np.arange(env.observation_space.n)
    greedy_actions = np.argmax(policy, axis=1)

    heading = "Greedy Policy (Best Action per State)"
    if gamma_label:
        heading = f"{heading} (gamma={gamma_label})"

    plt.figure(figsize=(6, 3))
    plt.bar(state_ids, greedy_actions)
    plt.title(heading)
    plt.xlabel("State Index (0–15)")
    plt.ylabel("Action (0=Left,1=Down,2=Right,3=Up)")
    plt.show()


# Step 6 – Run Value Iteration for multiple discount factors
if __name__ == "__main__":
    # Discount factors to compare; every result below is stored in this order.
    gammas = [0.9, 0.99, 0.6]
    all_V = {}            # gamma -> value function returned by value_iteration
    all_iters = []        # iteration count returned by value_iteration, per gamma
    success_rates = []    # fraction of successful evaluation episodes, per gamma

    for gamma in gammas:
        print(f"\n=== Running Value Iteration with gamma = {gamma} ===")
        V_opt, policy_opt, iterations = value_iteration(env, discount_factor=gamma)
        all_V[gamma] = V_opt
        all_iters.append(iterations)
        print(f"Converged in {iterations} iterations for gamma = {gamma}")

        # Roll the greedy policy out in the environment to measure how often it succeeds.
        rate = evaluate_policy(env, policy_opt, n_episodes=1000)
        success_rates.append(rate)
        print(f"Success Rate for gamma={gamma}: {rate * 100:.2f}%")

        # Per-gamma figures: value function first, then the greedy policy.
        gamma_text = str(gamma)
        plot_values(env, V_opt, gamma_label=gamma_text)
        plot_policy(env, policy_opt, gamma_label=gamma_text)

    # Categorical x-axis labels shared by the two bar charts below.
    gamma_ticks = [str(g) for g in gammas]

    # Figure: value functions of all discount factors overlaid.
    plt.figure(figsize=(8, 4))
    for gamma in gammas:
        plt.plot(all_V[gamma], label=f"gamma={gamma}")
    plt.title("Comparison of Value Functions for Different Discount Factors (FrozenLake)")
    plt.xlabel("State (0–15)")
    plt.ylabel("Value")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Figure: iterations needed per discount factor.
    plt.figure(figsize=(6, 4))
    plt.bar(gamma_ticks, all_iters)
    plt.title("Convergence Speed vs Discount Factor (FrozenLake)")
    plt.xlabel("Gamma")
    plt.ylabel("Iterations to Converge")
    plt.show()

    # Figure: success rate (as a percentage) per discount factor.
    success_percent = [r * 100 for r in success_rates]
    plt.figure(figsize=(6, 4))
    plt.bar(gamma_ticks, success_percent)
    plt.title("Policy Success Rate vs Discount Factor (FrozenLake)")
    plt.xlabel("Gamma")
    plt.ylabel("Success Rate (%)")
    plt.show()


### Code cell 1 ###
