In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Create the environment shared by the rest of the script: FrozenLake-v1 with
# is_slippery=False (per the Gymnasium docs this selects the non-slippery
# variant, i.e. the agent moves in the requested direction).
env=gym.make("FrozenLake-v1",is_slippery=False)

# Monte Carlo Prediction
def mc_prediction(env, policy, episodes=10000, gamma=0.99):
    """Estimate the state-value function of ``policy`` by first-visit Monte Carlo.

    For every episode the policy is rolled out until the environment reports
    ``terminated`` or ``truncated``. The discounted return that follows the
    *first* occurrence of each state in that episode is then averaged into
    the running estimate for that state.

    Args:
        env: Environment exposing ``observation_space.n``, ``reset()``
            returning ``(state, info)`` and ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Object indexable by state that yields the action to take.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        Tuple ``(V, V_track)``: ``V`` is the final array of value estimates
        (one entry per state) and ``V_track`` is a list holding a copy of
        ``V`` taken after each episode.
    """
    n_states = env.observation_space.n
    V = np.zeros(n_states)
    # Number of first-visit returns averaged into V[s] so far. Keeping a
    # counter instead of the full list of returns makes each update O(1).
    visit_counts = np.zeros(n_states, dtype=int)
    V_track = []  # per-episode snapshots of V, used for plotting

    for _ep in range(episodes):
        # Roll out one episode as [(s0, r1), (s1, r2), ...].
        episode = []
        state, _ = env.reset()
        done = False
        while not done:
            action = policy[state]
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode.append((state, reward))
            state = next_state

        # Sweep backwards so that G_t = r_{t+1} + gamma * G_{t+1}. Earlier
        # time steps are processed later in this loop and overwrite the
        # entry for a repeated state, so once the sweep finishes each state
        # maps to the return that followed its FIRST visit in the episode.
        G = 0.0
        first_visit_return = {}
        for s, r in reversed(episode):
            G = gamma * G + r
            first_visit_return[s] = G

        # Incremental mean: V <- V + (G - V) / N.
        for s, g_first in first_visit_return.items():
            visit_counts[s] += 1
            V[s] += (g_first - V[s]) / visit_counts[s]

        V_track.append(V.copy())

    return V, V_track

# Temporal Difference (TD(0))
def td_prediction(env, policy, episodes=10000, alpha=0.1, gamma=0.99):
    """Estimate the state-value function of ``policy`` with tabular TD(0).

    After every environment step the estimate of the state just left is
    moved a fraction ``alpha`` towards the one-step target
    ``reward + gamma * V[next_state]``.

    Args:
        env: Environment exposing ``observation_space.n``, ``reset()``
            returning ``(state, info)`` and ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Object indexable by state that yields the action to take.
        episodes: Number of episodes to run.
        alpha: Learning rate (step size) of the update.
        gamma: Discount factor.

    Returns:
        Tuple ``(V, V_track)``: the final value array and a list with a copy
        of it taken at the end of every episode.
    """
    values = np.zeros(env.observation_space.n)
    history = []

    for _episode in range(episodes):
        current, _info = env.reset()

        while True:
            observed, reward, terminated, truncated, _info = env.step(policy[current])

            # V(s_t) <- V(s_t) + alpha * [r_{t+1} + gamma * V(s_{t+1}) - V(s_t)]
            td_error = reward + gamma * values[observed] - values[current]
            values[current] = values[current] + alpha * td_error

            if terminated or truncated:
                break
            current = observed

        # Snapshot after each episode so convergence can be plotted later.
        history.append(values.copy())

    return values, history

# Build the policy table: for every state, draw one action id from {0, 1, 2, 3}
# (FrozenLake's four moves). The draw happens once, here, so the resulting
# policy is a fixed state -> action lookup for all subsequent episodes.
policy = {
    state_id: np.random.choice([0, 1, 2, 3])
    for state_id in range(env.observation_space.n)
}

# Evaluate that same policy with both prediction methods.
V_mc, V_mc_track = mc_prediction(env, policy)
V_td, V_td_track = td_prediction(env, policy)

# Report the final estimates, rounded to three decimals.
print("Monte Carlo Value Function:", np.round(V_mc, 3), sep="\n")
print("\nTD(0) Value Function:", np.round(V_td, 3), sep="\n")

# Plotting Convergence
def plot_convergence(V_track, title):
    """Plot how each state's value estimate evolves across episodes.

    One line is drawn per state, with the snapshot index (episode) on the
    x-axis and the estimated value on the y-axis.

    Args:
        V_track: Sequence of value-function snapshots, one per episode; each
            snapshot is indexable by state.
        title: Title shown above the figure.
    """
    # Take the number of states from the data being plotted rather than from
    # the module-level ``env``, so tracks from any environment plot correctly.
    n_states = len(V_track[0]) if len(V_track) > 0 else 0

    plt.figure(figsize=(8, 5))
    for s in range(n_states):
        # Value of state s in every stored snapshot.
        plt.plot([snapshot[s] for snapshot in V_track], label=f"State {s}")
    plt.title(title)
    plt.xlabel("Episodes")
    plt.ylabel("Value Estimate V(s)")
    if n_states:
        # Only build a legend when at least one labelled line exists.
        plt.legend()
    plt.grid(True)
    plt.show()

# Draw one convergence figure per algorithm from its per-episode snapshots.
plot_convergence(V_mc_track,"Monte Carlo Value Estimates Over Episodes")
plot_convergence(V_td_track,"TD(0) Value Estimates Over Episodes")
