In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Create the FrozenLake-v1 environment with is_slippery=True. This single
# module-level env is the one passed to every prediction run in this file.
env=gym.make("FrozenLake-v1",is_slippery=True)

# Monte Carlo Prediction
def mc_prediction(env, policy, episodes=10000, gamma=0.99):
    """Estimate the state values of ``policy`` with first-visit Monte Carlo.

    Every episode is generated by following ``policy`` until the environment
    reports termination or truncation. For each state that occurs in the
    episode, the discounted return that follows its *first* occurrence is
    averaged into the value estimate of that state.

    Args:
        env: Environment exposing ``observation_space.n``, ``reset()``
            returning ``(state, info)`` and ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Object indexable by state that yields the action to take.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        A tuple ``(V, V_track, ep_rewards)`` where ``V`` is the final value
        estimate (one entry per state), ``V_track`` is a list holding a copy
        of ``V`` after every episode, and ``ep_rewards`` is the undiscounted
        total reward of every episode.
    """
    n_states = env.observation_space.n
    V = np.zeros(n_states)
    # Number of first-visit returns averaged into each V[s] so far.
    visit_counts = np.zeros(n_states, dtype=int)
    V_track = []  # snapshot of V after every episode, for plotting
    ep_rewards = []  # total reward per episode

    for _ in range(episodes):
        episode = []  # (state, reward) pairs: [(s0, r1), (s1, r2), ...]
        state, _info = env.reset()
        done = False
        total_reward = 0.0

        # Generate one full episode by following the policy.
        while not done:
            action = policy[state]
            next_state, reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated
            episode.append((state, reward))
            total_reward += reward
            state = next_state

        # Time step at which each state occurs for the first time. Walking
        # the episode backwards and updating a state the first time it is
        # *encountered* would use its last visit, so the first-visit index
        # has to be determined up front.
        first_visit = {}
        for t, (s, _r) in enumerate(episode):
            first_visit.setdefault(s, t)

        # Accumulate the return backwards: G_t = r_{t+1} + gamma * G_{t+1}.
        G = 0.0
        for t, (s, r) in reversed(list(enumerate(episode))):
            G = gamma * G + r
            if first_visit[s] == t:
                # Incremental mean over all first-visit returns of s; equal
                # to averaging the stored returns without re-summing them.
                visit_counts[s] += 1
                V[s] += (G - V[s]) / visit_counts[s]

        V_track.append(V.copy())
        ep_rewards.append(total_reward)

    return V, V_track, ep_rewards

# Temporal Difference (TD(0))
def td_prediction(env, policy, episodes=10000, alpha=0.1, gamma=0.99):
    """Estimate the state values of ``policy`` with one-step TD, i.e. TD(0).

    After every environment step the value of the state just left is moved
    towards the target ``reward + gamma * V(next_state)`` by a fraction
    ``alpha``. When the step terminates the episode the target is just
    ``reward``, because no further return follows a terminal transition.

    Args:
        env: Environment exposing ``observation_space.n``, ``reset()``
            returning ``(state, info)`` and ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Object indexable by state that yields the action to take.
        episodes: Number of episodes to run.
        alpha: Learning rate (step size) of the update.
        gamma: Discount factor applied to the bootstrapped value.

    Returns:
        A tuple ``(V, V_track, ep_rewards)`` where ``V`` is the final value
        estimate (one entry per state), ``V_track`` is a list holding a copy
        of ``V`` after every episode, and ``ep_rewards`` is the undiscounted
        total reward of every episode.
    """
    V = np.zeros(env.observation_space.n)
    V_track = []  # snapshot of V after every episode, for plotting
    ep_rewards = []  # total reward per episode

    for _ in range(episodes):
        state, _info = env.reset()
        done = False
        total_reward = 0.0

        while not done:
            action = policy[state]
            next_state, reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated
            total_reward += reward

            # Bootstrap from the next state only if the episode did not
            # terminate there. A truncated episode (e.g. a step limit) still
            # bootstraps, since the state itself is not terminal.
            bootstrap = 0.0 if terminated else V[next_state]

            # TD(0): V(s_t) <- V(s_t) + alpha * [r_{t+1} + gamma*V(s_{t+1}) - V(s_t)]
            V[state] += alpha * (reward + gamma * bootstrap - V[state])

            state = next_state

        V_track.append(V.copy())
        ep_rewards.append(total_reward)

    return V, V_track, ep_rewards

# Define the policy to evaluate.
# Alternative (disabled): a fixed random policy that maps every state to one
# randomly drawn action (0: left, 1: down, and so on):
#policy={s:np.random.choice([0,1,2,3]) for s in range(env.observation_space.n)}

# Deterministic policy: always take action 2 (move right) in every state.
# The state count comes from the environment instead of a hard-coded 16 so
# the policy stays valid if a different map size is used.
policy = {s: 2 for s in range(env.observation_space.n)}

# Compute the value functions once with the default hyperparameters.
V_mc, V_mc_track, rewards_mc = mc_prediction(env, policy)
V_td, V_td_track, rewards_td = td_prediction(env, policy)


# Plotting Convergence
def plot_convergence(V_track, title):
    """Plot how the value estimate of every state evolves across episodes.

    Args:
        V_track: Sequence of value-function snapshots, one per episode, each
            holding one value per state.
        title: Title shown above the figure.
    """
    plt.figure(figsize=(8, 5))

    # Stack the snapshots into an (episodes, states) array so that the number
    # of states is taken from the data being plotted rather than from a
    # global environment object.
    history = np.asarray(V_track, dtype=float)
    n_states = history.shape[1] if history.ndim == 2 else 0

    for s in range(n_states):
        # Column s is the value of state s in every stored snapshot.
        plt.plot(history[:, s], label=f"State {s}")

    plt.title(title)
    plt.xlabel("Episodes")
    plt.ylabel("Value Estimate V(s)")
    if n_states:
        # Only draw a legend when at least one curve was plotted.
        plt.legend()
    plt.grid(True)
    plt.show()

# NEW: plot episode rewards + running average
def plot_rewards(ep_rewards, title):
    """Plot the reward of every episode together with its running average.

    Args:
        ep_rewards: Sequence holding the total reward of each episode.
        title: Title shown above the figure.
    """
    figure = plt.figure(figsize=(8, 5))
    axes = figure.gca()

    # Running average: cumulative reward so far divided by episodes so far.
    n_episodes = len(ep_rewards)
    episode_numbers = np.arange(1, n_episodes + 1)
    running_mean = np.cumsum(ep_rewards) / episode_numbers

    # Raw rewards are drawn faintly so the average stays readable on top.
    axes.plot(ep_rewards, alpha=0.3, label="Episode reward")
    axes.plot(running_mean, label="Average reward")

    axes.set_title(title)
    axes.set_xlabel("Episodes")
    axes.set_ylabel("Reward")
    axes.legend()
    axes.grid(True)
    plt.show()


# Hyperparameter sweep: compare different gamma (discount) and alpha
# (learning-rate) values.

gammas = [0.9, 0.99]
alphas = [0.05, 0.2]

# Monte Carlo: only gamma is varied, because MC has no alpha.
for g in gammas:
    V_mc_g, V_mc_track_g, rewards_mc_g = mc_prediction(
        env, policy, gamma=g, episodes=10000
    )
    plot_convergence(V_mc_track_g, f"Monte Carlo Value Estimates (gamma={g})")
    plot_rewards(rewards_mc_g, f"Monte Carlo Episode Rewards (gamma={g})")

# TD(0): every (gamma, alpha) combination, gamma varying slowest.
for g, a in [(gamma_value, alpha_value)
             for gamma_value in gammas
             for alpha_value in alphas]:
    V_td_ga, V_td_track_ga, rewards_td_ga = td_prediction(
        env, policy, gamma=g, alpha=a, episodes=10000
    )
    title = f"TD(0) Value Estimates (alpha={a}, gamma={g})"
    plot_convergence(V_td_track_ga, title)
    plot_rewards(rewards_td_ga, f"TD(0) Episode Rewards (alpha={a}, gamma={g})")

# Observations:
# When the environment is random (slippery) and the policy is deterministic,
# the agent may never reach the goal, so the reward is 0.
# Some states may still reach the goal. The value curves are wiggly because
# the agent wanders here and there.
# In MC: with a larger gamma the values become larger, as shown in the
# graphs. MC is stable but slow.
# In TD:
# low learning rate = slow but stable learning
# high alpha = fast updates but unstable

# In the reward plots:
# the average of the total reward becomes smooth over time.
