# In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

np.set_printoptions(precision=3, suppress=True)

# Build the environment; with render_mode='ansi', render() returns text.
env = gym.make('FrozenLake-v1', render_mode='ansi')

# reset() returns an (observation, info) pair, so unpack it instead of
# binding the whole tuple to `state`.
state, info = env.reset()

# Tabular model of the MDP: P[s][a] is a list of
# (probability, next_state, reward, done) tuples.
P = env.unwrapped.P
nS = env.observation_space.n    # number of states
nA = env.action_space.n         # number of actions

print("Initial State:", state)
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)
# Read the grid size from the environment's map rather than hard-coding it.
# NOTE(review): relies on the FrozenLake env exposing its map as `desc`.
print("Grid shape (rows, cols):", env.unwrapped.desc.shape)
# Collect every reward that appears in the transition table once, then
# report its extremes.
rewards = {r for s in P for a in P[s] for (_, _, r, _) in P[s][a]}
print("Reward range:", (min(rewards), max(rewards)))

obs, info = env.reset(seed=42)
frame = env.render()  # returns ANSI text since render_mode='ansi'
print(frame)

gamma = 0.99    # discount factor
theta = 1e-8    # convergence threshold for value iteration
# Not used by the functions below (value_iteration builds its own table);
# kept because later code may still refer to it.
V = np.zeros(nS)

print(P)

def value_iteration(env, gamma, theta=1e-4, max_iter=1000):
    """Compute state values with in-place value iteration.

    The transition model and the state/action counts are read from ``env``
    itself, so the function works for whichever environment is passed in
    rather than depending on module-level globals.

    Args:
        env: Environment exposing ``env.unwrapped.P`` (where ``P[s][a]`` is
            an iterable of ``(prob, next_state, reward, done)`` tuples),
            ``env.observation_space.n`` and ``env.action_space.n``.
        gamma: Discount factor applied to the value of the next state.
        theta: Stop once the largest per-state change in a sweep is below
            this threshold.
        max_iter: Upper bound on the number of sweeps over the state space.

    Returns:
        A 1-D NumPy array of length ``env.observation_space.n`` holding the
        value estimate for each state.
    """
    transitions = env.unwrapped.P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    V = np.zeros(n_states)  # start with V(s)=0 for all states
    for _ in range(max_iter):
        delta = 0.0
        for s in range(n_states):
            previous = V[s]
            q_sa = []
            for a in range(n_actions):
                q = 0.0
                # NOTE(review): the `done` flag is not used, so terminal
                # states are expected to be modelled inside P itself
                # (e.g. as absorbing states) -- confirm for other envs.
                for prob, next_s, reward, _done in transitions[s][a]:
                    q += prob * (reward + gamma * V[next_s])
                q_sa.append(q)
            # Updating V in place lets later states in the same sweep see
            # the refreshed values.
            V[s] = max(q_sa)
            delta = max(delta, abs(previous - V[s]))
        if delta < theta:
            break
    return V

# Solve for the state values, then show them flat and as a 4x4 grid.
V_opt=value_iteration(env,gamma,theta)
print("Optimal state values V*:")
print(V_opt)  # the header above previously had no values printed under it
print("V* reshaped as 4x4 grid:")
print(V_opt.reshape(4,4))

def optimal_policy(env,V,gamma):
    """Derive the greedy one-hot policy implied by the state values ``V``.

    Args:
        env: Environment exposing ``env.unwrapped.P`` (where ``P[s][a]`` is
            an iterable of ``(prob, next_state, reward, done)`` tuples),
            ``env.observation_space.n`` and ``env.action_space.n``.
        V: State values, indexable by state.
        gamma: Discount factor applied to the value of the next state.

    Returns:
        An array of shape ``(n_states, n_actions)`` with a single 1.0 per
        row, marking the action with the highest one-step lookahead value
        (the lowest-indexed action wins ties, per ``np.argmax``).
    """
    model = env.unwrapped.P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    def lookahead(state, action):
        # Expected one-step return of taking `action` in `state`.
        total = 0.0
        for p, successor, r, _terminal in model[state][action]:
            total += p * (r + gamma * V[successor])
        return total

    greedy = np.zeros((n_states, n_actions))
    for state in range(n_states):
        returns = [lookahead(state, action) for action in range(n_actions)]
        greedy[state, np.argmax(returns)] = 1.0
    return greedy
# V_opt already holds the result of value_iteration(env,gamma,theta) from
# earlier in this script; the solve is deterministic, so reuse it instead of
# running it a second time.
policy=optimal_policy(env,V_opt,gamma)
print("Optimal policy (one-hot over actions):")
print(policy)

