In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Print arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

# Slippery (stochastic) FrozenLake on the default map; 'ansi' render mode is
# used so that env.render() hands back something printable.
env = gym.make('FrozenLake-v1', is_slippery=True, render_mode='ansi')

obs, info = env.reset()
print("Initial State:", obs)
# Actions are encoded as 0=Left, 1=Down, 2=Right, 3=Up.
print("Action Space:", env.action_space)
# One discrete state per grid cell.
print("Observation Space:", env.observation_space)
print("Grid shape (rows, cols):", (4, 4))

# Transition model: P[s][a] is a list of (prob, next_state, reward, done).
# Gather every distinct reward once, then take the extremes of that set.
P = env.unwrapped.P
all_rewards = {
    reward
    for actions in P.values()
    for outcomes in actions.values()
    for _, _, reward, _ in outcomes
}
reward_min = min(all_rewards)
reward_max = max(all_rewards)
print("Reward range:", (reward_min, reward_max))

# Text rendering of the lake (S, F, H, G cells).
frame = env.render()
print(frame)

# Discount factor used by the policy-improvement run below.
gamma = 0.99

# helper: compute Q from V
def q_from_v(env, V, s, gamma=1.0):
    """Return the action values Q(s, a) of state ``s`` under the estimate ``V``.

    Performs a one-step Bellman expectation backup for every action::

        Q(s, a) = sum over (prob, s', r) of prob * (r + gamma * V[s'])

    Args:
        env: Environment exposing ``env.unwrapped.P`` (where ``P[s][a]`` is an
            iterable of ``(prob, next_state, reward, done)`` tuples) and
            ``env.action_space.n``.
        V: State values, indexable by state id.
        s: State whose action values are computed.
        gamma: Discount factor (defaults to 1.0, i.e. undiscounted).

    Returns:
        1-D float array of length ``env.action_space.n``.

    Note:
        The ``done`` flag of a transition is not used, so the value of a
        terminal successor is bootstrapped like any other state. Callers
        that want terminal states to contribute nothing must pass a ``V``
        that is zero there.
    """
    transitions = env.unwrapped.P[s]
    nA = env.action_space.n
    q = np.zeros(nA)  # q[a] accumulates Q(s, a)
    for a in range(nA):
        # Expectation over every possible outcome of taking action a in s.
        for prob, next_state, reward, _done in transitions[a]:
            q[a] += prob * (reward + gamma * V[next_state])
    return q


# simple one-step policy improvement
def simple_policy_improvement(env, policy, V, gamma=0.99):
    """Return the deterministic policy that is greedy with respect to ``V``.

    For each state the action maximising ``q_from_v(env, V, s, gamma)`` is
    selected (ties go to the lowest action index, as ``np.argmax`` does) and
    stored as a one-hot row. A line comparing the old and new action is
    printed for every state.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and the transition model used by ``q_from_v``.
        policy: Current policy with one row of action probabilities per
            state. It is only read (to report the old action) and used as the
            shape/dtype template for the result.
        V: State values, indexable by state id.
        gamma: Discount factor.

    Returns:
        New array shaped like ``policy`` holding one-hot rows.
    """
    nS = env.observation_space.n
    nA = env.action_space.n
    policy_new = np.zeros_like(policy)
    one_hot = np.eye(nA)  # row k is the one-hot encoding of action k

    for s in range(nS):
        # For a uniform row argmax reports action 0; used for display only.
        old_action = np.argmax(policy[s])

        # Reuse the shared backup instead of duplicating the Bellman sum here.
        best_action = np.argmax(q_from_v(env, V, s, gamma))

        policy_new[s] = one_hot[best_action]

        print(f"State {s:2d}: old_action={old_action}, new_action={best_action}")

    return policy_new

# plotting helper: show arrows for a policy
def plot_policy(V, policy, title="Policy", draw_vals=False):
    """Show a heat map of ``V`` with the policy's preferred action per cell.

    The grid dimensions are read from the module-level ``env``
    (``env.unwrapped.nrow`` / ``ncol``). Each cell is annotated with an arrow
    for ``np.argmax(policy[s])``; with ``draw_vals=True`` the state value is
    printed as well. When both are drawn they are offset vertically inside
    the cell so that the number and the arrow do not sit on top of each
    other.

    Args:
        V: State values, one per grid cell in row-major order (any
            array-like with ``nrow * ncol`` entries).
        policy: One row of action preferences per state; the arg-max of each
            row is the action drawn. For a row with ties (e.g. a uniform
            policy) this is the lowest action index.
        title: Figure title.
        draw_vals: Also print each state's value, formatted to 2 decimals.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    nrow = env.unwrapped.nrow
    ncol = env.unwrapped.ncol
    arrow_symbols = {0: '←', 1: '↓', 2: '→', 3: '↑'}  # action index -> arrow

    grid = np.asarray(V).reshape((nrow, ncol))
    plt.figure(figsize=(5, 5))
    plt.imshow(grid, cmap='cool', interpolation='none')

    # imshow cells span +/-0.5 around integer coordinates and the y axis
    # grows downwards, so a negative offset moves a label up within its cell.
    arrow_dy = -0.18 if draw_vals else 0.0
    value_dy = 0.22

    for s in range(nrow * ncol):
        row, col = divmod(s, ncol)
        best_action = np.argmax(policy[s])

        if draw_vals:
            plt.text(col, row + value_dy, f'{grid[row, col]:.2f}',
                     ha='center', va='center', color='white', fontsize=10)
        plt.text(col, row + arrow_dy, arrow_symbols[best_action],
                 ha='center', va='center', color='yellow', fontsize=14)

    plt.title(title)
    plt.axis('off')
    plt.show()

# run: one-step improvement and plots

# Sizes of the state and action spaces.
nS = env.observation_space.n
nA = env.action_space.n

# Stand-in value function drawn uniformly at random; in a real run this would
# come from policy evaluation or value iteration.
V = np.random.rand(nS)
print("\nExample V(s) (4x4):")
print(V.reshape((4, 4)))

# Starting policy: every action equally likely in every state.
policy_old = np.full((nS, nA), 1.0 / nA)
print("\nOld policy (row=state, cols=actions 0..3):")
print(policy_old)

print("\n--- Running one-step policy improvement ---")
policy_new = simple_policy_improvement(env, policy_old, V, gamma)

print("\nNew policy (row=state, cols=actions 0..3):")
print(policy_new)

# Draw the policy before and after the improvement step on top of V.
for shown_policy, caption in (
    (policy_old, "Old Policy (before improvement)"),
    (policy_new, "New Policy (after improvement)"),
):
    plot_policy(V, shown_policy, title=caption, draw_vals=True)
