In [None]:
#  Demonstration: starts from a random V (not a computed value function), computes Q(s,a) from that V,
#  does greedy policy improvement to get a deterministic policy,
#  and plots both V(s) as numbers and π(s) as arrows.

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

np.set_printoptions(precision=3, suppress=True)  # show 3 decimals, no scientific notation

# Create FrozenLake environment (slippery => stochastic transitions).
# render_mode='ansi' is what lets env.render() below hand back the map as text.
env = gym.make('FrozenLake-v1', is_slippery=True, render_mode='ansi')

obs, info = env.reset()
print("Initial State:", obs)
print("Action Space:", env.action_space)            # Discrete(4) -> 0=L,1=D,2=R,3=U
print("Observation Space:", env.observation_space)  # Discrete(16) -> 16 states in 4x4 grid
# Read the grid size from the environment rather than hard-coding (4,4), so
# the printout stays correct if a different map is used.
print("Grid shape (rows, cols):", (env.unwrapped.nrow, env.unwrapped.ncol))

# Reward range from transition model
P = env.unwrapped.P  # full transition dictionary P[s][a] -> list of (prob,next_state,reward,done)
# Collect the distinct rewards once and reuse the set for both min and max
# instead of walking the whole transition table twice.
rewards = {r for s in P for a in P[s] for (_, _, r, _) in P[s][a]}
reward_min = min(rewards)
reward_max = max(rewards)
print("Reward range:", (reward_min, reward_max))

# Render text version of the lake
frame = env.render()  # shows S, F, H, G as text
print(frame)

#  Compute Q-values from a given value function V
def q_from_v(env, V, s, gamma=1.0):
    """Return the action values Q(s, a) for every action in state ``s``.

    Applies a one-step Bellman expectation backup over the environment's
    transition model:

        Q(s, a) = sum over outcomes of prob * (reward + gamma * V[next_state])

    Outcomes flagged ``done`` end the episode, so their future value is taken
    as 0 instead of ``V[next_state]``. This keeps the result correct even when
    ``V`` holds non-zero entries for terminal states (e.g. a randomly
    initialised ``V``); for a ``V`` that is already 0 on terminal states the
    result is unchanged.

    Args:
        env: Environment exposing ``env.unwrapped.P[s][a]`` as an iterable of
            ``(prob, next_state, reward, done)`` tuples and
            ``env.action_space.n`` as the number of actions.
        V: State values, indexable by state id.
        s: State whose action values are computed.
        gamma: Discount factor applied to the next state's value.

    Returns:
        A float NumPy array of length ``env.action_space.n`` where entry ``a``
        is Q(s, a).
    """
    P = env.unwrapped.P
    nA = env.action_space.n
    q = np.zeros(nA)  # q[a] will store Q(s,a)
    for a in range(nA):
        # Sum over every possible outcome of taking action a in state s.
        for prob, next_state, reward, done in P[s][a]:
            # Nothing is collected after a terminal transition, so do not
            # bootstrap from V[next_state] when the episode is over.
            future = 0.0 if done else gamma * V[next_state]
            q[a] += prob * (reward + future)
    return q

#  Greedy policy improvement from a given V
def policy_improvement(env, V, discount_factor=1.0):
    """Build the greedy (deterministic) policy with respect to ``V``.

    For each state the action values are obtained from ``q_from_v`` and the
    action with the largest value is selected (``np.argmax`` picks the first
    one on ties).

    Args:
        env: Environment providing ``observation_space.n`` and
            ``action_space.n``; it is also forwarded to ``q_from_v``.
        V: State values, one entry per state.
        discount_factor: Discount passed through to ``q_from_v``.

    Returns:
        Array of shape ``(nS, nA)``; each row is one-hot, with 1.0 on the
        greedy action of that state and 0.0 elsewhere.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    greedy = np.zeros((n_states, n_actions))  # rows start as all zeros
    for state in range(n_states):
        action_values = q_from_v(env, V, state, discount_factor)
        # Mark only the best action; the rest of the row stays at 0.
        greedy[state, np.argmax(action_values)] = 1.0
    return greedy

# Visualization function
def plot(V, policy, discount_factor=1.0, draw_vals=True):
    """Draw the state values as a colored grid annotated with values or arrows.

    The grid dimensions are read from the module-level ``env``
    (``env.unwrapped.nrow`` / ``ncol``). The cell colors always come from
    ``V``; the text on top is either the value of each state or an arrow for
    the highest-probability action of ``policy`` in that state.

    Args:
        V: State values, one per grid cell, in row-major state order.
        policy: Per-state action preferences; ``np.argmax(policy[s])`` is used
            as the action to draw for state ``s``.
        discount_factor: Not used by this function; kept in the signature.
        draw_vals: If True, write ``V[s]`` with two decimals in each cell and
            title the figure "Value Function"; otherwise draw arrows and title
            it "Policy".
    """
    n_rows = env.unwrapped.nrow
    n_cols = env.unwrapped.ncol
    # Action index -> arrow glyph (0=left, 1=down, 2=right, 3=up).
    arrow_symbols = dict(enumerate('←↓→↑'))

    plt.figure(figsize=(6, 6))
    value_grid = np.reshape(V, (n_rows, n_cols))
    plt.imshow(value_grid, cmap='cool', interpolation='none')  # color by V

    for state in range(n_rows * n_cols):
        cell_row, cell_col = divmod(state, n_cols)  # state id -> grid position
        greedy_action = np.argmax(policy[state])

        # Choose what to write in the cell, then draw it with a single call.
        if draw_vals:
            label, size = f'{V[state]:.2f}', 10
        else:
            label, size = arrow_symbols[greedy_action], 14
        plt.text(cell_col, cell_row, label, ha='center', va='center',
                 color='white', fontsize=size)

    title = "Value Function" if draw_vals else "Policy"
    plt.title(title)
    plt.axis('off')
    plt.show()


# Driver: build a greedy policy from a throwaway value function and show it.

# The value function is just random numbers (one per state) -- it is not
# learned; it only exists to exercise the code above.
V = np.random.rand(env.observation_space.n)

# Greedy policy with respect to that V.
policy = policy_improvement(env, V)

# First figure: V as numbers in the grid; second figure: policy arrows only.
for show_values in (True, False):
    plot(V, policy, 1.0, draw_vals=show_values)
