In [None]:
### Code cell 0 ###
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Pretty-print NumPy arrays: 3 decimals; suppress=True avoids scientific
# notation (e.g. 1.0e-03) for small values.
np.set_printoptions(precision=3, suppress=True)

# render_mode='ansi' makes env.render() return the grid as text instead of
# opening a graphical window.
env = gym.make('FrozenLake-v1', render_mode='ansi')

# reset() returns an (observation, info) pair; unpack it so `state` holds the
# start state itself rather than the whole tuple.
state, info = env.reset()

# Tabular transition model: P[s][a] is a list of
# (prob, next_state, reward, done) tuples.
P = env.unwrapped.P

print("Initial State:", state)
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)
# Read the grid size from the map description instead of hard-coding it.
print("Grid shape (rows, cols):", tuple(env.unwrapped.desc.shape))

# Collect every reward that appears in the model once, then report min and max.
# The underscores ignore the tuple fields we do not need here.
rewards = {r for s in P for a in P[s] for (_, _, r, _) in P[s][a]}
print("Reward range:", (min(rewards), max(rewards)))

obs, info = env.reset(seed=42)  # fixed seed for reproducible behaviour
frame = env.render()  # returns ANSI text since render_mode='ansi'
print(frame)

gamma = 0.99  # discount factor
theta = 1e-8  # convergence threshold for policy evaluation

nS = env.observation_space.n    # number of states
nA = env.action_space.n         # number of actions


def plot(V,policy,col_ramp=1,dpi=175,draw_vals=False):
    """Draw the FrozenLake map with state values as a heat map plus policy arrows.

    V         -- flat array of state values; reshaped to the map's (rows, cols).
    policy    -- per-state action weights, indexed policy[s]; the arg-max
                 action of each state is drawn as an arrow. None omits arrows.
    col_ramp  -- truthy selects the 'cool' colour map, falsy selects 'gray'.
    dpi       -- figure resolution, written into the global plt.rcParams.
    draw_vals -- when True, print each non-hole state's value (2 decimals).

    The map layout is read from the module-level `env`. The rcParams changes
    made here are global and persist after the call.
    """
    # Global matplotlib styling.
    plt.rcParams['figure.dpi']=dpi
    plt.rcParams.update({
        'axes.edgecolor':(0.32,0.36,0.38),
        'font.size':6 if env.unwrapped.nrow==8 else 8,
    })
    plt.figure(figsize=(3,3))

    grid=env.unwrapped.desc
    n_rows,n_cols=grid.shape

    # Heat map: imshow colours each cell according to its state value.
    cmap_name='gray'
    if col_ramp:
        cmap_name='cool'
    plt.imshow(V.reshape((n_rows,n_cols)),cmap=cmap_name,alpha=0.7)
    axes=plt.gca()

    # Cell borders sit half a unit off the integer cell centres.
    for k in range(n_cols+1):
        axes.axvline(k-0.5,lw=0.5,color='black')
    for k in range(n_rows+1):
        axes.axhline(k-0.5,lw=0.5,color='black')

    arrow_for_action=dict(enumerate(('←','↓','→','↑')))  # action id -> glyph
    tile_colours={'H':'red','G':'green','S':'blue'}      # any other tile: black

    for s in range(n_rows*n_cols):
        row,col=divmod(s,n_cols)    # state index -> grid coordinates
        value=V[s]
        letter=grid[row,col].decode('utf-8')  # desc stores bytes, e.g. b'S'

        plt.text(col,row,letter,ha='center',va='center',
                 color=tile_colours.get(letter,'black'),fontsize=10,fontweight='bold')

        # Values are written just below the tile letter, except on holes.
        if draw_vals and letter!='H':
            plt.text(col,row+0.3,f"{value:.2f}",ha='center',va='center',color='black',fontsize=6)

        # The arrow for the highest-weighted action goes just above the letter.
        if policy is not None:
            chosen=np.argmax(policy[s])
            plt.text(col,row-0.25,arrow_for_action[chosen],ha='center',va='center',color='purple',fontsize=12)

    plt.title("FrozenLake: Policy and State Values")
    plt.axis('off')
    plt.show()


# Initial policy: uniform random, i.e. every state gives each action probability 1/nA.
policy=np.full((nS,nA),1.0/nA)


def _grid_shape(env, nS):
    """Return the (rows, cols) shape used to print nS state values as a grid."""
    # Prefer the environment's own map description when it matches nS.
    desc = getattr(env.unwrapped, 'desc', None)
    shape = getattr(desc, 'shape', None)
    if shape is not None and len(shape) == 2 and shape[0] * shape[1] == nS:
        return (int(shape[0]), int(shape[1]))
    # Otherwise fall back to a square grid, or a single row if nS is not square.
    side = int(np.sqrt(nS))
    if side * side == nS:
        return (side, side)
    return (1, int(nS))


def policy_evaluation(env, policy, discount_factor=1.0, theta=1e-9, draw=False):
    """Iteratively evaluate `policy` and return its state-value function.

    Sweeps all states in place, applying the Bellman expectation backup

        V(s) = sum_a policy[s][a] * sum_{s'} p * (r + discount_factor * V(s'))

    until the largest change seen in a sweep is below `theta`. Transitions
    flagged `done` contribute only their immediate reward.

    Args:
        env: environment exposing `observation_space.n` and a tabular model
            `env.unwrapped.P`, where P[s][a] is a list of
            (prob, next_state, reward, done) tuples.
        policy: per-state action probabilities, indexed policy[s][a].
        discount_factor: discount applied to the next state's value.
        theta: convergence threshold on the maximum per-sweep value change.
        draw: when True, print the resulting values laid out as a grid.

    Returns:
        1-D NumPy array of length nS holding the estimated state values.
    """
    nS = env.observation_space.n
    P = env.unwrapped.P
    V = np.zeros(nS)              # initial V(s)=0 for all states

    while True:
        delta = 0.0
        for s in range(nS):
            v = 0.0
            for a, action_prob in enumerate(policy[s]):
                if action_prob == 0:
                    continue      # action is never taken under this policy
                for prob, next_state, reward, done in P[s][a]:
                    # A `done` transition ends the episode, so there is no
                    # future return to bootstrap from.
                    future = 0.0 if done else discount_factor * V[next_state]
                    v += action_prob * prob * (reward + future)
            delta = max(delta, abs(V[s] - v))
            V[s] = v              # in-place update: later states see the new value
        if delta < theta:
            break

    if draw:
        print("Value function after policy evaluation:")
        print(V.reshape(_grid_shape(env, nS)))
    return V


# Evaluate the current policy with the configured discount and tolerance,
# printing the value grid, then draw the values and policy arrows on the map.
V=policy_evaluation(
    env,
    policy,
    discount_factor=gamma,
    theta=theta,
    draw=True,
)
plot(V,policy,draw_vals=True)

### Code cell 1 ###
