In [None]:

### Code cell 0 ###
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Pretty-print NumPy arrays: 3 decimals; suppress=True avoids scientific
# notation (e.g. 1.0e-05) for small values.
np.set_printoptions(precision=3, suppress=True)

# render_mode='ansi' makes render() return the grid as text instead of
# opening a graphical window.
env = gym.make('FrozenLake-v1', render_mode='ansi')

# reset() returns an (observation, info) pair; unpack it so `state` holds the
# start state itself rather than the whole tuple.
state, info = env.reset()

# Transition model: for each state s and action a, P[s][a] is a list of
# (prob, next_state, reward, done) tuples.
P = env.unwrapped.P

print("Initial State:", state)
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)
# Read the grid shape from the map itself instead of hard-coding it.
print("Grid shape (rows, cols):", tuple(env.unwrapped.desc.shape))

# Collect every reward that appears in the transition model once, then report
# the smallest and largest (only the reward field of each tuple matters here).
rewards = {r for s in P for a in P[s] for (_, _, r, _) in P[s][a]}
print("Reward range:", (min(rewards), max(rewards)))

obs, info = env.reset(seed=42)  # fixed seed for reproducible behaviour
frame = env.render()  # returns ANSI text since render_mode='ansi'
print(frame)

gamma = 0.99   # discount factor
theta = 1e-8   # convergence threshold for value iteration
V = np.zeros(env.observation_space.n)

nS = env.observation_space.n  # number of states
nA = env.action_space.n       # number of actions

def value_iteration(env,gamma,theta=1e-4,max_iter=1000):
    """Estimate optimal state values with in-place value iteration.

    Each sweep replaces V[s] by the best expected one-step return
    max_a sum(prob * (reward + gamma * V[next_state])) over the transitions
    listed in the environment's model. Updates are in place, so values
    refreshed earlier in a sweep are used by states visited later in it.

    Args:
        env: Environment exposing ``env.unwrapped.P`` (where ``P[s][a]`` is a
            list of ``(prob, next_state, reward, done)`` tuples),
            ``env.observation_space.n`` and ``env.action_space.n``.
        gamma: Discount factor applied to the value of the next state.
        theta: Stop once the largest per-state change in a sweep is below this.
        max_iter: Upper bound on the number of sweeps.

    Returns:
        NumPy array of length ``env.observation_space.n`` with the state values.
    """
    # Take the model and sizes from the `env` argument rather than from
    # module-level globals, so the function works for whichever env is passed.
    P = env.unwrapped.P
    nS = env.observation_space.n
    nA = env.action_space.n

    V = np.zeros(nS)  # start with V(s)=0 for all states
    for _ in range(max_iter):
        delta = 0.0
        for s in range(nS):
            old_value = V[s]
            # Expected return Q(s, a) for every action; keep the best one.
            V[s] = max(
                sum(prob * (reward + gamma * V[next_s])
                    for prob, next_s, reward, _ in P[s][a])
                for a in range(nA)
            )
            delta = max(delta, abs(old_value - V[s]))  # largest change this sweep
        if delta < theta:
            break  # converged
    return V

# Solve for V*, then show it both as a flat vector and laid out on the grid.
V_opt=value_iteration(env,gamma,theta)
print("Optimal state values V*:")
print(V_opt)
print("V* reshaped as 4x4 grid:")
print(V_opt.reshape(4,4))

def optimal_policy(env,V,gamma):
    """Build the greedy policy for the state values ``V``.

    For every state, the expected return of each action is computed from the
    environment's transition model as sum(prob * (reward + gamma * V[next])),
    and the action with the highest value is selected (ties go to the lowest
    action index, as returned by ``np.argmax``).

    Args:
        env: Environment exposing ``env.unwrapped.P``,
            ``env.observation_space.n`` and ``env.action_space.n``.
        V: State values indexed by state.
        gamma: Discount factor applied to the value of the next state.

    Returns:
        Array of shape (n_states, n_actions); each row is one-hot, with 1.0 at
        the chosen action and 0.0 elsewhere.
    """
    model = env.unwrapped.P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    greedy = np.zeros((n_states, n_actions))
    for state in range(n_states):
        # Q(state, action) for every action, evaluated with the given V.
        action_values = [
            sum(p * (r + gamma * V[nxt]) for p, nxt, r, _ in model[state][act])
            for act in range(n_actions)
        ]
        # Put all probability mass on the best action.
        greedy[state, np.argmax(action_values)] = 1.0
    return greedy
# V_opt was already computed above with the same (env, gamma, theta); value
# iteration is deterministic, so reuse it instead of solving again.
policy=optimal_policy(env,V_opt,gamma)
print("Optimal policy (one-hot over actions):")
print(policy)

# Reading the output: each row is one state's distribution over actions.
# For example, a row [1. 0. 0. 0.] means action 0 (LEFT) has probability 1.0
# and all other actions have probability 0.

def plot(V,policy,col_ramp=1,dpi=175,draw_vals=False):
    """Visualize FrozenLake state values and policy arrows on the map grid.

    The grid layout is read from the module-level ``env``
    (``env.unwrapped.desc``). Each cell is shaded by its value, labelled with
    its tile letter, and optionally annotated with its value and an arrow for
    the policy's highest-probability action.

    Args:
        V: State values, one per grid cell in row-major order.
        policy: Array indexed by state whose rows score each action (the arrow
            shown is the row's argmax), or None to draw no arrows.
        col_ramp: If truthy use the 'cool' colour map, otherwise 'gray'.
        dpi: Figure resolution in dots per inch.
        draw_vals: If true, print each non-hole cell's value under its letter.
    """
    # NOTE: these rcParams changes are global and persist after the call.
    plt.rcParams['figure.dpi'] = dpi
    plt.rcParams.update({'axes.edgecolor': (0.32, 0.36, 0.38)})
    # Smaller font when the map has 8 rows, so labels still fit in the cells.
    plt.rcParams.update({'font.size': 6 if env.unwrapped.nrow == 8 else 8})
    plt.figure(figsize=(3, 3))

    desc = env.unwrapped.desc
    nrow, ncol = desc.shape  # grid size
    V_sq = V.reshape((nrow, ncol))  # lay the values out in the grid's shape

    # Colour intensity encodes the state value.
    plt.imshow(V_sq, cmap='cool' if col_ramp else 'gray', alpha=0.7)
    ax = plt.gca()

    arrow_dict = {0: '←', 1: '↓', 2: '→', 3: '↑'}
    tile_colors = {'H': 'red', 'G': 'green', 'S': 'blue'}  # others: black
    # Holes and the goal end the episode, so no action is taken from them; an
    # arrow there would only reflect an argmax tie-break, not a real choice.
    terminal_tiles = ('H', 'G')

    # Grid lines on the cell boundaries.
    for x in range(ncol + 1):
        ax.axvline(x - 0.5, lw=0.5, color='black')
    for y in range(nrow + 1):
        ax.axhline(y - 0.5, lw=0.5, color='black')

    for r in range(nrow):
        for c in range(ncol):
            s = r * ncol + c  # row-major state index of this cell
            tile = desc[r, c].decode('utf-8')  # bytes such as b'S' -> 'S'

            plt.text(c, r, tile, ha='center', va='center',
                     color=tile_colors.get(tile, 'black'),
                     fontsize=10, fontweight='bold')

            if draw_vals and tile != 'H':  # no value label on holes
                plt.text(c, r + 0.3, f"{V[s]:.2f}", ha='center', va='center',
                         color='black', fontsize=6)

            if policy is not None and tile not in terminal_tiles:
                best_action = np.argmax(policy[s])
                plt.text(c, r - 0.25, arrow_dict[best_action], ha='center',
                         va='center', color='purple', fontsize=12)

    plt.title("FrozenLake: Policy and State Values")
    plt.axis('off')
    plt.show()

# V_opt and policy were already computed above with these same arguments, and
# both computations are deterministic, so plot the existing results directly.
plot(V_opt,policy,draw_vals=True)

### Code cell 1 ###
