In [19]:
import numpy as np

# Grid dimensions; cells are addressed as (row, col).
rows=3
cols=4

# Special cells of the grid.
wall=(1,2)    # blocked cell, excluded from the state space
goal=(0,3)
danger=(1,3)

# Every cell in row-major order (row 0 first, columns left to right),
# skipping the wall.
states=[(r,c) for r in range(rows) for c in range(cols) if (r,c)!=wall]

actions=["UP","DOWN","LEFT","RIGHT"]

def reward(state):
    """Return the reward attached to ``state``.

    The goal cell yields 1.0, the danger cell -1.0, and every other cell
    a step cost of -0.04.
    """
    if state==goal:
        return 1.0
    if state==danger:
        return -1.0
    return -0.04


# Step 2: Transition Model (80-10-10)
def next_state(state,action):
    """Return the cell reached by taking ``action`` from ``state`` with no slip.

    Moves that would leave the grid are clamped at the border, and a move
    into the wall leaves the agent where it was. An unrecognised action
    does not move the agent.
    """
    row,col=state
    if action in ("UP","DOWN"):
        # Vertical move: only the row index changes, clamped on the side moved towards.
        row=max(row-1,0) if action=="UP" else min(row+1,rows-1)
    elif action in ("LEFT","RIGHT"):
        # Horizontal move: only the column index changes.
        col=max(col-1,0) if action=="LEFT" else min(col+1,cols-1)
    target=(row,col)
    # Bumping into the wall means staying put.
    return state if target==wall else target

def transition_probabilities(state,action):
    """Return ``{next_state: probability}`` for taking ``action`` in ``state``.

    The agent moves in the intended direction with probability 0.8 and
    slips to each of the two perpendicular directions with probability 0.1.
    Outcomes that land on the same cell (for example after bumping into a
    border or the wall) have their probabilities added together. The goal
    and danger cells are absorbing: they map to themselves with
    probability 1.0.
    """
    if state in [goal,danger]:
        return {state:1.0}

    # Perpendicular directions (left-hand, right-hand) relative to the
    # intended heading. Any action not listed is treated like "RIGHT".
    perpendicular={
        "UP":("LEFT","RIGHT"),
        "DOWN":("RIGHT","LEFT"),
        "LEFT":("DOWN","UP"),
    }
    left,right=perpendicular.get(action,("UP","DOWN"))

    outcomes=(
        (next_state(state,action),0.8),  # intended move
        (next_state(state,left),0.1),    # slip to the left-hand side
        (next_state(state,right),0.1),   # slip to the right-hand side
    )
    probs={}
    for cell,weight in outcomes:
        probs[cell]=probs.get(cell,0)+weight
    return probs

# Step 3: Testing with Discount Factor (gamma)
gamma=0.9

# (state, action) pairs to inspect; the last two start in the goal and
# danger cells.
examples=[((1,0),"RIGHT"),((2,2),"UP"),((0,0),"LEFT"),((1,1),"DOWN"),((0,3),"LEFT"),((1,3),"UP")]

# Print the transition distribution for the (0,2)/RIGHT query first, then
# for every pair in `examples`.
for state,action in [((0,2),"RIGHT")]+examples:
    transitions=transition_probabilities(state,action)
    print(f"\nFrom state {state}, action={action}:")
    for next_s,prob in transitions.items():
        print(f" -> {next_s} with P={prob:.2f}, Reward={reward(next_s)}")

def value_iteration(gamma,theta=1e-4,max_iter=1000):
    """Estimate optimal state values for the grid by value iteration.

    Values are updated in place, so states visited later in a sweep already
    see the values written earlier in that sweep.

    Args:
        gamma: Discount factor applied to the successor state's value.
        theta: Convergence threshold; sweeping stops once the largest change
            of any non-terminal value within a sweep falls below it.
        max_iter: Upper bound on the number of sweeps.

    Returns:
        Dict mapping every state in ``states`` to its estimated value.
    """
    terminals=(goal,danger)
    values=dict.fromkeys(states,0.0)
    sweeps=0
    while sweeps<max_iter:
        sweeps+=1
        largest_change=0.0
        for s in states:
            if s in terminals:
                # Terminal values are pinned to the cell's own reward.
                values[s]=reward(s)
                continue
            previous=values[s]
            best=-1e9  # sentinel, replaced by the first action's backup
            for a in actions:
                # Expected one-step return: reward of the cell entered plus
                # the discounted value of that cell.
                # NOTE(review): entering a terminal cell therefore contributes
                # reward(cell) + gamma*values[cell] = (1+gamma)*reward(cell),
                # because terminal values are pinned to the reward above.
                # Confirm this is intended and not double counting.
                backup=sum(
                    (p*(reward(nxt)+gamma*values[nxt])
                     for nxt,p in transition_probabilities(s,a).items()),
                    0.0,
                )
                if backup>best:
                    best=backup
            values[s]=best
            largest_change=max(largest_change,abs(previous-best))
        if largest_change<theta:
            break
    return values

def GammaExperiment():
    """Print the value of three reference cells for several discount factors.

    Runs ``value_iteration`` once per gamma in (0.0, 0.5, 0.9, 1.0) and
    reports the values at (2,0), (1,1) and (0,2), rounded to three decimals.
    """
    # Label printed in front of each value, paired with the cell it reports.
    probes=(
        ("V(start (2,0))     =",(2,0)),   # bottom-left start cell
        ("V(middle (1,1))    =",(1,1)),   # a middle cell
        ("V(near goal (0,2)) =",(0,2)),   # cell directly left of the goal
    )
    for g in (0.0,0.5,0.9,1.0):
        V=value_iteration(g)
        print(f"\n=== gamma={g} ===")
        for label,cell in probes:
            print(label,round(V[cell],3))

# Run the discount-factor comparison, then emit a blank separator line.
GammaExperiment()
print()

def value_iteration_with_policy(gamma,theta=1e-4,max_iter=1000):
    """Run value iteration and also record the greedy action of each state.

    Values are updated in place during a sweep. When several actions tie
    for the best backup, the one listed first in ``actions`` is kept.

    Args:
        gamma: Discount factor applied to the successor state's value.
        theta: Convergence threshold; sweeping stops once the largest change
            of any non-terminal value within a sweep falls below it.
        max_iter: Upper bound on the number of sweeps.

    Returns:
        Tuple ``(V, policy)``: ``V`` maps each state to its estimated value
        and ``policy`` maps each state to its best action. The goal and
        danger cells map to ``None``.
    """
    terminals=(goal,danger)
    values=dict.fromkeys(states,0.0)
    greedy=dict.fromkeys(states)  # every state starts with action None
    sweeps=0
    while sweeps<max_iter:
        sweeps+=1
        largest_change=0.0
        for s in states:
            if s in terminals:
                # Terminal cells: value pinned to the reward, no action.
                values[s]=reward(s)
                greedy[s]=None
                continue
            previous=values[s]
            best_value,best_action=-1e9,None
            for a in actions:
                # Expected one-step return: reward of the cell entered plus
                # the discounted value of that cell.
                # NOTE(review): entering a terminal cell therefore contributes
                # reward(cell) + gamma*values[cell] = (1+gamma)*reward(cell),
                # because terminal values are pinned to the reward above.
                # Confirm this is intended and not double counting.
                backup=sum(
                    (p*(reward(nxt)+gamma*values[nxt])
                     for nxt,p in transition_probabilities(s,a).items()),
                    0.0,
                )
                if backup>best_value:  # strict: first best action wins ties
                    best_value,best_action=backup,a
            values[s]=best_value
            greedy[s]=best_action
            largest_change=max(largest_change,abs(previous-best_value))
        if largest_change<theta:
            break
    return values,greedy

def visualize_policy(policy):
    """Print the grid with one symbol per cell describing ``policy``.

    The wall is shown as ``#``, the goal as ``G`` and the danger cell as
    ``D``. Every other cell shows an arrow for its action (``^ v < >``),
    or ``.`` when the policy has no recognised action for it.

    Args:
        policy: Dict mapping ``(row, col)`` states to an action name or None.
    """
    import numpy as np
    arrow={"UP":"^","DOWN":"v","LEFT":"<","RIGHT":">"}
    # Listed lowest priority first so that, should two special cells ever
    # coincide, the wall marker overrides the goal and the goal the danger.
    special={danger:"D",goal:"G",wall:"#"}
    grid=np.full((rows,cols)," ",dtype='<U2')
    for r in range(rows):
        for c in range(cols):
            cell=(r,c)
            if cell in special:
                grid[r,c]=special[cell]
            else:
                grid[r,c]=arrow.get(policy.get(cell),".")
    print("Optimal Policy (arrows):")
    print(grid)

# Solve the grid with discount 0.9 and draw the resulting policy.
gamma = 0.9
V, policy = value_iteration_with_policy(gamma)
visualize_policy(policy)


def optimal_path(start,gamma=0.9,max_steps=20):
    """Follow the greedy policy from ``start`` and return the cells visited.

    The policy is recomputed with ``value_iteration_with_policy(gamma)`` on
    every call. Each step applies the intended move only (no slip), so the
    result is the path taken if every action succeeds.

    Args:
        start: Cell to start from.
        gamma: Discount factor used to compute the policy.
        max_steps: Maximum number of moves to take.

    Returns:
        List of cells beginning with ``start``. It ends when the goal or
        danger cell is reached, when the policy has no action for the
        current cell, or after ``max_steps`` moves.
    """
    _,policy=value_iteration_with_policy(gamma)
    terminals=(goal,danger)
    current=start
    visited=[current]
    for _ in range(max_steps):
        if current in terminals:
            break
        move=policy[current]
        if move is None:
            break
        current=next_state(current,move)  # deterministic intended move (no slip)
        visited.append(current)
    return visited
def visualize_path(path):
    """Print the grid with each cell of ``path`` labelled by its step index.

    Unvisited cells are shown as ``-``. A cell visited more than once shows
    the index of its last visit. The goal, danger and wall markers
    (``G``, ``D``, ``#``) are written last and override any step number.

    Args:
        path: Sequence of ``(row, col)`` cells in visiting order.
    """
    import numpy as np
    grid=np.full((rows,cols),"-",dtype='<U2')
    # Mark visit order; the '<U2' dtype keeps at most two characters per cell.
    for step,(r,c) in enumerate(path):
        grid[r,c]=str(step)
    # Special cells are stamped afterwards, in this order, so they always show.
    for (r,c),mark in ((goal,"G"),(danger,"D"),(wall,"#")):
        grid[r,c]=mark
    print("Optimal Path (step numbers):")
    print(grid)
# Trace the greedy path from the bottom-left cell and draw it.
start = (2, 0)
path = optimal_path(start, gamma=0.9, max_steps=20)
print(f"Path states: {path}")
visualize_path(path)


From state (2, 0), action=UP:
 -> (1, 0) with P=0.80, Reward=-0.04
 -> (2, 0) with P=0.10, Reward=-0.04
 -> (2, 1) with P=0.10, Reward=-0.04

From state (0, 2), action=RIGHT:
 -> (0, 3) with P=0.80, Reward=1.0
 -> (0, 2) with P=0.20, Reward=-0.04

From state (1, 0), action=RIGHT:
 -> (1, 1) with P=0.80, Reward=-0.04
 -> (0, 0) with P=0.10, Reward=-0.04
 -> (2, 0) with P=0.10, Reward=-0.04

From state (2, 2), action=UP:
 -> (2, 2) with P=0.80, Reward=-0.04
 -> (2, 1) with P=0.10, Reward=-0.04
 -> (2, 3) with P=0.10, Reward=-0.04

From state (0, 0), action=LEFT:
 -> (0, 0) with P=0.90, Reward=-0.04
 -> (1, 0) with P=0.10, Reward=-0.04

From state (1, 1), action=DOWN:
 -> (2, 1) with P=0.80, Reward=-0.04
 -> (1, 1) with P=0.10, Reward=-0.04
 -> (1, 0) with P=0.10, Reward=-0.04

From state (0, 3), action=LEFT:
 -> (0, 3) with P=1.00, Reward=1.0

From state (1, 3), action=UP:
 -> (1, 3) with P=1.00, Reward=-1.0

=== gamma=0.0 ===
V(start (2,0))     = -0.04
V(middle (1,1))    = -0.04
V(near 