In [1]:
import numpy as np

In [2]:
# Initialize states and random policy

def initialization(num_rows=4, num_cols=4):
    '''
    Build an empty gridworld: zero values, per-state rewards and an
    equiprobable random policy.

    The top-left and bottom-right cells are terminal. Terminal states have
    reward 0 and no available actions; every other state has reward -1 and
    all four moves available.

    Arguments:
            num_rows - number of rows in the grid (default 4)
            num_cols - number of columns in the grid (default 4)
    Returns:
            V - a 2d numpy array of zeros with shape (num_rows, num_cols)
            R - dictionary mapping each state to its reward
            P - dictionary mapping each state to its list of action labels
                (equiprobable random policy; empty list for terminal states)
            states - list of (row, col) tuples in row-major order
            terminal_states - list containing the terminal states of the gridworld
    Raises:
            ValueError - if num_rows or num_cols is smaller than 1
    '''
    if num_rows < 1 or num_cols < 1:
        raise ValueError("num_rows and num_cols must both be at least 1")

    states = [(i, j) for i in range(num_rows) for j in range(num_cols)]
    # Opposite corners of the grid are the terminal cells.
    terminal_states = [(0, 0), (num_rows - 1, num_cols - 1)]
    # Size V from the grid dimensions rather than a literal so it always
    # matches `states`.
    V = np.zeros([num_rows, num_cols])
    R = {}
    P = {}
    for state in states:
        if state in terminal_states:
            R[state] = 0
            P[state] = []
        else:
            R[state] = -1
            # A fresh list per state so callers can edit one state's
            # actions without affecting the others.
            P[state] = ['L', 'R', 'D', 'U']
    return V, R, P, states, terminal_states

In [3]:
# Function to implement Value iteration
def value_iteration(V,P,R,states,terminal_states,max_iterations=1000,tol=0.0):
    '''
    Run synchronous value iteration on a deterministic gridworld and return
    the greedy policy with respect to the resulting values.

    Each sweep sets the value of every non-terminal state to the best
    R[state] + V[next_state] over the moves that stay inside the grid;
    terminal states are held at 0. Moves that would leave the grid are
    ignored. Sweeping stops as soon as no value changes by more than `tol`,
    or after `max_iterations` sweeps.

    Arguments:
            V - initial state values, indexable by (row, col) tuples
                (a 2d array or a dictionary); it is not modified
            P - dictionary mapping each state to a list of action labels;
                entries of non-terminal states are overwritten in place
            R - dictionary containing rewards for each state
            states - list containing tuples of states in the gridworld
            terminal_states - list containing terminal states of the gridworld
            max_iterations - upper bound on the number of sweeps (default 1000)
            tol - stop once the largest value change in a sweep is <= tol
                  (default 0.0, i.e. stop at an exact fixed point)
    Returns:
            P - the same dictionary, now holding for every non-terminal state
                the list of all optimal actions (ties kept, in the order
                L, U, R, D)
    '''
    # (row offset, column offset, label); this order fixes how ties are
    # listed in the returned policy.
    Actions = [[0,-1,'L'],[-1,0,'U'],[0,1,'R'],[1,0,'D']]
    state_set = set(states)
    terminal = set(terminal_states)

    # Keep values in a dict from the start, so lookups behave the same on
    # every sweep regardless of the type of the V that was passed in.
    values = {state: V[state] for state in states}

    def successors(state):
        '''Yield (label, next_state) for every move that stays on the grid.'''
        for d_row, d_col, label in Actions:
            nxt = (state[0] + d_row, state[1] + d_col)
            # Membership in `states` is the bounds check, so any grid size works.
            if nxt in state_set:
                yield label, nxt

    # Loop for convergence of value function
    for _sweep in range(max_iterations):
        updated = {}
        for state in states:
            if state in terminal:
                updated[state] = 0
                continue
            backups = [R[state] + values[nxt] for label, nxt in successors(state)]
            # A state with no legal move keeps its previous value.
            updated[state] = max(backups) if backups else values[state]

        delta = max((abs(updated[s] - values[s]) for s in states), default=0)
        values = updated
        # The update is deterministic, so once a sweep changes nothing all
        # later sweeps would be identical; stopping here loses nothing.
        if delta <= tol:
            break

    # Find optimal policy using the converged values of all states
    for state in states:
        if state in terminal:
            continue
        best_value = None
        best_actions = []
        for label, nxt in successors(state):
            # Use the same reward as the value backup instead of a literal -1.
            val = R[state] + values[nxt]
            if best_value is None or val > best_value:
                best_value = val
                best_actions = [label]
            elif val == best_value:
                best_actions.append(label)
        P[state] = best_actions

    return P

In [4]:
# Build the gridworld, then replace the random policy with the optimal one.
V, R, P, states, terminal_states = initialization()
P = value_iteration(V, P, R, states, terminal_states)

# Show the optimal policy as a grid: one printed line per grid row, where
# each entry lists the optimal action(s) for that cell.
print("Optimal policy is: ")
for row in range(4):
    line = ''
    for col in range(4):
        # Every entry is followed by a single space, including the last one.
        line += str(P[(row, col)]) + ' '
    print(line)

Optimal policy is: 
[] ['L'] ['L'] ['L', 'D'] 
['U'] ['L', 'U'] ['L', 'U', 'R', 'D'] ['D'] 
['U'] ['L', 'U', 'R', 'D'] ['R', 'D'] ['D'] 
['U', 'R'] ['R'] ['R'] [] 
