In [1]:
import numpy as np

In [2]:

def initialization():
    '''
    Build the starting data for the 4x4 grid world.

    Returns:
            V - 4x4 float array filled with zeros (initial state values)
            R - 4x4 integer array holding the reward of each state
            P - 4x4 nested list; every cell lists the actions of the
                equiprobable random policy (the top-left cell gets [])
    '''
    V = np.zeros((4, 4))

    cheap_row = [-1] * 4
    R = np.array([
        [0, -10, -10, -10],
        cheap_row,
        [-10, -10, -10, -1],
        cheap_row,
    ])

    # One string per cell, one letter per action:
    # L - left, R - right, U - up, D - down
    action_letters = (
        ("",     "LRDU", "LRDU", "LDRU"),
        ("URDL", "URLD", "URLD", "ULDR"),
        ("URDL", "URLD", "URLD", "ULDR"),
        ("URLD", "LRUD", "LRUD", "LURD"),
    )
    # list(...) creates a fresh, independent action list for every cell.
    P = [[list(cell) for cell in row] for row in action_letters]
    return V, R, P

In [10]:
# Function to implement Value iteration
def value_iteration(V, P, rewards=None, gamma=1.0, max_sweeps=1000):
    '''
    Run value iteration on the grid world and extract the greedy policy.

    Each sweep applies the Bellman optimality backup
        V(s) = rewards[s] + gamma * max over on-grid neighbours s' of V(s')
    synchronously to every non-terminal state. Moves are deterministic and
    moves that would leave the grid are not available. The top-left cell
    (0, 0) is the terminal state and keeps the value 0.

    Arguments:
            V - 2d array of initial state values (typically all zeros)
            P - 2d nested list of per-state action lists; it is overwritten
                in place with the greedy actions and also returned
            rewards - 2d array with the reward of each state. When omitted,
                the notebook-level ``R`` is used so that existing
                ``value_iteration(V, P)`` calls keep working.
            gamma - discount factor applied to the successor state's value
            max_sweeps - upper bound on the number of sweeps
    Returns:
            P - optimal policy: for every non-terminal state the list of
                actions ('L', 'U', 'R', 'D') that are greedy with respect to
                the final values (tied actions are all kept)
    '''
    if rewards is None:
        # Legacy call style value_iteration(V, P): fall back to the reward
        # grid defined at notebook level.
        rewards = R
    rewards = np.asarray(rewards)
    n_rows, n_cols = rewards.shape
    terminal = (0, 0)

    # (row offset, column offset, label). This order also fixes the order in
    # which tied actions are listed in the returned policy.
    actions = [(0, -1, 'L'), (-1, 0, 'U'), (0, 1, 'R'), (1, 0, 'D')]

    def successors(i, j):
        '''Yield (label, row, col) for every move from (i, j) that stays on the grid.'''
        for d_row, d_col, label in actions:
            x, y = i + d_row, j + d_col
            if 0 <= x < n_rows and 0 <= y < n_cols:
                yield label, x, y

    states = [(i, j)
              for i in range(n_rows)
              for j in range(n_cols)
              if (i, j) != terminal]

    # Work on a private float copy so the caller's V is never modified.
    values = np.array(V, dtype=float)[:n_rows, :n_cols]

    # Loop for convergence of value function
    for _sweep in range(max_sweeps):
        new_values = np.zeros((n_rows, n_cols))
        for i, j in states:
            # No policy-probability weight here: value iteration takes the
            # max over actions of the full one-step return.
            best_next = max(values[x][y] for _label, x, y in successors(i, j))
            new_values[i][j] = rewards[i][j] + gamma * best_next
        # Once a sweep reproduces its input exactly, every further sweep
        # would too, so stopping here gives the same result as running on.
        reached_fixed_point = np.array_equal(new_values, values)
        values = new_values
        if reached_fixed_point:
            break

    # Find optimal policy using precomputed values of all states
    for i, j in states:
        best_actions = []
        best_val = -1e10
        for label, x, y in successors(i, j):
            val = rewards[i][j] + gamma * values[x][y]
            if val > best_val:
                best_actions = [label]
                best_val = val
            elif val == best_val:
                best_actions.append(label)
        P[i][j] = best_actions
    return P

In [13]:
# Set up the grid world, solve it, then show the greedy action(s) per cell.
V, R, P = initialization()
P = value_iteration(V, P)

# Print optimal policy, one grid row per output line
print("The optimal policy is: ")
for policy_row in P:
    print(policy_row)

The optimal policy is: 
[[], ['L'], ['D'], ['D']]
[['U'], ['L'], ['L'], ['L']]
[['U'], ['U'], ['U'], ['U']]
[['R'], ['R'], ['R'], ['U']]
