In [1]:
import numpy as np

In [2]:
# Initialize states and random policy

def initialization():
    '''
    Returns:
            V - 2d array initialized to 0
            P - equiprobable random policy
    '''
    V = np.zeros([4,4])
    P = [[[],['L','R','D'],['L','R','D'],['L','D']],
         [['U','R','D'],['U','R','L','D'],['U','R','L','D'],['U','L','D']],
         [['U','R','D'],['U','R','L','D'],['U','R','L','D'],['U','L','D']],
         [['U','R'],['L','R','U'],['L','R','U'],[]]]
    return V,P

In [17]:
# Function to implement Value iteration
def value_iteration(V,P):
    '''
    Arguments:
            V - 2d array initialized to 0
            P - equiprobable random policy
    Returns:
            P - optimal policy
    '''
    num_iterations = 0
    # Loop for convergence of value function
    while num_iterations < 1000:
        V1 = np.zeros([4,4])
        for i in range(4):
            for j in range(4):
                # Evaluate value function using DP methods
                if (i == 0 and j == 0) or (i == 3 and j == 3):
                    continue
                elif i == 0 and j == 3:
                    V1[i][j] = max(0.25*(-1 + V[i][j-1]),0.25*(-1 + V[i+1][j]))
                elif i == 3 and j == 0:
                    V1[i][j] = max(0.25*(-1 + V[i-1][j]),0.25*(-1 + V[i][j+1]))
                elif i == 0:
                    V1[i][j] = max(0.25*(-1 + V[i][j-1]),0.25*(-1 + V[i][j+1]),0.25*(-1 + V[i+1][j]))
                elif i == 3:
                    V1[i][j] = max(0.25*(-1 + V[i][j-1]),0.25*(-1 + V[i][j+1]),0.25*(-1 + V[i-1][j]))
                elif j == 0:
                    V1[i][j] = max(0.25*(-1 + V[i][j+1]),0.25*(-1 + V[i-1][j]),0.25*(-1 + V[i+1][j]))
                elif j == 3:
                    V1[i][j] = max(0.25*(-1 + V[i][j-1]),0.25*(-1 + V[i-1][j]),0.25*(-1 + V[i+1][j]))
                else:
                    V1[i][j] = max(0.25*(-1 + V[i-1][j]),0.25*(-1 + V[i+1][j]),0.25*(-1 + V[i][j-1]),0.25*(-1+V[i][j+1]))
        
        V = V1
        num_iterations += 1
        
    Actions = [[0,-1,'L'],[-1,0,'U'],[0,1,'R'],[1,0,'D']]
    
    # Find optimal policy using precomputed values of all states   
    for i in range(4):
        for j in range(4):
            if (i == 0 and j == 0) or (i == 3 and j == 3):
                continue
            optimal_policy = []
            max_val = -1e10
            for a in Actions:
                x = i + a[0]
                y = j + a[1]
                if x in range(4) and y in range(4): 
                    val = -1 + V[x][y]
                    if val > max_val:
                        optimal_policy = [a[2]]
                        max_val = val
                    elif val == max_val:
                        optimal_policy.append(a[2])
            P[i][j] = optimal_policy
    return P

In [19]:
V,P = initialization()
P = value_iteration(V,P)
print("Optimal policy function is: ")
for row in P:
    print(row)

Optimal value function is: 
[[ 0.       -0.25     -0.3125   -0.328125]
 [-0.25     -0.3125   -0.328125 -0.3125  ]
 [-0.3125   -0.328125 -0.3125   -0.25    ]
 [-0.328125 -0.3125   -0.25      0.      ]]
[[[], ['L'], ['L'], ['L', 'D']], [['U'], ['L', 'U'], ['L', 'U', 'R', 'D'], ['D']], [['U'], ['L', 'U', 'R', 'D'], ['R', 'D'], ['D']], [['U', 'R'], ['R'], ['R'], []]]
