In [74]:
import numpy as np

In [75]:
def initialization():
    '''
    Build the starting data for the 4x4 grid world.

    Returns:
            V - 4x4 array of zeros (initial value estimates)
            R - 4x4 array holding the reward associated with each state
            P - 4x4 nested list; P[i][j] lists the action labels
                ('L', 'R', 'U', 'D') available in state (i, j).
                The entry for state (0, 0) is empty.
    '''
    grid_shape = (4, 4)
    V = np.zeros(grid_shape)

    R = np.array([
        [0, -10, -10, -10],
        [-1, -1, -1, -1],
        [-10, -10, -10, -1],
        [-1, -1, -1, -1],
    ])

    # One string per state; each character is an action label. The order of
    # the characters is kept exactly as the per-state action order.
    action_orders = (
        ("", "LRDU", "LRDU", "LDRU"),
        ("URDL", "URLD", "URLD", "ULDR"),
        ("URDL", "URLD", "URLD", "ULDR"),
        ("URLD", "LRUD", "LRUD", "LURD"),
    )
    # list(...) creates a fresh list for every cell, so no two states share
    # the same list object.
    P = [[list(order) for order in row] for row in action_orders]

    return V, R, P

In [76]:
def policy_evaluation(V,R,P,num_sweeps=1000):
    '''
    Iteratively evaluate the policy P on the grid world.

    Arguments:
            V is the 2d array of starting value estimates
            R is the array containing rewards for each state
            P is the policy taken by the agent; P[i][j] is the list of
              action labels ('L', 'R', 'U', 'D') chosen in state (i, j),
              each taken with equal probability
            num_sweeps is the number of full sweeps over the grid
              (defaults to 1000)
    Returns:
            V - The value function calculated for the policy P
    '''
    n_rows = len(V)
    n_cols = len(V[0]) if n_rows else 0
    # (row offset, column offset) produced by each action label
    moves = {'L': (0, -1), 'R': (0, 1), 'U': (-1, 0), 'D': (1, 0)}

    for _ in range(num_sweeps):
        # Each sweep builds a new table from the previous one
        V1 = np.zeros([n_rows, n_cols])
        for i in range(n_rows):
            for j in range(n_cols):
                actions = P[i][j]
                # State (0, 0) is terminal and keeps value 0; a state with
                # no listed actions contributes nothing either.
                if (i == 0 and j == 0) or not actions:
                    continue
                # Weight each action by 1/len(actions) so the probabilities
                # sum to 1 even when the policy lists fewer than four
                # actions (a fixed 0.25 would under-weight such states).
                prob = 1.0 / len(actions)
                for a in actions:
                    if a not in moves:
                        # Unrecognised labels are ignored
                        continue
                    di, dj = moves[a]
                    ni, nj = i + di, j + dj
                    # A move that would leave the grid keeps the agent in place
                    if not (0 <= ni < n_rows and 0 <= nj < n_cols):
                        ni, nj = i, j
                    V1[i][j] += prob * (R[i][j] + V[ni][nj])

        V = V1
    return V

In [77]:
def policy_improvement(V,R,P,P1):
    '''
    Make the policy greedy with respect to the value function V.

    Arguments:
            V is the value function
            R is the array containing rewards for each state
            P is the table of candidate actions per state (the equiprobable
              random policy); its entries are overwritten in place with the
              greedy actions
            P1 is the previous optimal policy
    Returns:
            P - Optimal policy after performing policy improvement
            policy_stable - bool variable denoting if P = P1
    '''
    n_rows = len(V)
    n_cols = len(V[0]) if n_rows else 0
    # (row offset, column offset) produced by each action label
    moves = {'L': (0, -1), 'R': (0, 1), 'U': (-1, 0), 'D': (1, 0)}

    policy_stable = True
    # Iterate over all states to find the optimal policy
    for i in range(n_rows):
        for j in range(n_cols):
            # Skip terminal state
            if i == 0 and j == 0:
                continue
            optimal_policy = []
            max_val = -1e10
            for a in P[i][j]:
                if a not in moves:
                    # Unrecognised labels are ignored
                    continue
                di, dj = moves[a]
                ni, nj = i + di, j + dj
                # Actions that take the agent out of the grid are skipped
                if not (0 <= ni < n_rows and 0 <= nj < n_cols):
                    continue
                val = R[i][j] + V[ni][nj]
                if val > max_val:
                    # Strictly better action: restart the list of best actions
                    max_val = val
                    optimal_policy = [a]
                elif val == max_val:
                    # Exact tie: keep every best action, in the order the
                    # candidates were listed
                    optimal_policy.append(a)

            # Check if policy for state (i, j) has changed. This is a list
            # comparison, so the order of the actions matters.
            if P1[i][j] != optimal_policy:
                policy_stable = False

            P[i][j] = optimal_policy

    return P,policy_stable

In [78]:
V,R,P1 = initialization()

# Perform policy iteration until the policy doesn't change for any state in an iteration
policy_stable = False
while not policy_stable:
    # Equiprobable random policy. policy_improvement overwrites the entries
    # of the table it receives, so a fresh table is taken from
    # initialization() on every round instead of repeating the literal here.
    P = initialization()[2]

    P1,policy_stable = policy_improvement(V,R,P,P1)
    # If policy stable is true, the policy hasn't changed for any state in an
    # iteration, so the loop ends without re-evaluating it
    if not policy_stable:
        V = policy_evaluation(V,R,P1)

# Print optimal policy
# Each cell denotes the optimal action that needs to be taken from that state
for row in P1:
    print(row)

[[], ['L'], ['D'], ['D']]
[['U'], ['L'], ['L'], ['L']]
[['U'], ['U'], ['U'], ['U']]
[['R'], ['R'], ['R'], ['U']]
