In [1]:
# Problem Statement 1

# Cells that is_valid rejects, so the agent can never move into them.
wall = [(1, 1)]
# Cells that iterative_policy_evaluation skips; their values are never updated.
terminal_states = ((1, 3), (2, 3))

# The policy under evaluation chooses each of 'L', 'R', 'U', 'D' with equal
# probability (0.25); that weight is applied inside value_function.

# When the agent does not follow the desired direction it moves perpendicular
# to it instead. Each table maps the desired direction to the direction
# actually taken for one of the two perpendicular outcomes.
environment_left = {
    'L': 'D',
    'R': 'U',
    'U': 'L',
    'D': 'R',
}
environment_right = {
    'L': 'U',
    'R': 'D',
    'U': 'R',
    'D': 'L',
}

#check validity of the cell
def is_valid(i,j):
    """Return True when cell (i, j) can be occupied.

    A cell is valid when it is not listed in the module-level ``wall`` and
    lies inside the grid: row ``i`` in 0..2 and column ``j`` in 0..3.
    """
    not_a_wall = (i, j) not in wall
    return not_a_wall and 0 <= i < 3 and 0 <= j < 4

#print matrix after convergence 
def print_values(V):
    """Print the 3x4 table ``V`` with row 2 first and row 0 last.

    Each row is preceded by a line holding a single space; every cell is
    rendered with two decimals and followed by ``|``.
    """
    for row in reversed(range(3)):
        print(" ")
        cells = [" %.2f|" % V[row][col] for col in range(4)]
        print("".join(cells))

#take action
def transition(action,i,j):
    """Return the cell reached from (i, j) by moving one step in ``action``.

    'L' / 'R' decrease / increase the column ``j``; 'U' / 'D' increase /
    decrease the row ``i``. No bounds or wall check is done here. Any other
    ``action`` yields the sentinel ``(-1, -1)``.
    """
    # (direction, row offset, column offset)
    steps = (
        ('L', 0, -1),
        ('R', 0, 1),
        ('U', 1, 0),
        ('D', -1, 0),
    )
    for direction, d_row, d_col in steps:
        if action == direction:
            return (i + d_row, j + d_col)
    return (-1, -1)

# Weights used by value_function: the desired move is taken with probability
# 0.8, each perpendicular move with probability 0.1, and the evaluated policy
# picks every action with probability 0.25.
_DESIRED_PROB = 0.8
_SLIP_PROB = 0.1
_ACTION_PROB = 0.25


def _move_value(i, j, direction, reward_matrix, values, discount_factor):
    """Return reward + discounted value of the cell reached by moving ``direction`` from (i, j).

    If the move is rejected by is_valid (off the grid or into a wall) the
    agent stays put, so the reward and value of (i, j) itself are used.
    """
    next_i, next_j = transition(direction, i, j)
    if not is_valid(next_i, next_j):
        next_i, next_j = i, j
    return reward_matrix[next_i][next_j] + discount_factor * values[next_i][next_j]


def value_function(i,j,reward,reward_matrix,discount_factor=1,values=None):
    """Return the one-step expected backup for cell (i, j).

    Averages over the four actions 'L', 'R', 'U', 'D' (weight 0.25 each). For
    every action the desired move counts 0.8 and the two perpendicular moves
    given by ``environment_left`` / ``environment_right`` count 0.1 each.

    Parameters:
        i, j: row and column of the cell being backed up.
        reward: not used by the computation; kept so existing calls still work.
        reward_matrix: table indexed ``[row][col]`` giving the reward of the
            cell the agent ends up in.
        discount_factor: multiplier applied to the successor cell's value.
        values: table indexed ``[row][col]`` holding the current value
            estimates. When omitted, the module-level ``V_pie`` is read, which
            is what this function always did before the parameter existed.

    Returns:
        The expected value for (i, j) as a number.
    """
    if values is None:
        # Backward-compatible fallback for callers that rely on the global table.
        values = V_pie

    value = 0
    for action in ['L','R','U','D']:
        desired = _move_value(i, j, action, reward_matrix, values, discount_factor)
        slip_left = _move_value(i, j, environment_left[action], reward_matrix, values, discount_factor)
        slip_right = _move_value(i, j, environment_right[action], reward_matrix, values, discount_factor)

        value_to_action = desired*_DESIRED_PROB + slip_left*_SLIP_PROB + slip_right*_SLIP_PROB

        value += value_to_action*_ACTION_PROB

    return value


def iterative_policy_evaluation(iter,epsilon,reward,reward_matrix,V_pie,discount_factor=1):
    """Sweep the 3x4 grid until the value table stops changing, then print it.

    Each sweep visits every cell that is neither in ``terminal_states`` nor in
    ``wall`` and overwrites its entry in ``V_pie`` with value_function's
    result. Updates are in place, so cells visited later in a sweep already
    see the new values of cells visited earlier.

    Parameters:
        iter: starting value of the sweep counter; it is incremented once per
            sweep and printed when the loop stops.
        epsilon: stop once the largest absolute change of any cell in a sweep
            is below this. Must be positive: the change is never negative, so
            a value <= 0 would loop forever.
        reward: forwarded to value_function, which does not use it.
        reward_matrix: reward table indexed ``[row][col]``.
        V_pie: value table indexed ``[row][col]``; modified in place.
        discount_factor: forwarded to value_function (default 1, the value
            that was previously always used).

    Returns:
        None. Prints the number of sweeps and the final table.
    """
    while True:
        delta = 0
        for i in range(3):
            for j in range(4):
                state = (i,j)
                if state in terminal_states or state in wall:  # terminal cells and walls keep their value
                    continue
                v = V_pie[i][j]
                # Pass the table explicitly so the result does not depend on
                # whatever a module-level V_pie happens to be bound to.
                V_pie[i][j] = value_function(i,j,reward,reward_matrix,discount_factor,V_pie)
                delta = max(delta,abs(v-V_pie[i][j]))
        iter += 1
        if delta < epsilon:
            print(f"Number of iterations to converge = {iter}")
            break
    print_values(V_pie)

# initialize the reward matrix with given reward value except the terminal states
def update_reward_matrix(reward):
    """Build a 3x4 reward table filled with ``reward``.

    Two cells are then overridden: (2, 3) gets 1 and (1, 3) gets -1. These are
    the cells listed in ``terminal_states``. Every row is a separate list.
    """
    grid = [[reward] * 4 for _ in range(3)]
    grid[1][3] = -1
    grid[2][3] = 1
    return grid

# initialize V_pie with all zeroes at start
def initialize_V_pie():
    """Return a fresh 3x4 value table (list of three independent rows) of integer zeros."""
    return [[0] * 4 for _ in range(3)]

In [2]:
# Values of r(S) to try; update_reward_matrix writes each one into every cell
# except the two it overrides with 1 and -1.
rewards = [-0.04, -2, 0.1, 0.02, 1]
# The sweeps stop once the largest change of any cell in a sweep is below this.
epsilon = 1e-8

# NOTE(review): the banner says "optimal policy", but value_function averages
# the four actions with weight 0.25 each -- confirm the intended wording.
print("Value Functions corresponding to optimal policy\n")
for reward in rewards:
    print(f"For r(S) : {reward}")
    # Start every run from an all-zero table. It is bound to the module-level
    # name V_pie, which value_function may look up as a global, so do not
    # rename it.
    V_pie = initialize_V_pie()
    reward_matrix = update_reward_matrix(reward)
    iterative_policy_evaluation(0, epsilon, reward, reward_matrix, V_pie)
    print("\n")

Value Functions corresponding to optimal policy

For r(S) : -0.04
Number of iterations to converge = 312
 
 -1.23| -0.83| -0.28| 0.00|
 
 -1.47| 0.00| -0.87| 0.00|
 
 -1.55| -1.47| -1.22| -1.17|


For r(S) : -2
Number of iterations to converge = 384
 
 -59.71| -46.01| -24.32| 0.00|
 
 -65.41| 0.00| -21.94| 0.00|
 
 -63.10| -52.80| -34.49| -20.75|


For r(S) : 0.1
Number of iterations to converge = 324
 
 2.95| 2.39| 1.44| 0.00|
 
 3.10| 0.00| 0.63| 0.00|
 
 2.85| 2.20| 1.15| 0.23|


For r(S) : 0.02
Number of iterations to converge = 284
 
 0.56| 0.55| 0.46| 0.00|
 
 0.49| 0.00| -0.23| 0.00|
 
 0.34| 0.11| -0.20| -0.57|


For r(S) : 1
Number of iterations to converge = 370
 
 29.80| 23.14| 12.48| 0.00|
 
 32.46| 0.00| 10.30| 0.00|
 
 31.11| 25.77| 16.43| 9.22|


