In [3]:
# Imports
import sys
import numpy as np

# Print options: raise the summarisation threshold so printed arrays are
# shown in full instead of being abbreviated with "..."
np.set_printoptions(threshold=sys.maxsize)

# Sanity check data
# grid_world: 2-D array indexed as [y, x]; its entries are returned as rewards
#             and cells equal to -20 are treated as terminal by the routines below.
# solution_task31: reference state values compared against the Task 3.1 result.
grid_world = np.load('data/gridworld.npy')
solution_task31 = np.load('data/stateValuesTask1.npy')

In [23]:
# Global variables
# DISCOUNT: factor applied to the successor state's value in the value update.
DISCOUNT = 0.9
# THRESHOLD: sweeps are repeated while the largest per-cell change exceeds this.
THRESHOLD = 0.007
# GOAL_S: goal cell ('x' is the column index, 'y' the row index of grid_world).
GOAL_S = {'x':8,'y':3}
# Policies are lists of (action key, selection probability, arrow glyph).
# The probabilities of each list sum to 1.
# actions: the four axis-aligned moves.
actions = [('r', 0.625, u'\u2192'), ('l', 0.125, u'\u2190'), ('u', 0.125, u'\u2191'), ('d', 0.125, u'\u2193')]
# actions_diag: the four axis-aligned moves plus the four diagonal moves.
actions_diag = [('r', 0.5625, u'\u2192'), ('l', 0.0625, u'\u2190'), ('u', 0.0625, u'\u2191'), ('d', 0.0625, u'\u2193'), 
                ('lu', 0.0625, u'\u2196'), ('ru', 0.0625, u'\u2197'), ('rd', 0.0625, u'\u2198'), ('ld', 0.0625, u'\u2199')]

# Common routines
def is_outside(state):
    """Tell whether `state` falls outside the bounds of `grid_world`.

    `state` is a dict whose 'y' entry is checked against the first axis of
    `grid_world` and whose 'x' entry is checked against the second axis.
    Returns True when either coordinate is negative or past the last index.
    """
    n_rows = grid_world.shape[0]
    n_cols = grid_world.shape[1]
    inside = 0 <= state['y'] < n_rows and 0 <= state['x'] < n_cols
    return not inside

def terminal_state(state, goal_inc=True):
    """Tell whether `state` is terminal.

    A state is terminal when it is the goal cell (only checked if `goal_inc`
    is true), when it lies outside the grid, or when its grid cell holds -20
    (a cell "X").
    """
    # Goal reached (optional) or boundary limits exceeded
    reached_goal = goal_inc and state == GOAL_S
    if reached_goal or is_outside(state):
        return True

    # Inside the grid: terminal only if this is a cell X
    return bool(grid_world[state['y'], state['x']] == -20)

def get_reward(state):
    """Return the reward for entering `state`.

    Leaving the grid yields -30; otherwise the reward is the value stored in
    `grid_world` at row `state['y']`, column `state['x']`.
    """
    return -30 if is_outside(state) else grid_world[state['y'], state['x']]

def move(state, action, correction=True):
    """Apply one of the four axis-aligned actions to `state` in place.

    Parameters:
        state: dict with 'x' and 'y'; updated in place when the move is taken.
        action: 'l', 'r', 'u' or 'd'; any other value leaves the position as is.
        correction: when true, a move into a terminal cell (outside the grid
            or a cell X; the goal does not count) is not applied to `state`.

    Returns:
        The reward of the attempted target cell, whether or not `state` moved.
    """
    # (dx, dy) per action; 'u' decreases the row index y, 'd' increases it
    offsets = {'l': (-1, 0), 'r': (1, 0), 'u': (0, -1), 'd': (0, 1)}
    dx, dy = offsets.get(action, (0, 0))

    target = state.copy()
    target['x'] += dx
    target['y'] += dy

    blocked = terminal_state(target, goal_inc=False)
    if not (blocked and correction):
        state['x'], state['y'] = target['x'], target['y']

    return get_reward(target)

def extended_move(state, action, correction=True):
    """Apply one of the eight (axis-aligned or diagonal) actions to `state` in place.

    Parameters:
        state: dict with 'x' and 'y'; updated in place when the move is taken.
        action: 'l', 'r', 'u', 'd', 'lu', 'ru', 'ld' or 'rd'; any other value
            leaves the position as is.
        correction: when true, a move into a terminal cell (outside the grid
            or a cell X; the goal does not count) is not applied to `state`.

    Returns:
        The reward of the attempted target cell, whether or not `state` moved.
    """
    # (dx, dy) per action; 'u' decreases the row index y, 'd' increases it
    offsets = {
        'l': (-1, 0), 'r': (1, 0), 'u': (0, -1), 'd': (0, 1),
        'lu': (-1, -1), 'ru': (1, -1), 'ld': (-1, 1), 'rd': (1, 1),
    }
    dx, dy = offsets.get(action, (0, 0))

    target = state.copy()
    target['x'] += dx
    target['y'] += dy

    blocked = terminal_state(target, goal_inc=False)
    if not (blocked and correction):
        state['x'], state['y'] = target['x'], target['y']

    return get_reward(target)

def get_actions(action):
    """Return the possible outcomes of attempting `action`.

    The first entry is the intended action itself, followed by its two
    neighbouring directions. Every returned key is one of the eight action
    keys understood by `extended_move` ('l', 'r', 'u', 'd', 'lu', 'ru',
    'ld', 'rd').

    Returns:
        A new list of three action keys, or None when `action` is not one of
        the eight known keys.
    """
    outcomes = {
        'l': ['l', 'lu', 'ld'],
        'r': ['r', 'ru', 'rd'],
        'u': ['u', 'ru', 'lu'],
        # Diagonals are spelled horizontal-first ('ld', 'rd'); 'dl'/'dr' would
        # not match any branch of extended_move and would act as "stay put".
        'd': ['d', 'ld', 'rd'],
        'lu': ['lu', 'l', 'u'],
        'ld': ['ld', 'l', 'd'],
        'ru': ['ru', 'r', 'u'],
        'rd': ['rd', 'r', 'd'],
    }
    return outcomes.get(action)

def non_deterministic_move(state, action, V):
    """Enumerate the outcomes of attempting `action` from `state`.

    The outcomes are the entries of `get_actions(action)`, taken with
    probabilities 0.8, 0.1 and 0.1 in that order. Each outcome is simulated
    with `extended_move` on a private copy, so `state` itself is not modified.

    Returns:
        (transitions, rewards, state_values): three parallel lists holding,
        per outcome, its probability, the reward returned by `extended_move`
        and the entry of `V` at the resulting position.
    """
    # Transition probabilities, aligned with the order of get_actions()
    transitions = [0.8, 0.1, 0.1]
    rewards, state_values = [], []

    for outcome in get_actions(action):
        # Simulate on a copy so every outcome starts from the same state
        landing = state.copy()
        rewards.append(extended_move(landing, outcome))
        state_values.append(V[landing['y'], landing['x']])

    return (transitions, rewards, state_values)
    
def compute_state_value_function(actions, diag_move=False, deterministic=True):
    """Evaluate a fixed stochastic policy by repeated sweeps over the grid.

    Parameters:
        actions: iterable of (action key, probability, arrow) tuples; the
            probability is the weight of that action in every state.
        diag_move: in deterministic mode, use `extended_move` (eight
            directions) instead of `move` (four directions). Ignored in
            non-deterministic mode, which always uses `extended_move`.
        deterministic: when False, every action is expanded into the weighted
            outcomes returned by `non_deterministic_move`.

    Returns:
        An array shaped like `grid_world` with the value of every state.
        Terminal states (goal, cells X) are never updated and keep value 0.

    Each sweep reads values from a snapshot taken at the start of the sweep.
    Sweeps stop once the largest per-cell change is no greater than THRESHOLD.
    """
    # Value function matrix (always float so values are never truncated)
    V = np.zeros_like(grid_world, dtype=float)

    # Largest change seen in the current sweep
    delta = np.inf

    while delta > THRESHOLD:
        delta = 0

        # Snapshot of the values at the start of this sweep
        V_copy = V.copy()

        for y in range(grid_world.shape[0]):
            for x in range(grid_world.shape[1]):
                state = {'x': x, 'y': y}

                if terminal_state(state):
                    continue

                # Accumulate in a local so the cell indices (y, x) are never
                # disturbed by the inner loops.
                new_value = 0
                for action, probability, _ in actions:
                    state_a = state.copy()

                    if not deterministic:
                        trans, rewards, state_values = non_deterministic_move(state_a, action, V_copy)
                        for trans_p, reward, next_value in zip(trans, rewards, state_values):
                            new_value += probability * (trans_p * (reward + DISCOUNT * next_value))
                    else:
                        reward = extended_move(state_a, action) if diag_move else move(state_a, action)
                        new_value += probability * (reward + DISCOUNT * V_copy[state_a['y'], state_a['x']])

                # Track the largest change, then store the new value
                delta = max(delta, np.abs(V[y, x] - new_value))
                V[y, x] = new_value
    return V

def compute_action_value_function(state_value_table, actions, diag_move=False):
    """Build a greedy policy table from a state-value table.

    Despite its name, this function returns a policy, not action values.

    Parameters:
        state_value_table: array of state values shaped like `grid_world`;
            it is copied, not modified.
        actions: iterable of (action key, probability, arrow) tuples; only the
            key and the arrow are used.
        diag_move: use `extended_move` (eight directions) instead of `move`.

    Returns:
        An object array shaped like `state_value_table` holding, per cell:
        '@' for the goal, 'X' for any other terminal cell, otherwise the
        arrow(s) of the action(s) leading to the neighbouring cell with the
        highest value. Tied actions contribute one arrow each, in the order
        of `actions`. Actions that lead outside the grid or into a cell X are
        never selected.
    """
    V = state_value_table.copy()
    policy = V.copy().astype(object)

    # Make the goal the most attractive neighbour of any adjacent cell
    V[GOAL_S['y'], GOAL_S['x']] = np.inf

    step = extended_move if diag_move else move

    for y in range(grid_world.shape[0]):
        for x in range(grid_world.shape[1]):
            state = {'x': x, 'y': y}

            if state == GOAL_S:
                policy[y, x] = '@'
                continue

            if terminal_state(state, goal_inc=True):
                policy[y, x] = 'X'
                continue

            # Collect (arrow, value of the reached cell) for every usable action
            candidates = []
            best_value = -np.inf
            for action, _, arrow in actions:
                state_a = state.copy()
                # correction=False lets the copy actually land on the target
                # cell, so blocked moves can be detected and skipped below.
                step(state_a, action, correction=False)

                if terminal_state(state_a, goal_inc=False):
                    continue

                value = V[state_a['y'], state_a['x']]
                candidates.append((arrow, value))
                if value > best_value:
                    best_value = value

            # Several actions may reach equally valued cells: keep them all
            policy[y, x] = ''.join(arrow for arrow, value in candidates if value == best_value)

    return policy

## Task 3.1

In [16]:
# Task 3.1: state values of the four-action policy (deterministic moves)
result_task31 = compute_state_value_function(actions)
# Sum of absolute per-cell differences against the reference solution
diff = sum(abs(solution_task31 - result_task31).flatten())

print("Accumulative difference from the solution results: {}\n\n{}".format(diff, result_task31))

Accumulative difference from the solution results: 0.34249411007374775

[[ -55.72233031  -60.28711964  -76.2187774   -85.94227143  -95.17265171
  -102.06584518 -109.1497957  -128.03694587 -154.35820998]
 [ -70.23910801  -71.03512149  -82.42332396  -97.04567539 -113.44958832
     0.         -138.84246494    0.         -140.12747124]
 [-114.35530129    0.            0.            0.            0.
     0.         -156.42672419    0.          -71.57989163]
 [-113.49035193    0.          -67.70838784  -82.25522086 -104.2432133
  -131.62855066    0.           67.97547745    0.        ]
 [ -66.87419882  -61.7022072   -77.43326267    0.            0.
     0.          -51.56820765  -41.72566933  -63.11667048]
 [ -64.48159917  -63.86372895  -69.52992994  -77.89104824  -83.19067425
   -88.81991069  -95.51570805    0.         -145.95801304]
 [ -81.40779974    0.            0.            0.            0.
     0.            0.         -146.67829987 -167.44391259]
 [ -24.06403261  -19.82047396  -28.4

## Task 3.2

In [17]:
# Task 3.2: arrow policy derived from the Task 3.1 state values (four actions)
policy32 = compute_action_value_function(result_task31, actions)

In [18]:
# Show the policy grid: '@' marks the goal, 'X' the other terminal cells
print("Task 3.2 policy: \n\n", policy32)

Task 3.2 policy: 

 [['→' '←' '←' '←' '←' '←' '←' '←' '←']
 ['↑' '↑' '←' '←' '↑' 'X' '↑' 'X' '↓']
 ['↑' 'X' 'X' 'X' 'X' 'X' '↑' 'X' '↓']
 ['↓' 'X' '↓' '←' '←' '←' 'X' '→' '@']
 ['→' '↓' '←' 'X' 'X' 'X' '→' '↑' '↑']
 ['→' '↑' '←' '←' '←' '←' '↑' 'X' '↑']
 ['↓' 'X' 'X' 'X' 'X' 'X' 'X' '↓' '↑']
 ['→' '←' '←' '←' '←' '←' '←' '←' 'X']
 ['↑' '↑' '↑' '↑' 'X' '↑' '←' '←' 'X']]


## Task 3.3

In [19]:
# Task 3.3: state values of the eight-action policy with diagonal moves enabled
result_task33 = compute_state_value_function(actions_diag, diag_move=True)
# Sum of absolute per-cell differences against the Task 3.1 values
diff = sum(abs(result_task33 - result_task31).flatten())

# The comparison is against result_task31, so the label names Task 3.1
print("Accumulative difference from the Task 3.1 results: {}\n\n{}".format(diff, result_task33))

Accumulative difference from the solution 3.2 results: 1024.1516773911771

[[ -78.85295654  -84.78663318 -100.82983448 -111.41432921 -122.39915622
  -132.93539283 -144.82161683 -170.07232622 -207.24509049]
 [ -87.24992879  -88.45634884  -98.81538951 -113.13610521 -129.96200609
     0.         -140.33734441    0.         -179.61116839]
 [-131.36156914    0.            0.            0.            0.
     0.         -104.20498146    0.          -90.89994106]
 [-126.45709436    0.          -67.15456507  -77.81474247  -94.73275553
  -112.8608242     0.           35.28098586    0.        ]
 [ -76.35713138  -70.98274204  -85.70573896    0.            0.
     0.          -60.32823267  -59.43878516  -78.48338783]
 [ -76.1775224   -75.05042911  -81.76856604  -89.43287221  -96.023486
   -99.67864854 -110.44750388    0.         -148.21199079]
 [ -91.26110919    0.            0.            0.            0.
     0.            0.         -147.62609185 -177.50236181]
 [ -41.96712871  -41.94574064  -49

In [20]:
# Task 3.3: arrow policy derived from the Task 3.3 state values (eight actions)
policy33 = compute_action_value_function(result_task33, actions_diag, diag_move=True)

In [21]:
# Show the policy grid: '@' marks the goal, 'X' the other terminal cells
print("Task 3.3 policy: \n\n", policy33)

Task 3.3 policy: 

 [['→' '←' '←' '↙' '←' '←' '←' '↙' '←']
 ['↑' '↖' '↖' '←' '↖' 'X' '↓' 'X' '↓']
 ['↑' 'X' 'X' 'X' 'X' 'X' '↘' 'X' '↓']
 ['↘' 'X' '↙' '←' '←' '↘' 'X' '→' '@']
 ['→' '↗' '↑' 'X' 'X' 'X' '↗' '↗' '↑']
 ['↗' '↑' '↖' '←' '←' '↗' '↗' 'X' '↖']
 ['↘' 'X' 'X' 'X' 'X' 'X' 'X' '↖' '↙']
 ['→' '←' '←' '←' '←' '←' '←' '←' 'X']
 ['↗' '↑' '↖' '↖' 'X' '↖' '↖' '↖' 'X']]


## Task 3.4

In [None]:
# Task 3.4: state values of the eight-action policy when every action is
# expanded into the weighted outcomes of non_deterministic_move
result_task34 = compute_state_value_function(actions_diag, diag_move=True, deterministic=False)