In [158]:
import sys
import numpy as np
# Print arrays in full instead of NumPy's summarised "..." form.
np.set_printoptions(threshold=sys.maxsize)

# Reference state values for task 1; compared against the compute_V() result further down.
solution_task1 = np.load('data/stateValuesTask1.npy')
# Grid world array: cell values are returned as rewards by get_reward(),
# and a value of -20 is treated as a terminal cell by terminal_state().
grid_world = np.load('data/gridworld.npy')

## Exercise 3.1

In [159]:
# Goal cell of the grid world, addressed by column ('x') and row ('y').
GOAL_S = dict(x=8, y=3)

def is_outside(state):
    """Tell whether ``state`` lies beyond the edges of ``grid_world``.

    Args:
        state: mapping with a ``'y'`` (row) and an ``'x'`` (column) entry.

    Returns:
        True when the row or the column index falls outside the grid,
        False otherwise.
    """
    # Rows are checked first; columns are only inspected for a valid row.
    row, n_rows = state['y'], grid_world.shape[0]
    if row < 0 or row >= n_rows:
        return True

    col, n_cols = state['x'], grid_world.shape[1]
    if col < 0 or col >= n_cols:
        return True

    return False

def terminal_state(state, goal_inc=True):
    """Decide whether ``state`` ends a move.

    Args:
        state: mapping with ``'x'`` (column) and ``'y'`` (row) entries.
        goal_inc: when True, the goal cell ``GOAL_S`` also counts as terminal.

    Returns:
        True if the state is the goal (and ``goal_inc`` is set), lies outside
        the grid, or is a cell whose ``grid_world`` value is -20 (a cell X);
        False otherwise.
    """
    # Reached the goal cell
    if goal_inc and state == GOAL_S:
        return True

    # Exceeded the grid boundaries (must be tested before indexing the grid)
    if is_outside(state):
        return True

    # Inside the grid: terminal only when it is a cell X
    return bool(grid_world[state['y'], state['x']] == -20)

def get_reward(state):
    """Return the reward for entering ``state``.

    Args:
        state: mapping with ``'x'`` (column) and ``'y'`` (row) entries.

    Returns:
        -30 when the state is outside the grid, otherwise the value stored in
        ``grid_world`` at that cell.
    """
    return -30 if is_outside(state) else grid_world[state['y'], state['x']]

def make_move(state, action, correction=True):
    """Apply ``action`` to ``state`` in place and return the move's reward.

    Args:
        state: mapping with ``'x'`` and ``'y'`` entries; updated in place when
            the move is accepted.
        action: one of ``'l'``, ``'r'``, ``'u'``, ``'d'``. Any other value
            leaves the position unchanged.
        correction: when True, a move whose target is terminal (goal excluded)
            is rejected and ``state`` keeps its position; when False the
            target position is always written back to ``state``.

    Returns:
        The reward of the attempted target cell, even when the move itself
        was rejected.
    """
    # Displacement (dx, dy) per action code; 'u' decreases the row index.
    dx, dy = {
        'l': (-1, 0),
        'r': (1, 0),
        'u': (0, -1),
        'd': (0, 1),
    }.get(action, (0, 0))

    target = state.copy()
    target['x'] += dx
    target['y'] += dy

    rejected = terminal_state(target, goal_inc=False) and correction
    if not rejected:
        state['x'] = target['x']
        state['y'] = target['y']

    return get_reward(target)


In [160]:
# Policy evaluation stops once the largest value change in a sweep is at or below this.
THRESHOLD = 0.007

# One entry per move: (action code, selection probability, arrow used for display).
actions = [
    ('r', 0.625, '\u2192'),
    ('l', 0.125, '\u2190'),
    ('u', 0.125, '\u2191'),
    ('d', 0.125, '\u2193'),
]

def compute_V(disc=0.9, threshold=None):
    """Evaluate the stochastic policy described by ``actions`` on ``grid_world``.

    The value table is refined sweep by sweep. Within a sweep every
    non-terminal cell is replaced by the probability-weighted sum of
    ``reward + disc * value(successor)`` over all actions, where successor
    values are read from a snapshot taken at the start of the sweep.

    Args:
        disc: discount factor applied to successor values.
        threshold: sweeps stop once the largest absolute change of any cell
            in a sweep is at or below this value. ``None`` (the default) uses
            the module-level ``THRESHOLD``.

    Returns:
        A float array with the same shape as ``grid_world``; terminal cells
        (goal, cells X) are never updated and stay at 0.
    """
    if threshold is None:
        threshold = THRESHOLD

    # Force a floating-point table: zeros_like would otherwise inherit the
    # dtype of grid_world, and an integer grid would silently truncate every
    # update written into the table.
    V = np.zeros_like(grid_world, dtype=float)
    n_rows, n_cols = grid_world.shape[:2]

    delta = np.inf
    while delta > threshold:
        # Largest change seen in this sweep
        delta = 0.0

        # Snapshot of the table; all successor values of the sweep come from it
        V_copy = V.copy()

        for y in range(n_rows):
            for x in range(n_cols):
                state = {'x': x, 'y': y}

                # Terminal cells keep their value
                if terminal_state(state):
                    continue

                # Expected one-step return under the action probabilities
                value = 0.0
                for action, probability, _ in actions:
                    state_a = state.copy()
                    # make_move keeps state_a in place when the move is blocked
                    reward = make_move(state_a, action)
                    value += probability * (reward + disc * V_copy[state_a['y'], state_a['x']])

                V[y, x] = value
                delta = max(delta, abs(V_copy[y, x] - value))
    return V

In [161]:
# Run the policy evaluation and measure its total absolute deviation
# from the reference values.
result_task1 = compute_V()
abs_error = abs(solution_task1 - result_task1)
diff = sum(abs_error.flatten())

report = "Accumulative difference from the solution results: {}\n\n{}"
print(report.format(diff, result_task1))

Accumulative difference from the solution results: 0.34249411007374775

[[ -55.72233031  -60.28711964  -76.2187774   -85.94227143  -95.17265171
  -102.06584518 -109.1497957  -128.03694587 -154.35820998]
 [ -70.23910801  -71.03512149  -82.42332396  -97.04567539 -113.44958832
     0.         -138.84246494    0.         -140.12747124]
 [-114.35530129    0.            0.            0.            0.
     0.         -156.42672419    0.          -71.57989163]
 [-113.49035193    0.          -67.70838784  -82.25522086 -104.2432133
  -131.62855066    0.           67.97547745    0.        ]
 [ -66.87419882  -61.7022072   -77.43326267    0.            0.
     0.          -51.56820765  -41.72566933  -63.11667048]
 [ -64.48159917  -63.86372895  -69.52992994  -77.89104824  -83.19067425
   -88.81991069  -95.51570805    0.         -145.95801304]
 [ -81.40779974    0.            0.            0.            0.
     0.            0.         -146.67829987 -167.44391259]
 [ -24.06403261  -19.82047396  -28.4

## Task 3.2

In [162]:
# Work on a float copy so the evaluated table stays untouched and can hold inf.
V = result_task1.astype(float)
# Every cell is filled below with '@' (goal), 'X' (terminal) or a string of arrows.
policy = np.full(V.shape, '', dtype=object)

# Make the goal preferable to every other neighbouring cell.
V[GOAL_S['y'], GOAL_S['x']] = np.inf

# Loop over every state
for y in range(grid_world.shape[0]):
    for x in range(grid_world.shape[1]):
        state = {'x': x, 'y': y}

        # Goal state
        if state == GOAL_S:
            policy[y, x] = '@'
            continue
        # Any other terminal state
        if terminal_state(state, goal_inc=True):
            policy[y, x] = 'X'
            continue

        # Value of every neighbour that can actually be entered. The move is
        # applied without correction so state_a is always the attempted cell;
        # with correction a blocked move would leave state_a on the current
        # cell and its own value would be compared instead of a neighbour's.
        candidates = []
        for action, _, arrow in actions:
            state_a = state.copy()
            make_move(state_a, action, correction=False)

            if terminal_state(state_a, goal_inc=False):
                continue

            candidates.append((V[state_a['y'], state_a['x']], arrow))

        # Highest neighbour value (-inf when no neighbour can be entered)
        best_r_action = max((value for value, _ in candidates), default=-np.inf)

        # Several neighbours may share the best value: keep all their arrows,
        # in the order the actions are listed.
        state_policy = ''.join(arrow for value, arrow in candidates if value == best_r_action)

        # Insert the arrow(s) in the policy
        policy[y, x] = state_policy

In [163]:
# Display the policy grid: arrows for moves, 'X' for terminal cells, '@' for the goal.
policy

array([['→', '←', '←', '←', '←', '←', '←', '←', '←'],
       ['↑', '↑', '←', '←', '↑', 'X', '↑', 'X', '↓'],
       ['↑', 'X', 'X', 'X', 'X', 'X', '↑', 'X', '↓'],
       ['↓', 'X', '↓', '←', '←', '←', 'X', '→', '@'],
       ['→', '↓', '←', 'X', 'X', 'X', '→', '↑', '↑'],
       ['→', '↑', '←', '←', '←', '←', '↑', 'X', '↑'],
       ['↓', 'X', 'X', 'X', 'X', 'X', 'X', '↓', '↑'],
       ['→', '←', '←', '←', '←', '←', '←', '←', 'X'],
       ['↑', '↑', '↑', '↑', 'X', '↑', '←', '←', 'X']], dtype=object)