## General

In [1]:
# General imports
import sys
import operator
import numpy as np

# Show every array element and format floats with a single decimal place
np.set_printoptions(
    threshold=sys.maxsize,
    formatter={'float': lambda value: "{0:0.1f}".format(value)},
)

# Global variables -- grid cells are addressed as (row, column)
START = (2, 0)
GOAL = (2, 6)
OBSTACLES = [
    (0, 3), (0, 4), (0, 5), (0, 6),
    (1, 6),
    (2, 2), (2, 4),
    (3, 1), (3, 2), (3, 5),
    (4, 3), (4, 4), (4, 5),
]

# Populate world: reward grid with -1 on ordinary cells, 100 at the goal
# and -25 on every obstacle
WORLD = np.full((5, 7), -1.0)
WORLD[GOAL] = 100

# Split the obstacle cells into row and column sequences so that a single
# indexed assignment marks all of them
obstacle_rows, obstacle_cols = zip(*OBSTACLES)
WORLD[obstacle_rows, obstacle_cols] = -25

print(WORLD)

[[-1.0 -1.0 -1.0 -25.0 -25.0 -25.0 -25.0]
 [-1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -25.0]
 [-1.0 -1.0 -25.0 -1.0 -25.0 -1.0 100.0]
 [-1.0 -25.0 -25.0 -1.0 -1.0 -25.0 -1.0]
 [-1.0 -1.0 -1.0 -25.0 -25.0 -25.0 -1.0]]


## Task 5.1

In [20]:
# Step size that scales the TD error in the value update of TD_eval
ALPHA = 0.9
# Discount factor applied to the next state's value in the value update of TD_eval
GAMMA = 0.8

# Move
def move(state, action):
    """Return a copy of ``state`` shifted one cell in the direction ``action``.

    ``state`` is a dict with integer ``"x"`` (column) and ``"y"`` (row)
    entries; it is never modified. ``action`` is one of ``"r"``, ``"dr"``,
    ``"d"``, ``"dl"``, ``"l"``, ``"ul"``, ``"u"``, ``"ur"``. Any other value
    yields an unchanged copy. No bounds checking is done here.
    """
    # Per-action list of (coordinate, delta) adjustments; "down" increases y
    # and "right" increases x
    shifts = {
        "r": (("x", 1),),
        "dr": (("x", 1), ("y", 1)),
        "d": (("y", 1),),
        "dl": (("x", -1), ("y", 1)),
        "l": (("x", -1),),
        "ul": (("x", -1), ("y", -1)),
        "u": (("y", -1),),
        "ur": (("x", 1), ("y", -1)),
    }

    moved = state.copy()
    for axis, delta in shifts.get(action, ()):
        moved[axis] += delta
    return moved

# Check whether the state is out of bounds
# and return the truncated state
def truncate(state, shape=None):
    """Return a copy of ``state`` clamped onto the grid.

    Parameters
    ----------
    state : dict
        Position with integer ``'y'`` (row) and ``'x'`` (column) entries.
        The input dict is not modified.
    shape : tuple of int, optional
        ``(rows, columns)`` of the grid. Defaults to ``WORLD.shape``.

    Returns
    -------
    dict
        Copy of ``state`` with ``'y'`` limited to ``[0, rows - 1]`` and
        ``'x'`` limited to ``[0, columns - 1]``.

    Notes
    -----
    Coordinates are clamped rather than nudged by a single cell, so a state
    that is several cells outside the grid still ends up in bounds. This
    matters because a leftover negative index would silently wrap around
    when used to index a NumPy array.
    """
    n_rows, n_cols = WORLD.shape if shape is None else shape

    new_state = state.copy()
    new_state['y'] = min(max(new_state['y'], 0), n_rows - 1)
    new_state['x'] = min(max(new_state['x'], 0), n_cols - 1)
    return new_state

# Check whether the state is terminal
def terminal(state):
    """Report whether ``state`` ends an episode.

    A state is terminal when its ``(y, x)`` cell equals ``GOAL`` or is listed
    in ``OBSTACLES``.
    """
    cell = (state['y'], state['x'])
    return cell == GOAL or cell in OBSTACLES

# Step function
def step(state, action):
    """Apply ``action`` to ``state`` and report the outcome.

    The move is executed, the resulting position is passed through
    ``truncate`` to keep it on the grid, and the reward is read from
    ``WORLD`` at that position.

    Returns a tuple ``(new_state, reward, done)`` where ``done`` is the
    result of ``terminal`` for the new state.
    """
    # Move, then bring the position back onto the grid
    landed = truncate(move(state, action))

    # Episode ends on the goal or on an obstacle
    finished = terminal(landed)

    # Reward is the grid value of the cell the agent ended up in
    payoff = WORLD[landed['y'], landed['x']]

    return (landed, payoff, finished)


# Outcome model for non-deterministic actions:
# intended action -> [(executed action, probability), ...]
ACTIONS = {
    'ur': [('ur', 0.8), ('u', 0.1), ('r', 0.1)],
    'r': [('r', 0.8), ('ur', 0.1), ('dr', 0.1)],
    'dr': [('dr', 0.8), ('r', 0.1), ('d', 0.1)]
}

def generate_action():
    """Sample the action that is actually executed in one time step.

    Two independent uniform draws are used:

    1. The policy picks the intended action: ``'r'`` with probability 0.5,
       ``'ur'`` and ``'dr'`` with probability 0.25 each.
    2. The intended action is replaced by one of its possible outcomes
       according to the probabilities listed in ``ACTIONS``.

    Returns
    -------
    str
        The executed action.

    Notes
    -----
    The outcome is selected by comparing the draw against the running sum of
    the outcome probabilities. Comparing the draw against each probability on
    its own would never select the last outcome, because a draw that has
    already missed the first 0.8 is always larger than 0.1.
    """
    # Apply policy
    draw = np.random.uniform(0.0, 1.0)
    if draw < 0.5:
        intended = 'ur' if draw > 0.25 else 'dr'
    else:
        intended = 'r'

    # Apply non-det. action: walk the cumulative distribution of outcomes
    outcomes = ACTIONS[intended]
    draw = np.random.uniform(0.0, 1.0)
    cumulative = 0.0
    for executed, probability in outcomes:
        cumulative += probability
        if draw < cumulative:
            return executed

    # Only reachable if rounding leaves the running sum marginally below 1.0
    return outcomes[-1][0]


def TD_eval(loops=1000):
    """Estimate state values with one-step temporal-difference updates.

    Runs ``loops`` episodes. Each episode starts at ``START``, draws actions
    from ``generate_action`` and advances with ``step`` until a terminal
    state is reached. After every transition the value of the state that was
    left is moved towards ``reward + GAMMA * V[next_state]`` by a fraction
    ``ALPHA`` of the difference.

    Returns an array shaped like ``WORLD`` holding the value estimates; cells
    that were never left during any episode keep their initial value of 0.
    """
    values = np.zeros_like(WORLD)

    for _ in range(loops):
        state = {'y': START[0], 'x': START[1]}
        finished = False

        # Episode loop
        while not finished:
            next_state, reward, finished = step(state, generate_action())

            here = (state['y'], state['x'])
            there = (next_state['y'], next_state['x'])

            # Gap between the bootstrapped target and the current estimate
            td_error = reward + GAMMA * values[there] - values[here]
            values[here] = values[here] + ALPHA * td_error

            state = next_state

    return values

In [27]:
# Run TD evaluation with the default number of episodes and display the value table
V = TD_eval()
V

array([[-17.6, -15.2, -20.8, 0.0, 0.0, 0.0, 0.0],
       [-17.5, -20.4, -7.9, 43.5, -5.1, -13.6, 0.0],
       [-20.9, -21.3, 0.0, -23.1, 0.0, -24.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, -24.1, 0.0, 99.9],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.5]])