## General

In [5]:
# General imports
import sys
import operator
import numpy as np

np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, threshold=sys.maxsize)

# Global variables
EPS = 0.1
ALPHA = 0.9
GAMMA = 0.8
GOAL = (2,6)
START = (2,0)
WORLD = -1. * np.ones((5,7))
OBSTACLES = [(0,3),(0,4),(0,5),(0,6),(1,6),(2,2),(2,4),(3,1),(3,2),(3,5),(4,3),(4,4),(4,5)]
ACTIONS = {
    'ur': [('ur', 0.8), ('u', 0.1), ('r', 0.1)],
    'r': [('r', 0.8), ('ur', 0.1), ('dr', 0.1)], 
    'dr': [('dr', 0.8), ('r', 0.1), ('d', 0.1)]
}

# Populate world
WORLD[GOAL] = 100

for obstacle in OBSTACLES:
    WORLD[obstacle] = -25

print(WORLD)

[[-1.0 -1.0 -1.0 -25.0 -25.0 -25.0 -25.0]
 [-1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -25.0]
 [-1.0 -1.0 -25.0 -1.0 -25.0 -1.0 100.0]
 [-1.0 -25.0 -25.0 -1.0 -1.0 -25.0 -1.0]
 [-1.0 -1.0 -1.0 -25.0 -25.0 -25.0 -1.0]]


## Task 5.1

In [6]:
# Move
def move(state, action):
    new_state = state.copy()
    
    # Right
    if action == "r":
        new_state["x"] += 1
    # Lower right
    elif action == "dr":
        new_state["x"] += 1
        new_state["y"] += 1
    # Down
    elif action == "d":
        new_state["y"] += 1
    # Lower left
    elif action == "dl":
        new_state["x"] -= 1
        new_state["y"] += 1
    # Left
    elif action == "l":
        new_state["x"] -= 1
    # Upper left
    elif action == "ul":
        new_state["x"] -= 1
        new_state["y"] -= 1
    # Up
    elif action == "u":
        new_state["y"] -= 1
    # Upper right
    if action == "ur":
        new_state["x"] += 1
        new_state["y"] -= 1

    return new_state

# Check if state out of bound
# and returns truncated state
def truncate(state):
    # Truncated state
    new_state = state.copy()
    
    # Check if out of bounds
    if new_state['y'] < 0:
        new_state['y'] += 1
        
    if new_state['y'] >= WORLD.shape[0]:
        new_state['y'] -= 1
            
    if new_state['x'] < 0:
        new_state['x'] += 1
        
    if new_state['x'] >= WORLD.shape[1]:
        new_state['x'] -= 1
    
    return new_state

# Check if state terminal
def terminal(state):
    return True if (state['y'], state['x']) == GOAL or \
                   (state['y'], state['x']) in OBSTACLES \
                else False

# Step function
def step(state, action):
    # Returns
    done = None
    reward = None
    
    # Move based on action
    new_state = move(state, action)
    
    # Check that state is valid
    new_state = truncate(new_state)
    
    # Check if state is terminal
    done = terminal(new_state)
    
    # Get reward for new state
    reward = WORLD[new_state['y'], new_state['x']]
    
    return (new_state, reward, done)

def generate_action():
    action = None
    prob = np.random.uniform(0.0, 1.0)
    
    # Apply policy
    if prob < 0.5:
        action = 'ur' if prob > 0.25 else 'dr'
    else:
        action = 'r'
        
    # Apply non-det. action
    prob = np.random.uniform(0.0, 1.0)
    if prob < ACTIONS[action][0][1]:
        action = ACTIONS[action][0][0]
    else:
        action = (ACTIONS[action][1][0] 
                  if prob > ACTIONS[action][1][1] 
                  else ACTIONS[action][2][0])
    return action


def TD_eval(loops=1000):
    V = np.zeros_like(WORLD)

    for count in range(loops): 
        s = {'y': START[0], 'x': START[1]}

        # Episode loop
        while True:
            action = generate_action()
            n_s, reward, done = step(s, action)
                        
            V[s['y'], s['x']] = (V[s['y'], s['x']] + ALPHA * 
                                (reward + GAMMA*V[n_s['y'], n_s['x']]-V[s['y'], s['x']]))
            s = n_s
            
            if done:
                break
    return V

In [7]:
V = TD_eval()
V

array([[-5.8, -20.8, -25.0, 0.0, 0.0, 0.0, 0.0],
       [-15.2, -21.0, -24.3, -24.8, 68.2, -13.7, 0.0],
       [-18.7, -24.8, 0.0, -19.3, 0.0, -12.8, 0.0],
       [0.0, 0.0, 0.0, 0.0, -17.8, 0.0, 96.4],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

# Task 5.2

In [83]:
def which_arrow(action):
    if action == 'r':
        return u'\u2192'
    elif action == 'l':
        return u'\u2190'
    elif action == 'u':
        return u'\u2191'
    elif action == 'd':
        return u'\u2193'
    elif action == 'ul':
        return u'\u2196'
    elif action == 'ur':
        return u'\u2197'
    elif action == 'dr':
        return u'\u2198'
    else:
        return u'\u2199'

def eps_greedy(Q, s):
    # Action from policy
    action = None
    
    # Encode state
    key_s = ','.join(str(x) for x in s.values())
        
    if key_s in Q.keys() and Q[key_s]:
        if np.random.uniform(0.0, 1.0) > EPS: 
            action = max(Q[key_s].items(), key=operator.itemgetter(1))[0]
        else:
            action = generate_action()
    else:
        action = generate_action()
    
    # Create entry if not existent
    if key_s not in Q.keys():
        Q[key_s] = {}
    if action not in Q[key_s].keys():
        Q[key_s][action] = 0
    
    return action

def max_action(Q, s, value=True):
    # Action from policy
    action = None
    
    # Encode state
    key_s = ','.join(str(x) for x in s.values())
        
    if key_s in Q.keys() and Q[key_s]:
        # Get max action
        action = max(Q[key_s].items(), key=operator.itemgetter(1))[0]
        
        if value:
            return Q[key_s][action]
        else:
            return action
    else:
        return 0

def show_policy(Q):
    # Policy
    P = WORLD.copy().astype(object)
    
    # Loop over all states
    for key in Q.keys():
        # Get state representations
        state_list = key.split(",")
        state = (int(state_list[0]), int(state_list[1]))
        state_dict = {'y': int(state_list[0]), 'x': int(state_list[1])}
        
        # Get max action for state
        action = max_action(Q, state_dict, value=False)
        
        # Insert policy action
        P[state] = which_arrow(action)
    
    return P

# Q-Learning
def q_learning(loops=10000):
    # Action value function
    Q = {}
    
    for count in range(loops):
        # Initial state
        s = {'y': START[0], 'x': START[1]}

        # Episode loop
        while True:
            action = None
            
            # EPS greedy
            action = eps_greedy(Q, s)

            # Get next state, reward and if terminal    
            n_s, r, done = step(s, action)
            
            # Encode state
            key_s = ','.join(str(x) for x in s.values())
            
            # Update Q
            Q[key_s][action] = (Q[key_s][action] +
                              ALPHA * (r + GAMMA * max_action(Q, n_s) - Q[key_s][action]))
            
            # Update state
            s = n_s
            
            if done:
                break
    return Q

In [84]:
Q = q_learning()

In [85]:
print("Q-Learning policy: \n\n", show_policy(Q))

Q-Learning policy: 

 [[-1.0 '↘' '↘' -25.0 -25.0 -25.0 -25.0]
 ['↗' '→' '↘' '→' '↘' '↘' -25.0]
 ['↗' '↗' -25.0 '↘' -25.0 '→' 100.0]
 [-1.0 -25.0 -25.0 -1.0 '↗' -25.0 '↗']
 [-1.0 -1.0 -1.0 -25.0 -25.0 -25.0 '↗']]
