Osnabrück University - Machine Learning (Summer Term 2016) - Prof. Dr.-Ing. G. Heidemann, Ulf Krumnack

# Exercise Sheet 10

## Introduction

This week's sheet should be solved and handed in before the end of **Sunday, June 26, 2016**. If you need help (and Google and other resources were not enough), feel free to contact your group's designated tutor or whomever of us you run into first. Please upload your results to your group's Stud.IP folder.

## Assignment 1:  [x Points]

## Assignment 2: Ultimate Dinosaur 3000 M4ze Xtrem!  [10 Points]

In this assignment your task will be to use reinforcement learning (Q-learning) and create the greatest dinosaur-maze simulation that the world has ever seen.

In [None]:
import numpy as np
import numpy.random as rand
import scipy.ndimage as ndimage


def generate_field(x, y, num_rewards, max_reward):
    """
    Generate a random game field with rewards.
    
    Args:
        x            x dimension of the field (first axis)
        y            y dimension of the field (second axis)
        num_rewards  the number of rewards that should be randomly placed
        max_reward   the maximum reward that can be placed (inclusive)
        
    Returns:
        A field of shape (x, y) holding exactly num_rewards rewards, each
        between 1 and max_reward, on distinct cells; the rest of the
        entries is zero
        
    Raises:
        ValueError   if num_rewards is negative or larger than the number
                     of cells, or if rewards are requested with
                     max_reward < 1
    """
    num_cells = x * y
    if not 0 <= num_rewards <= num_cells:
        raise ValueError('num_rewards must be between 0 and x * y')
    if num_rewards and max_reward < 1:
        raise ValueError('max_reward must be at least 1')
    
    # uint8 is enough for the usual small rewards; widen only when the
    # requested maximum would otherwise wrap around
    dtype = np.min_scalar_type(max_reward) if max_reward > 255 else np.uint8
    flat = np.zeros(num_cells, dtype=dtype)
    
    if num_rewards:
        # sample cells without replacement so that no reward overwrites
        # another one, and start at 1 so that no placed reward is zero
        cells = rand.choice(num_cells, size=num_rewards, replace=False)
        flat[cells] = rand.randint(1, max_reward + 1, size=num_rewards)
    
    return flat.reshape(x, y)

In [None]:
def softmax(x):
    """
    Softmax algorithm after the formula: e^x/sum(e^x)
    
    The largest entry is subtracted before exponentiating. This leaves the
    result mathematically unchanged, but keeps np.exp from overflowing to
    inf (and the quotient from becoming nan) for large inputs.
    
    Args:
        x  array-like of scores
        
    Returns:
        An array of the same shape as x whose entries sum to one
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalize
        return np.exp(x)
    e_x = np.exp(x - x.max())
    return e_x / e_x.sum()

In [None]:
class QLearning:
    """
    This class contains all the necessary methods to navigate through
    a maze or game with the help of a little bit of Q-Learning.
    
    The agent keeps one q-value per action and field cell. It relies on the
    module-level names ACTIONS (list of action names), FIELD (reward per
    cell) and softmax being defined before it is used.
    """
    
    def __init__(self, learning_rate, map_x, map_y):
        """
        Initializes the QLearning Algorithm with the necessary parameters.
        
        Args:
            learning_rate  the gamma in the lecture slides; update() uses it
                           as the factor on the best q-value of the
                           follow-up state
            map_x          x-dimension of the map (first index)
            map_y          y-dimension of the map (second index)
        
        Returns:
            An instance that can be used for QLearning on the field
        """
        # q stores the q_values for each action in each space of the field
        self.q = np.zeros((len(ACTIONS), map_x, map_y))
        self.gamma = learning_rate
        # start on a random position in the field
        self.pos = [np.random.randint(map_x), np.random.randint(map_y)]
        # remember the map extent for further navigation
        self.map_x = map_x
        self.map_y = map_y
    
    def get_coordinates(self, choice):
        """
        Returns the coordinates that follow a certain choice, depending
        on the current position of the learner. If the border is reached
        the agent just stops there.
        
        Args:
            choice  one of 'up', 'down', 'left', 'right'
            
        Returns:
            The tuple (x, y) of the cell reached by the action; 'up' and
            'down' change x (bounded by map_x), 'left' and 'right' change
            y (bounded by map_y)
            
        Raises:
            ValueError  if choice is not a known action
        """
        # pos is stored as (x, y), the same order that is returned below
        x_new, y_new = self.pos
        
        if choice == 'up':
            x_new = max(x_new - 1, 0)
        elif choice == 'down':
            x_new = min(x_new + 1, self.map_x - 1)
        elif choice == 'left':
            y_new = max(y_new - 1, 0)
        elif choice == 'right':
            y_new = min(y_new + 1, self.map_y - 1)
        else:
            raise ValueError('No such action: {!r}'.format(choice))
            
        return (x_new, y_new)
        
        
    def update(self):
        """
        Implementation of the update step. Closely follows the Algorithm described on
        ML-10 Sl.18
        
        Returns:
            The q-value array (the same object on every call, updated in place)
        """
        x, y = self.pos
        
        # get qvals for the current state of the player
        qvals = self.q[:, x, y]
        # select next action: sample its index with softmax probabilities
        dist = softmax(np.asarray(qvals))
        choice_i = np.random.choice(len(ACTIONS), p=dist)
        choice = ACTIONS[choice_i]
        
        # receive the reward; it is read at the current cell, before moving
        rew = FIELD[x, y]
        
        # observe new state
        new_pos = self.get_coordinates(choice)
        
        # update q-value: reward plus discounted best q-value of the new state
        best_next = self.q[:, new_pos[0], new_pos[1]].max()
        self.q[choice_i, x, y] = rew + self.gamma * best_next
        
        # update current position
        self.pos = new_pos
        
        return self.q

In [None]:
%matplotlib notebook

import matplotlib.pyplot as plt

# Determine the maze size and the number of learning iterations.
m_x = 10
m_y = 10

steps = 100

# Set the global variables. QLearning reads ACTIONS (action names, one
# q-value layer each) and FIELD (reward per cell) from module scope.

ACTIONS = ['up','down','right','left']

# m_x by m_y field; 4 reward placements, max_reward argument 90
FIELD = generate_field(m_x, m_y, 4, 90)

# Plotting the generated field (should be beautiful!)
figure = plt.figure('Field')
plt.axis('off')
plt.imshow(FIELD, interpolation='none')
figure.canvas.draw()

# Generate the player: gamma 0.9, random start position inside the field.
player = QLearning(0.9, m_x, m_y)

# Let the player run through the field; update() returns the player's
# q-value array after every step.
for i in range(steps):     
    player_map = player.update()
    # FIXME: fancy plotting belongs here
    
# Show the q-values returned by the last step.
print(player_map)

In [None]:
import numpy as np

# Reward per cell: a single reward of 1 in the top-right corner.
maze = np.array([[0, 0, 1],
                 [0, 0, 0]])

# One callable per action; each maps a (row, column) coordinate to the
# neighbouring coordinate: column + 1, column - 1, row - 1, row + 1.
actions = [lambda c, d_row=d_row, d_col=d_col: (c[0] + d_row, c[1] + d_col)
           for d_row, d_col in ((0, 1), (0, -1), (-1, 0), (1, 0))]

def move(pos, direction):
    """
    Apply the action with index `direction` to `pos` and return the result.

    Args:
        pos        current coordinate, passed to the selected entry of
                   `actions`
        direction  index into the module-level `actions` list

    Returns:
        The coordinate produced by the action.

    Raises:
        ValueError  if any component of the new coordinate is negative or
                    not smaller than the matching extent of `maze`
    """
    target = actions[direction](pos)
    leaves_maze = any(coord < 0 or coord >= maze.shape[axis]
                      for axis, coord in enumerate(target))
    if leaves_maze:
        raise ValueError('Action impossible.')
    return target

# Factor applied to the best q-value of the follow-up state
gamma = 0.9

# q(s, a) <- 0: one row per maze cell (flat index), one column per action
q = np.zeros((maze.size, len(actions)))

# The walk starts in cell (0, 0); s is its flat index
position = (0, 0)
s = np.ravel_multi_index(position, maze.shape)

for iteration in range(10000):
    # Pick an action index uniformly at random
    a = np.random.randint(len(actions))
    try:
        position = move(position, a)
    except ValueError:
        # The action would leave the maze: keep the state and draw again
        continue
    else:
        # Reward of the cell just entered and its flat state index
        r = maze[position]
        s_n = np.ravel_multi_index(position, maze.shape)

        # q(s, a) <- r + gamma * max_a' q(s_n, a')
        q[s, a] = r + gamma * q[s_n].max()

        # The new state becomes the current one
        s = s_n

print(q)