Osnabrück University - Machine Learning (Summer Term 2016) - Prof. Dr.-Ing. G. Heidemann, Ulf Krumnack

# Exercise Sheet 10

## Introduction

This week's sheet should be solved and handed in before the end of **Sunday, June 26, 2016**. If you need help (and Google and other resources were not enough), feel free to contact your group's designated tutor or whomever of us you run into first. Please upload your results to your group's Stud.IP folder.

## Assignment 1:  [x Points]

## Assignment 2: Ultimate Dinosaur 3000 M4ze Xtrem!  [10 Points]

In this assignment your task will be to use unsupervised learning methods and create the greatest dinosaur-maze simulation that the world has ever seen.

Here is a Q-learning example, based on code from Moritz Meier, that I tried to put into a notebook. Maybe it helps; otherwise just ignore it.

In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from collections import defaultdict
from itertools import chain, count

In [None]:
def softmax(x):
    '''
    Softmax after the formula: e^x / sum(e^x)

    The maximum of x is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing
    (and the result from turning into NaN) for large inputs.

    x: array-like of numbers; the normalisation runs over all elements.
    Returns a float array of the same shape whose entries sum to 1.
    An empty input yields an empty array.
    '''
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    e_x = np.exp(x - x.max())
    return e_x / e_x.sum()

In [None]:
class QLearner:
    '''
    Tabular Q-learning agent with softmax action selection.

    Q-values live in self.q_dict, which maps a state to a dict
    {action: q-value}. A state that has not been seen yet is created on
    first access with every action set to q_init.
    '''

    def __init__(self, actions, discount_factor, learning_rate, q_init):
        '''
        actions:         iterable of action identifiers
        discount_factor: gamma, weight of the next state's value
        learning_rate:   alpha, step size of each update
        q_init:          initial Q-value of every (state, action) pair
        '''
        self.gamma = discount_factor
        self.alpha = learning_rate
        self.actions = actions
        self.q_init = q_init
        self.q_dict = defaultdict(lambda: {a: q_init for a in actions})

    @staticmethod
    def _state_value(action_values, average):
        '''Collapse one state's {action: q} dict to its mean or maximum.'''
        if average:
            return np.mean(list(action_values.values()))
        return max(action_values.values())

    def _fill_qmap(self, qmap, average):
        '''Write the value of every known state (x, y) into qmap[x, y].'''
        for (x, y), action_values in self.q_dict.items():
            qmap[x, y] = self._state_value(action_values, average)
        return qmap

    def update(self, state, action, reward, next_state, mean_val=False):
        '''
        Apply one Q-learning step to the entry for (state, action):

            q <- q + alpha * (reward + gamma * v(next_state) - q)

        v(next_state) is the maximum Q-value of next_state, or the mean
        over its actions if mean_val is True.
        '''
        old_q = self.q_dict[state][action]
        next_value = self._state_value(self.q_dict[next_state], mean_val)
        target = reward + self.gamma * next_value
        self.q_dict[state][action] = old_q + self.alpha * (target - old_q)

    def choose_action(self, state):
        '''
        Sample an action for state; the probabilities are the softmax of
        the state's Q-values.
        '''
        actions, qvals = zip(*self.q_dict[state].items())
        dist = softmax(np.asarray(qvals))
        return np.random.choice(actions, p=dist)

    def q_map(self, average=False):
        '''
        Return a 2-D array holding the value of every known state (x, y).

        The array has shape (max x + 2, max y + 2); cells of unknown
        states are 0. The value of a state is the maximum (or, with
        average=True, the mean) of its Q-values. If no state is known
        yet, a 1x1 zero array is returned.
        '''
        if not self.q_dict:
            return np.zeros([1, 1])
        X, Y = zip(*self.q_dict)
        return self._fill_qmap(np.zeros([max(X) + 2, max(Y) + 2]), average)

    def q_map_for_shape(self, shape, average=False):
        '''
        Like q_map, but the result has the given shape. Every known
        state must be a valid index into an array of that shape.
        '''
        return self._fill_qmap(np.zeros(shape), average)

In [None]:
# Global variables for the labyrinth

# Numeric codes of the field types stored in the maze array
EMPTY = 0
EXIT = 1
PLAYER = 2
SPIKES = 5
WALL = 10

# Reward handed out by MazeGame.do_action for the field a move targets
rewards = {
    EMPTY: -1,
    EXIT: 100,
    WALL: -2,
    SPIKES: -100,
}

# Fields the player can move onto
passable = [EMPTY, SPIKES, EXIT]
# Fields that are replaced after the player has targeted them
removable = []
# Fields that end the current game (MazeGame.exit is set)
restarts = [SPIKES, EXIT]

# Character used in a textual maze -> numeric field code
code = {
    '|': WALL,
    '-': WALL,
    '+': WALL,
    ' ': EMPTY,
    'x': SPIKES,
    'X': SPIKES,
    'E': EXIT,
    'P': PLAYER,
}

In [None]:
class MazeGame:
    '''
    Maze environment on a 2-D array of field codes.

    The player position is (x, y), where x indexes axis 0 of the maze
    array and y indexes axis 1. 'up'/'down' change x, 'left'/'right'
    change y.
    '''

    def __init__(self, maze, start_pos = None):
        '''
        maze:      2-D array of field codes; it is copied on every restart
        start_pos: (x, y) start position; if it is falsy, a random EMPTY
                   field is picked on every restart
        '''
        self.start_pos = start_pos
        self.start_maze = maze
        self.actions = ['up','down','right','left']
        self.restart()
        self.exit = False

    @property
    def state(self):
        '''Current player position as an (x, y) tuple.'''
        return (self.x, self.y)

    def restart(self):
        '''Reset the maze to its initial layout and place the player.'''
        self.maze = self.start_maze.copy()
        if self.start_pos:
            self.x = self.start_pos[0]
            self.y = self.start_pos[1]
        else:
            # Draw positions away from the outermost rows and columns
            # until an EMPTY field is hit.
            while True:
                self.x = np.random.randint(1, self.maze.shape[0] - 1)
                self.y = np.random.randint(1, self.maze.shape[1] - 1)
                if self.maze[self.x, self.y] == EMPTY:
                    break
        self.exit = False

    def do_action(self, name):
        '''
        Try to move the player one field in the direction given by name
        and return the reward of the targeted field.

        The player only moves if the targeted field is passable. Targeting
        a field listed in restarts sets self.exit. A targeted removable
        field becomes EMPTY if it is passable and WALL otherwise.

        Raises ActionError if name is not a known action.
        '''
        moves = {
            'up': (-1, 0),
            'down': (1, 0),
            'left': (0, -1),
            'right': (0, 1),
        }
        try:
            d_x, d_y = moves[name]
        except (KeyError, TypeError):
            raise ActionError('No such action:', name) from None

        target = (self.x + d_x, self.y + d_y)
        new_field = self.maze[target]
        reward = rewards[new_field]
        can_enter = new_field in passable

        if new_field in restarts:
            self.exit = True

        if can_enter:
            self.x, self.y = target

        if new_field in removable:
            self.maze[target] = EMPTY if can_enter else WALL
        return reward

    
class ActionError(Exception):
    '''Raised by MazeGame.do_action when the action name is unknown.'''

In [None]:
def create_maze(str_maze):
    '''
    Turn a textual maze into a numeric maze array.

    str_maze: lines separated by newlines; every character must be a key
              of the global code table, and all lines need the same length.

    The character that maps to PLAYER marks the start position and is
    stored as EMPTY. The parsed maze is surrounded by a one-field border
    of WALL.

    Returns (maze, start_pos): the framed float array and the start
    position as (x, y) inside it. Without a player character the start
    position is the top-left field of the parsed maze.
    '''
    start_pos = (0, 0)
    rows = []
    for x, line in enumerate(str_maze.split('\n')):
        cells = [code[c] for c in line]
        for y, cell in enumerate(cells):
            if cell == PLAYER:
                start_pos = (x, y)
                cells[y] = EMPTY
        rows.append(cells)

    inner = np.asarray(rows)
    xlen = inner.shape[0]
    ylen = inner.shape[1]

    # Start from an all-WALL array and overwrite its interior with the maze.
    frame = np.ones((xlen + 2, ylen + 2)) * WALL
    frame[1:-1, 1:-1] = inner

    # The border shifts every coordinate by one.
    return frame, (start_pos[0] + 1, start_pos[1] + 1)

In [None]:
def plot_maze(maze, ax):
    '''
    Draw the maze array as an image with a grid between the fields.

    maze: 2-D array of field codes
    ax:   matplotlib axes to draw on (made the current axes)

    Returns the image object created by imshow.
    '''
    plt.sca(ax)
    # imshow puts axis 1 (columns) on the x-axis and axis 0 (rows) on the
    # y-axis, so the grid lines between fields follow that order.
    ax.set_xticks(np.arange(1, maze.shape[1]) - 0.5)
    ax.set_yticks(np.arange(1, maze.shape[0]) - 0.5)
    # Booleans instead of 'off': the string is truthy in current
    # matplotlib and would leave ticks and labels switched on.
    plt.tick_params(
        labelbottom=False,
        labelleft=False,
        bottom=False,
        top=False,
        left=False,
        right=False,
    )
    plt.tight_layout()
    # 'spectral' was renamed to 'nipy_spectral'; fall back to the old name
    # on matplotlib versions that only know that one.
    cmap = getattr(plt.cm, 'nipy_spectral_r', None)
    if cmap is None:
        cmap = plt.cm.spectral_r
    # Other colormaps that were tried: CMRmap_r, gnuplot2_r
    im = plt.imshow(maze, interpolation='nearest', cmap=cmap, animated=True)
    plt.grid()
    return im

In [None]:
def plot_fitmap(fmap, ax):
    '''
    Draw a map of values (e.g. a Q-value map) as an image with a grid
    between the fields.

    fmap: 2-D array of values
    ax:   matplotlib axes to draw on (made the current axes)

    Returns the image object created by imshow.
    '''
    plt.sca(ax)
    # imshow puts axis 1 (columns) on the x-axis and axis 0 (rows) on the
    # y-axis, so the grid lines between fields follow that order.
    ax.set_xticks(np.arange(1, fmap.shape[1]) - 0.5)
    ax.set_yticks(np.arange(1, fmap.shape[0]) - 0.5)
    # Booleans instead of 'off': the string is truthy in current
    # matplotlib and would leave ticks and labels switched on.
    plt.tick_params(
        labelbottom=False,
        labelleft=False,
        bottom=False,
        top=False,
        left=False,
        right=False,
    )
    plt.tight_layout()
    # 'spectral' was renamed to 'nipy_spectral'; fall back to the old name
    # on matplotlib versions that only know that one.
    cmap = getattr(plt.cm, 'nipy_spectral_r', None)
    if cmap is None:
        cmap = plt.cm.spectral_r
    im = plt.imshow(fmap, interpolation='nearest', animated=True, cmap=cmap)
    plt.grid()
    return im

In [None]:
# Textual maze layout, one string per row. The characters are translated
# by create_maze via the global code table; 'P' marks the start position.
maze_str = "\n".join((
    "X                  ",
    "X                  ",
    "X                  ",
    "X        ----------",
    "X                  ",
    "   P -+            ",
    "      |     |      ",
    "      |     |      ",
    "  |   |     |      ",
    "  |   |     |      ",
    "  |   |     |      ",
    "  |   |     |      ",
    "  |   |     |      ",
    " X|         |    EE",
))
start_maze, start_pos = create_maze(maze_str)

In [None]:
import time


# You may interrupt this cell, change parameters (show_*) and then restart the cell again.
# If this flag is set to True, the learner will continue where it was stopped.
continue_old_game = False

# show_maze: draw the maze together with the player's current position
show_maze = True
# show_qmap: draw the learner's current Q-value map
show_qmap = True
# only show output when a game was finished (goal was reached)
# This will speed up the process ...
show_only_after_game = True

if ('game' not in globals()) or not continue_old_game:
    game = MazeGame(start_maze, start_pos)
    #game = MazeGame(start_maze)
    player = QLearner(
            discount_factor=0.9,
            learning_rate=0.9,
            q_init=10,
            actions = game.actions)

fig = plt.figure()
if show_maze: ax_maze = fig.add_subplot(1,2,1)
if show_qmap: ax_qmap = fig.add_subplot(1,2,2)

# Runs until the cell is interrupted.
for _ in count():
    # Decide before stepping, so a finished game is drawn in the same
    # iteration in which it is restarted.
    show = (not show_only_after_game) or game.exit

    if game.exit:
        game.restart()
    else:
        old_state = game.state
        action = player.choose_action(old_state)
        reward = game.do_action(action)
        new_state = game.state

        player.update(
            state=old_state,
            next_state=new_state,
            reward=reward,
            action=action,
            mean_val=False)

    if show:
        if show_maze:
            maze = game.maze.copy()
            maze[game.x, game.y] = PLAYER
            plot_maze(maze, ax_maze)

        if show_qmap:
            # Take the shape from the game itself: the local maze copy
            # above only exists when show_maze is enabled.
            plot_fitmap(player.q_map_for_shape(game.maze.shape), ax_qmap)

        if show_maze or show_qmap:
            fig.canvas.draw()
            time.sleep(.01)


In [None]:
import numpy as np
import numpy.random as rand
import scipy.ndimage as ndimage


def generate_field(x, y, num_rewards, max_reward):
    '''
    Generate a field of zeros with randomly placed rewards.

    x, y:        shape of the field
    num_rewards: number of reward cells to place; capped at the number of
                 cells in the field
    max_reward:  largest possible reward; every reward is drawn uniformly
                 from 1..max_reward (both inclusive)

    The rewards are put on distinct cells, so the field contains exactly
    min(num_rewards, x*y) non-zero entries.

    Returns an array of shape (x, y). Its dtype is uint8 as long as
    max_reward fits into it and int64 otherwise, so large rewards do not
    wrap around.

    Raises ValueError if rewards are to be placed and max_reward < 1.
    '''
    dtype = np.uint8 if max_reward <= 255 else np.int64
    field = np.zeros((x, y), dtype=dtype)

    n_cells = min(num_rewards, field.size)
    if n_cells <= 0:
        return field

    # Sampling flat indices without replacement rules out two rewards
    # landing on the same cell.
    cells = rand.choice(field.size, size=n_cells, replace=False)
    field.flat[cells] = rand.randint(1, max_reward + 1, size=n_cells)

    return field

In [None]:
%matplotlib notebook

import matplotlib.pyplot as plt

# Visual check of generate_field: draw one random field
m_x, m_y = 35, 30
m = generate_field(m_x, m_y, num_rewards=4, max_reward=90)

figure = plt.figure('Field')
plt.axis('off')
plt.imshow(m, interpolation='none')

figure.canvas.draw()