In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Integer codes for the four moves; they double as indices into the last
# axis of q_values.
UP, DOWN, RIGHT, LEFT = range(4)
ACTION = [UP, DOWN, RIGHT, LEFT]

# Grid size: `high` rows by `length` columns.
high, length = 4, 12

# Start and goal are the two ends of the bottom row, as [row, column].
START = [high - 1, 0]
GOAL = [high - 1, length - 1]

# Action-value table indexed as q_values[row][column][action], all zeros.
q_values = np.zeros((high, length, len(ACTION)))

In [3]:
def policy(state, epsilon):
    """
    Epsilon-greedy action selection on the global q_values table.
    :param state: current state; state[0] is the row, state[1] the column
    :param epsilon: probability of picking an action uniformly at random
    :return: the chosen action (an entry of ACTION, or the argmax index)
    """
    explore = random.random() < epsilon
    if explore:
        # Exploration: any action, uniformly at random.
        return np.random.choice(ACTION)

    # Exploitation: index of the largest action value for this cell
    # (np.argmax returns the first index on ties).
    row, col = state[0], state[1]
    return np.argmax(q_values[row][col])

In [4]:
def reset_game():
    """
    Return coordinates of start position.
    :return: a fresh [row, column] list equal to START
    """
    # Hand out a copy so a caller that mutates its state in place cannot
    # corrupt the module-level START constant.
    return list(START)

In [5]:
def step(state, action):
    """
    Apply one action to the given state and report the outcome.
    :param state: current state as [row, column]
    :param action: one of UP, DOWN, RIGHT, LEFT
    :return: next_state, reward, done
    :raises ValueError: if action is not one of the four known actions
    """
    i, j = state

    # Possible actions; moves are clamped so the agent stays on the grid.
    if action == UP:
        next_state = [max(i - 1, 0), j]
    elif action == DOWN:
        next_state = [min(i + 1, high - 1), j]
    elif action == LEFT:
        next_state = [i, max(j - 1, 0)]
    elif action == RIGHT:
        next_state = [i, min(j + 1, length - 1)]
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on next_state.
        raise ValueError("unknown action: {!r}".format(action))

    # Rules, expressed on the departing cell and the action taken.
    # Entering a bottom-row cell strictly between START and GOAL (from above,
    # or by moving RIGHT out of START) ends the episode with -100.
    enters_penalty_cell = (
        (action == DOWN and i == (high - 2) and 0 < j < (length - 1))
        or (action == RIGHT and i == (high - 1) and j == 0)
    )
    # Moving DOWN from the cell directly above GOAL ends the episode with 0.
    enters_goal = action == DOWN and i == (high - 2) and j == (length - 1)

    if enters_penalty_cell:
        reward = -100
        done = True
    elif enters_goal:
        reward = 0
        done = True
    else:
        # Every other move costs -1 and the episode continues.
        reward = -1
        done = False

    return next_state, reward, done

In [6]:
def sarsa(num_episodes=500, epsilon=0.5, learning_rate=0.1, gamma=0.9):
    """
    State-Action-Reward-State-Action (SARSA).

    Trains the global q_values table in place using the epsilon-greedy
    policy, one update per environment step.
    :param num_episodes: number of episodes to run
    :param epsilon: exploration probability passed to policy()
    :param learning_rate: step size (alpha) of the update
    :param gamma: discount factor applied to the next state-action value
    :return: list with the sum of rewards collected in each episode
    """
    rewards = []
    for _ in range(num_episodes):
        state = reset_game()
        # SARSA is on-policy: the action used to bootstrap the update must be
        # the one that is actually executed next, so pick the first action
        # here and carry next_action forward at the end of every step.
        action = policy(state, epsilon)
        done = False
        reward_sum = 0

        while not done:
            next_state, reward, done = step(state, action)
            next_action = policy(next_state, epsilon)
            reward_sum += reward

            current_q = q_values[state[0]][state[1]][action]
            # No bootstrapping from a terminal transition.
            if done:
                next_q = 0.0
            else:
                next_q = q_values[next_state[0]][next_state[1]][next_action]

            # Q(S,A) <- Q(S,A) + alpha*(reward + gamma*Q(S_next, A_next) - Q(S,A))
            # gamma multiplies only Q(S_next, A_next); the previous code also
            # multiplied -Q(S,A) by gamma.
            q_values[state[0]][state[1]][action] = current_q + learning_rate * (
                reward + gamma * next_q - current_q
            )

            state = next_state
            action = next_action
        rewards.append(reward_sum)

    return rewards

In [7]:
def find_action_with_max_q(i,j):
    """
    Greedy action for grid cell (i, j).
    :param i: row index into q_values
    :param j: column index into q_values
    :return: index of the largest action value stored for that cell
    """
    action_values = q_values[i][j]
    return np.argmax(action_values)

In [8]:
def print_action(action):
    """
    Print the name of an action code (0..3); print nothing for other values.
    :param action: action code to print
    """
    labels = ("UP", "DOWN", "RIGHT", "LEFT")
    # Compare against 0, 1, 2, 3 in that order and stop at the first match.
    for code, label in enumerate(labels):
        if action == code:
            print(label)
            break

In [9]:
def change_position_after_action(state, action):
    """
    Return next state after action.

    Only the movement is computed (clamped to the grid); no reward or
    end-of-episode rule is applied here.
    :param state: current state as [row, column]
    :param action: one of UP, DOWN, RIGHT, LEFT
    :return: next state as a new [row, column] list
    :raises ValueError: if action is not one of the four known actions
    """
    i, j = state
    if action == UP:
        return [max(i - 1, 0), j]
    if action == DOWN:
        return [min(i + 1, high - 1), j]
    if action == LEFT:
        return [i, max(j - 1, 0)]
    if action == RIGHT:
        return [i, min(j + 1, length - 1)]
    # Previously an unknown action surfaced as an UnboundLocalError.
    raise ValueError("unknown action: {!r}".format(action))

In [10]:
def show_optimal_acion():
    """
    Show optimal action from START to GOAL.

    Follows the greedy (max q-value) action from START, printing each action
    as it is taken, until the position equals GOAL.
    NOTE(review): the walk uses change_position_after_action(), which only
    moves the agent; it does not stop on the transitions that step() treats
    as episode-ending with reward -100.
    :return: list of the actions taken, in order
    :raises RuntimeError: if the greedy walk returns to a position it has
        already visited, which means it would never reach GOAL
    """
    actions = []
    state = START
    visited = set()

    while True:
        # q_values do not change during the walk, so the greedy choice per
        # cell is fixed: seeing a cell twice means an endless cycle (this
        # includes bumping into a wall and staying in place).
        position = tuple(state)
        if position in visited:
            raise RuntimeError(
                "greedy policy revisits {} without reaching GOAL; "
                "q_values are probably not trained enough".format(list(position))
            )
        visited.add(position)

        action = find_action_with_max_q(state[0], state[1])
        print_action(action)
        actions.append(action)

        state = change_position_after_action(state, action)
        if state == GOAL:
            break

    return actions

In [11]:
# Train for 10000 episodes with 20% exploration; learning_rate and gamma keep
# their defaults. sarsa() updates the global q_values in place, so re-running
# this cell continues from the current table instead of starting from zeros.
# NOTE(review): no random seed is set in the visible cells, so results differ
# between runs.
rewards = sarsa(num_episodes=10000, epsilon=0.2)

In [12]:
# Print the greedy path from START to GOAL and keep the action list; the
# animation cell below reads `actions`.
actions = show_optimal_acion()

UP
UP
UP
RIGHT
RIGHT
RIGHT
RIGHT
RIGHT
RIGHT
RIGHT
RIGHT
RIGHT
RIGHT
RIGHT
DOWN
DOWN
DOWN


In [13]:
%%capture
from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML
from copy import deepcopy

fig, ax = plt.subplots(figsize=(5, 5))

# Initial frame of the animation, built from the environment configuration
# (high, length, START) instead of a hard-coded 4x12 literal so the picture
# cannot drift from the grid the agent was trained on.
# Cell values are colour codes for imshow:
#   .2 = ordinary cell (GOAL is drawn like an ordinary cell)
#   .5 = bottom-row cells strictly between START and GOAL
#   .1 = the agent's current position
state = np.full((high, length), .2)
state[high - 1, 1:length - 1] = .5
state[START[0], START[1]] = .1

# One grid per animation frame; calculate_path() appends the rest.
states = [state]

def calculate_path():
    """
    Build one grid per action in the global `actions` list and append them to
    the global `states` list, moving the agent marker (.1) one cell per action.

    states[0] is kept as the initial frame; frames left over from a previous
    call are discarded first, so calling this again does not add duplicates.
    :raises ValueError: if `actions` contains a code other than 0, 1, 2, 3
    """
    # (row, column) offset per action code: 0=UP, 1=DOWN, 2=RIGHT, 3=LEFT.
    offsets = {0: (-1, 0), 1: (1, 0), 2: (0, 1), 3: (0, -1)}

    del states[1:]

    for action in actions:
        offset = offsets.get(action)
        if offset is None:
            raise ValueError("unknown action: {!r}".format(action))

        grid = states[-1].copy()
        marker_rows, marker_cols = np.where(grid == .1)
        # Index the first match explicitly; calling int() on the 1-element
        # arrays returned by np.where is deprecated in recent NumPy.
        row, col = marker_rows[0], marker_cols[0]

        # Clamp to the grid the same way the environment does, so a move into
        # a wall keeps the marker in place instead of wrapping around
        # (negative index) or raising IndexError.
        n_rows, n_cols = grid.shape
        new_row = min(max(row + offset[0], 0), n_rows - 1)
        new_col = min(max(col + offset[1], 0), n_cols - 1)

        # The vacated cell is drawn as an ordinary cell.
        grid[row, col] = .2
        grid[new_row, new_col] = .1
        states.append(grid)
        
# Fill `states` with one frame per entry in `actions` (reads both globals).
calculate_path()

def update(i):
    """
    Draw animation frame i on the shared axes.
    :param i: index into the global `states` list
    """
    # Drop the previous frame's image first; without this every call adds
    # another image on top of the old ones and the axes keep growing.
    ax.clear()
    ax.imshow(states[i])
    ax.set_title("State {}".format(i), fontsize=20)
    # clear() resets axis visibility, so hide the axes again on every frame.
    ax.set_axis_off()

# Frame indices run 0..len(actions): the initial grid plus one grid per action.
# `interval` is the delay between frames in milliseconds.
anim = FuncAnimation(fig, update, frames=np.arange(0, len(actions)+1), interval=160)
# NOTE(review): this cell starts with %%capture, so the HTML produced here is
# presumably not displayed; the following cell shows the animation.
HTML(anim.to_jshtml())

In [14]:
# Show the animation as an interactive JavaScript/HTML player.
HTML(anim.to_jshtml())