In [4]:
import numpy as np
import pprint
from operator import itemgetter
from mdp_matrix import GridWorld
from sarsa import sarsa
from expected_sarsa import expected_sarsa
from double_sarsa import double_sarsa
from double_expected_sarsa import double_expected_sarsa

In [29]:
import numpy as np

class MDP:
    """Plain container for the pieces of a finite Markov decision process."""

    def __init__(self, T, S, R, A, act_list, terminal_states):
        """Store the MDP description.

        T               -- transition probabilities, indexed (start_state, action, end_state);
                           converted to an ndarray
        S               -- integer number of states
        R               -- reward for each state; converted to an ndarray
        A               -- integer number of possible actions
        act_list        -- the possible actions in the MDP
        terminal_states -- container of terminal states (only membership tests are used here)
        """
        # Sizes of the state and action spaces
        self.S, self.A = S, A

        # Array-valued members are converted with np.array
        self.T = np.array(T)
        self.R = np.array(R)

        # Stored exactly as given
        self.actions = act_list
        self.terminal_states = terminal_states

    def is_terminal(self, s):
        """Return True when state s is one of the terminal states."""
        terminals = self.terminal_states
        return s in terminals
    
class StochasticGridWorld(MDP):
    """Square grid world in which a move can slip into one of the other three directions.

    States are numbered row by row: state s sits at row s // grid_size and
    column s % grid_size. The chosen action is carried out with probability
    p_success; each of the three remaining directions is taken with probability
    (1 - p_success) / 3. Any move that would leave the grid keeps the agent in
    its current cell instead, so every row of T sums to one.
    """

    def __init__(self, grid_size, reward_pos, terminal_states, p_success = 0.7):
        """Build the reward vector and transition tensor, then initialise the MDP base.

        grid_size       -- side length of the square grid
        reward_pos      -- rows of (x, y, reward); x indexes the grid row, y the column.
                           Cells that are not listed keep reward 0.
        terminal_states -- passed unchanged to MDP
        p_success       -- probability that the chosen action is the one executed
        """
        S = grid_size*grid_size

        # Each row of reward_pos is a tuple: x, y, reward
        R = np.zeros((grid_size, grid_size))
        for row in reward_pos:
            R[row[0], row[1]] = row[2]
        R = R.flatten()

        A = 4
        act_list = ['S', 'E', 'N', 'W']
        # (row, column) offset for each action, in act_list order: 0:S, 1:E, 2:N, 3:W
        moves = [(1, 0), (0, 1), (-1, 0), (0, -1)]
        p_slip = (1.0-p_success)/3.0

        T = np.zeros((S, A, S))
        for start_state in range(S):
            # Floor division keeps the row index an integer on Python 2 and Python 3
            state_i = start_state//grid_size
            state_j = start_state%grid_size

            for act in range(A):
                feas_grid = np.zeros((grid_size, grid_size))
                # Handle the intended direction first, then the three slip directions
                for direction in [act] + [d for d in range(A) if d != act]:
                    prob = p_success if direction == act else p_slip
                    next_i = state_i + moves[direction][0]
                    next_j = state_j + moves[direction][1]
                    if 0 <= next_i < grid_size and 0 <= next_j < grid_size:
                        feas_grid[next_i, next_j] = prob
                    else:
                        # Blocked by the grid edge: that probability mass stays put
                        feas_grid[state_i, state_j] += prob

                # Flatten the feasibility grid and assign to transition matrix
                T[start_state, act, :] = feas_grid.flatten()
        MDP.__init__(self, T, S, R, A, act_list, terminal_states)

class WindyGridCliffMazeWorld(MDP):
    """Square maze world with slipping moves, wall cells and trap cells.

    States are numbered row by row: state s sits at row s // grid_size and
    column s % grid_size. The chosen action is carried out with probability
    p_success; each of the three remaining directions is taken with probability
    (1 - p_success) / 3. A move into a wall cell or off the grid keeps the agent
    in its current cell. From a trap state every action leads to the initial
    state with probability 1, and the trap state's reward is -25.

    terminal_states are only handed to MDP; the transition tensor built here is
    not modified for them.
    """

    def __init__(self, grid_size, reward_pos, terminal_states, traps, initial_state = [0,0], obstacles = None, p_success = 0.7):
        """Build the reward vector and transition tensor, then initialise the MDP base.

        grid_size       -- side length of the square grid
        reward_pos      -- rows of (x, y, reward); x indexes the grid row, y the column
        terminal_states -- passed unchanged to MDP
        traps           -- sequence indexed by state number; an entry equal to 1 marks a trap
        initial_state   -- [row, column] of the start cell; stored on self.initial_state
                           as a flat state number
        obstacles       -- sequence indexed by state number whose entries are
                           [row, column, flag]; flag == 1 marks a wall. When omitted,
                           a wall-free grid of the requested size is used.
        p_success       -- probability that the chosen action is the one executed
        """
        S = grid_size*grid_size

        if obstacles is None:
            # Size the default to this grid rather than assuming a 10x10 world
            obstacles = [[i, j, 0] for i in range(grid_size) for j in range(grid_size)]

        # Each row of reward_pos is a tuple: x, y, reward
        R = np.zeros((grid_size, grid_size))
        for row in reward_pos:
            R[row[0], row[1]] = row[2]
        R = R.flatten()

        # Trap states override whatever reward_pos assigned to them
        for state, flag in enumerate(traps):
            if flag == 1:
                R[state] = -25

        # Define actions
        A = 4
        act_list = ['S', 'E', 'N', 'W']
        # (row, column) offset for each action, in act_list order: 0:S, 1:E, 2:N, 3:W
        moves = [(1, 0), (0, 1), (-1, 0), (0, -1)]
        p_slip = (1.0-p_success)/3.0

        # Set start state value
        self.initial_state = initial_state[0]*grid_size + initial_state[1]

        T = np.zeros((S, A, S))
        for start_state in range(S):
            # Floor division keeps the row index an integer on Python 2 and Python 3
            state_i = start_state//grid_size
            state_j = start_state%grid_size
            in_trap = traps[state_i*grid_size + state_j] == 1

            for act in range(A):
                feas_grid = np.zeros((grid_size, grid_size))
                if in_trap:
                    # Every action taken in a trap sends the agent back to the start cell
                    feas_grid[initial_state[0], initial_state[1]] = 1
                else:
                    # Handle the intended direction first, then the three slip directions
                    for direction in [act] + [d for d in range(A) if d != act]:
                        prob = p_success if direction == act else p_slip
                        next_i = state_i + moves[direction][0]
                        next_j = state_j + moves[direction][1]
                        inside = 0 <= next_i < grid_size and 0 <= next_j < grid_size
                        if inside and obstacles[next_i*grid_size + next_j][2] != 1:
                            feas_grid[next_i, next_j] = prob
                        else:
                            # Blocked by an edge or a wall: that probability mass stays put
                            feas_grid[state_i, state_j] += prob

                # Flatten the feasibility grid and assign to transition matrix
                T[start_state, act, :] = feas_grid.flatten()
        MDP.__init__(self, T, S, R, A, act_list, terminal_states)

In [57]:
%load_ext autoreload
%autoreload 2
rewards = [[i, j, -1.0] for i in range(10) for j in range(10)]
rewards[59] = [5, 9, 50]

terminal_states = [59]

obstacles = [[i, j, 0] for i in range(10) for j in range(10)]
obstacles[0*10+3] = [0, 3, 1] 
obstacles[0*10+8] = [0, 8, 1] 
obstacles[1*10+1] = [1, 1, 1] 
obstacles[1*10+4] = [1, 4, 1] 
obstacles[1*10+5] = [1, 5, 1] 
obstacles[1*10+6] = [1, 6, 1] 
obstacles[2*10+1] = [2, 1, 1] 
obstacles[2*10+8] = [2, 8, 1] 
obstacles[3*10+1] = [3, 1, 1] 
obstacles[3*10+4] = [3, 4, 1] 
obstacles[3*10+5] = [3, 5, 1] 
obstacles[3*10+6] = [3, 6, 1] 
obstacles[5*10+0] = [5, 0, 1] 
obstacles[5*10+5] = [5, 5, 1] 
obstacles[6*10+3] = [6, 3, 1] 
obstacles[6*10+4] = [6, 4, 1] 
obstacles[6*10+5] = [6, 5, 1] 
obstacles[6*10+6] = [6, 6, 1] 
obstacles[6*10+8] = [6, 8, 1] 
obstacles[7*10+8] = [7, 8, 1] 
obstacles[9*10+4] = [9, 4, 1] 

start_state = [0, 0]

traps = [0]*100
traps[4] = 1
traps[9] = 1
traps[13] = 1
traps[33] = 1
traps[43] = 1
traps[47] = 1
traps[67] = 1
traps[72] = 1
traps[96] = 1

gw = WindyGridCliffMazeWorld(10, rewards, terminal_states, traps, start_state, obstacles)
print np.reshape(gw.R, (10,10))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[[ -1.  -1.  -1.  -1. -25.  -1.  -1.  -1.  -1. -25.]
 [ -1.  -1.  -1. -25.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1. -25.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1. -25.  -1.  -1.  -1. -25.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  50.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1. -25.  -1.  -1.]
 [ -1.  -1. -25.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1. -25.  -1.  -1.  -1.]]


In [49]:
# Sanity check: one obstacle entry per grid cell
print(len(obstacles))

100


In [69]:
# Next-state distribution for state 40 (row 4, column 0) under action 1 (East), shown on the grid
print(np.reshape(gw.T[40, 1, :], (10,10)))

[[ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.1  0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.2  0.7  0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]]


In [11]:
%load_ext autoreload
%autoreload 2
# NOTE(review): test_rewards is not defined in any visible cell -- confirm where it comes
# from before relying on Restart & Run All. This assignment also rebinds gw, so cells
# below run against this GridWorld rather than the maze world built earlier.
gw = GridWorld(10, test_rewards, terminal_states)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
# Run SARSA on the current world; the second argument is 10000
# (presumably the number of episodes -- confirm against sarsa.py)
sarsa_Q, sarsa_avg_reward, sarsa_max_reward, sarsa_reward_per_episode = sarsa(gw, 10000)
print(sarsa_avg_reward)
print(sarsa_max_reward)
# print(sarsa_reward_per_episode)

-3718.0851
34.0


20.0
[-2516.0, -936.0, -933.0, -2355.0, -2853.0, -6133.0, -471.0, -305.0, -7439.0, -793.0, -453.0, -953.0, -1465.0, -48.0, -881.0, -428.0, -1226.0, -115.0, -458.0, -742.0, -2963.0, -1582.0, -1615.0, -5769.0, -3048.0, -4986.0, -14554.0, -43194.0, -7148.0, -3665.0, -628.0, -782.0, -106.0, -902.0, -11.0, -272.0, -1492.0, -2825.0, -5695.0, -3614.0, -9096.0, -118.0, -310.0, -11754.0, -2113.0, -7624.0, -19383.0, -20471.0, -1626.0, -1355.0, -598.0, -2167.0, -295.0, -6943.0, -5110.0, -1408.0, -18897.0, -9127.0, -9507.0, -15645.0, -16774.0, -8571.0, -7847.0, -91519.0, -73276.0, -83134.0, -187798.0, -16510.0, -144424.0, -43508.0, -9551.0, -446.0, -2978.0, -13842.0, -3760.0, -13862.0, -655.0, -3742.0, -2402.0, -19484.0, -415.0, -5222.0, -4321.0, -209.0, -153.0, -503.0, -60.0, -714.0, -4288.0, -167.0, -831.0, -1162.0, -301.0, -814.0, -8465.0, -112.0, -311.0, -8940.0, -1994.0, -1624.0, -2800.0, -4183.0, -426.0, -8303.0, -124.0, -67.0, -3785.0, -371.0, -203.0, -82.0, -1005.0, -495.0, -2301.0, -774.0

In [71]:
# Index of the highest-valued action for each state, laid out on the 10x10 grid
print(np.reshape(np.argmax(sarsa_Q, 1), (10,10)))

[[0 3 3 0 0 1 1 0 0 3]
 [1 0 2 2 0 0 0 1 1 0]
 [2 0 2 3 1 1 1 2 0 0]
 [2 0 2 3 0 0 0 1 1 0]
 [2 0 0 1 1 1 0 2 1 0]
 [0 0 3 1 0 0 1 1 1 0]
 [0 3 2 0 0 0 0 2 0 2]
 [1 0 3 1 1 1 1 0 0 2]
 [0 0 0 1 1 2 3 1 1 2]
 [1 1 1 2 0 2 3 2 1 2]]


In [None]:
# NOTE(review): unpacks two return values, unlike the four unpacked from sarsa() -- confirm
# against expected_sarsa.py
expected_sarsa_Q, expected_sarsa_avg = expected_sarsa(gw, 1000)
# The worlds built in this notebook are 10x10, so the per-state argmax needs a (10,10) layout
print(np.reshape(np.argmax(expected_sarsa_Q, 1), (10,10)))
print(expected_sarsa_avg)

In [None]:
# Run double SARSA on the current world and report the returned average
double_sarsa_Q, double_sarsa_avg = double_sarsa(gw, 100000)
print(double_sarsa_avg)

In [None]:
# Run double expected SARSA on the current world and report the returned average
double_expected_sarsa_Q, double_expected_sarsa_avg = double_expected_sarsa(gw, 100000)
print(double_expected_sarsa_avg)

In [74]:
# Imported in this cell so it runs top to bottom without relying on a later cell
from IPython.core.display import Image, display
display(Image(url="images/maze.png", width=300, unconfined=True))

In [72]:
from IPython.core.display import Image, display
# Label the figure with its epsilon setting, then render the saved image
print("Epsilon = .1")
figure = Image(url="images/stochastic_gw_rwd_vs_alpha.png", width=500, unconfined=True)
display(figure)

Epsilon = .1


In [None]:
As expected in 