In [1]:
import numpy as np
import pprint
from operator import itemgetter
# from mdp_matrix import GridWorld
from sarsa import sarsa
from expected_sarsa import expected_sarsa
from double_sarsa import double_sarsa
from double_expected_sarsa import double_expected_sarsa

In [55]:
import numpy as np

class MDP:
    """Container for the pieces of a finite Markov decision process.

    Attributes set by the constructor:
        S: integer number of states.
        T: transition probabilities as an ndarray indexed
           (start_state, action, end_state).
        R: reward vector as an ndarray, one reward per state.
        A: integer number of possible actions.
        actions: the possible actions of the MDP, as passed in ``act_list``.
        terminal_states: container of terminal states, stored as given.
    """

    def __init__(self, T, S, R, A, act_list, terminal_states):
        # Sizes of the state space and the action space.
        self.S, self.A = S, A

        # Transition model and rewards are copied into numpy arrays.
        self.T, self.R = np.array(T), np.array(R)

        # Action labels and terminal states are kept exactly as supplied.
        self.actions = act_list
        self.terminal_states = terminal_states

    def is_terminal(self, s):
        """Return whether ``s`` is one of the terminal states."""
        terminals = self.terminal_states
        return s in terminals


class WindyGridCliffMazeWorld(MDP):
    """Square grid-world MDP with stochastic moves and optional obstacles.

    States are the grid cells flattened in row-major order, so cell
    ``(i, j)`` is state ``i * grid_size + j``. There are four actions,
    indexed 0:S (row + 1), 1:E (column + 1), 2:N (row - 1), 3:W (column - 1).

    Taking an action moves the agent in the chosen direction with probability
    ``p_success``; each of the other three directions is taken with
    probability ``(1 - p_success) / 3``. A move that would leave the grid or
    enter an obstacle cell keeps the agent in place, and that move's
    probability is added to the current cell instead.

    Parameters
    ----------
    grid_size : int
        Side length of the square grid.
    reward_pos : iterable of (row, col, reward)
        Rewards to place on the grid; cells not listed get reward 0.
    terminal_states : container
        Flattened indices of terminal states, passed through to ``MDP``.
    obstacles : list, optional
        One entry per cell in row-major order. Only the third element of the
        entry at position ``i * grid_size + j`` is read; a value of 1 marks
        cell ``(i, j)`` as blocked. Defaults to an obstacle-free grid of the
        requested size.
    p_success : float, optional
        Probability that the chosen action is the one executed (default 0.7).
    """

    def __init__(self, grid_size, reward_pos, terminal_states, obstacles=None,
                 p_success=0.7):
        # Build the default per call and per grid size: a shared, hard-coded
        # 10x10 default list would be indexed out of range for larger grids.
        if obstacles is None:
            obstacles = [[i, j, 0]
                         for i in range(grid_size)
                         for j in range(grid_size)]

        S = grid_size * grid_size

        # Each row of reward_pos is a tuple: x, y, reward
        R = np.zeros((grid_size, grid_size))
        for row in reward_pos:
            R[row[0], row[1]] = row[2]
        R = R.flatten()

        # Define actions; moves[k] is the (row, column) step of action k.
        A = 4
        act_list = ['S', 'E', 'N', 'W']
        moves = [(1, 0), (0, 1), (-1, 0), (0, -1)]

        p_slip = (1.0 - p_success) / 3.0

        T = np.zeros((S, A, S))
        for start_state in range(S):
            # Floor division keeps the row index an integer on Python 3 too.
            state_i = start_state // grid_size
            state_j = start_state % grid_size

            for act in range(A):
                feas_grid = np.zeros((grid_size, grid_size))

                # Handle the intended direction first, then the three slip
                # directions, so probability accumulates on the current cell
                # in the same order as a hand-written per-action branch would.
                ordered = [act] + [d for d in range(A) if d != act]
                for direction in ordered:
                    prob = p_success if direction == act else p_slip
                    next_i = state_i + moves[direction][0]
                    next_j = state_j + moves[direction][1]
                    in_grid = (0 <= next_i < grid_size and
                               0 <= next_j < grid_size)
                    if in_grid and obstacles[next_i * grid_size + next_j][2] != 1:
                        feas_grid[next_i, next_j] += prob
                    else:
                        # Blocked by a wall or an obstacle: stay in place.
                        feas_grid[state_i, state_j] += prob

                # Flatten the feasibility grid and assign to transition matrix
                T[start_state, act, :] = feas_grid.flatten()
        MDP.__init__(self, T, S, R, A, act_list, terminal_states)

In [56]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up.
%load_ext autoreload
%autoreload 2
# Reward of -1.0 on every cell of the 10x10 grid, one [row, col, reward] entry per cell.
test_rewards = [[i, j, -1.0] for i in range(10) for j in range(10)]
# test_rewards[2] = [0, 2, 1]
# Entry 99 is cell (9, 9), the last row and last column: reward 50.
test_rewards[99] = [9, 9, 50]
# test_rewards = [[0, 3, 5],
#                 [0, 1, 10]]
# Flattened index 99 (cell (9, 9)) is the only terminal state.
terminal_states = [99]
# Obstacle table, one [row, col, flag] entry per cell; flag 1 is set for cells (1, 0) and (0, 1).
obstacles = [[i, j, 0] for i in range(10) for j in range(10)]
obstacles[10] = [1, 0, 1]
obstacles[1] = [0, 1, 1]
# NOTE(review): `obstacles` is not passed to the constructor below, so the world is built with
# the constructor's default obstacle-free table — confirm whether this is intentional.
gw = WindyGridCliffMazeWorld(10, test_rewards, terminal_states)
# Show the rewards as a 10x10 grid, then the transition probabilities out of state 0 under action 0 (South).
print np.reshape(gw.R, (10,10))
print np.reshape(gw.T[0, 0, :], (10,10))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[[ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  50.]]
[[ 0.2  0.1  0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.7  0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.

In [53]:
# Run the imported sarsa routine on gw and unpack its two return values.
# The second argument (1000) is presumably the episode count — confirm in sarsa.py.
sarsa_Q, sarsa_avg_reward = sarsa(gw, 1000)
print(sarsa_avg_reward)

KeyboardInterrupt: 

In [5]:
# Run the imported expected_sarsa routine on gw and unpack its two return values.
# The second argument (1000) is presumably the episode count — confirm in expected_sarsa.py.
expected_sarsa_Q, expected_sarsa_avg = expected_sarsa(gw, 1000)
# Lay the per-row argmax out on the grid. The side length is derived from gw.S
# (grid_size * grid_size) instead of a hard-coded 5, which cannot hold the
# 100 states of the 10x10 world built earlier in this notebook.
# NOTE(review): assumes expected_sarsa_Q has one row per state — confirm in expected_sarsa.py.
grid_side = int(round(gw.S ** 0.5))
print(np.reshape(np.argmax(expected_sarsa_Q, 1), (grid_side, grid_side)))
print(expected_sarsa_avg)

KeyboardInterrupt: 

In [None]:
# Run the imported double_sarsa routine on gw and unpack its two return values.
# The second argument (100000) is presumably the episode count — confirm in double_sarsa.py.
double_sarsa_Q, double_sarsa_avg = double_sarsa(gw, 100000)
print(double_sarsa_avg)

In [None]:
# Run the imported double_expected_sarsa routine on gw and unpack its two return values.
# The second argument (100000) is presumably the episode count — confirm in double_expected_sarsa.py.
double_expected_sarsa_Q, double_expected_sarsa_avg = double_expected_sarsa(gw, 100000)
print(double_expected_sarsa_avg)