In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [266]:
class Grid:
    """Rectangular grid world with a start cell, a goal cell, obstacles and a Q-table.

    Cells of ``self.grid`` are coded 0 (free), 1 (start), 2 (goal) and
    3 (obstacle). A state is a ``(row, col)`` pair. ``UP`` increases the row
    index and ``DOWN`` decreases it; ``plot_grid`` flips the rows before
    drawing so that row 0 appears at the bottom of the image.

    Parameters
    ----------
    nrows, ncols : int
        Grid dimensions.
    pos_start, pos_goal : sequence of two ints
        ``(row, col)`` of the start and goal cells.
    pos_obstacles : array-like of shape (k, 2), or empty / None
        ``(row, col)`` of every obstacle cell.
    epsilon : float, default 0.9
        Greediness of the policy. The greedy action is selected with
        probability ``epsilon + (1 - epsilon) / 4`` and every other action
        with an equal share of the remainder, so a *larger* value means a
        *greedier* policy.
    """

    def __init__(self, nrows, ncols, pos_start, pos_goal, pos_obstacles, epsilon=0.9):
        self.nrows = nrows
        self.ncols = ncols
        self.grid = np.zeros((nrows, ncols))
        self.pos_start = pos_start
        self.pos_goal = pos_goal
        self.pos_obstacles = pos_obstacles
        self.epsilon = epsilon

        # Action ids; they double as indices into the last axis of ``self.Q``.
        self.UP = 0
        self.RIGHT = 1
        self.DOWN = 2
        self.LEFT = 3

        self.set_grid()
        self.Q = self.initialise_q_values()

    def set_grid(self):
        """Write the start (1), goal (2) and obstacle (3) codes into ``self.grid``."""
        self.grid[self.pos_start[0], self.pos_start[1]] = 1
        self.grid[self.pos_goal[0], self.pos_goal[1]] = 2

        # Normalise to a (k, 2) integer array so that an empty list, ``None``
        # or a single (row, col) pair are all accepted.
        if self.pos_obstacles is None:
            obstacles = np.empty((0, 2), dtype=int)
        else:
            obstacles = np.asarray(self.pos_obstacles, dtype=int).reshape(-1, 2)
        # Obstacles are written last, so they win over start/goal on overlap.
        self.grid[obstacles[:, 0], obstacles[:, 1]] = 3

    def initialise_q_values(self):
        """Return a random ``(nrows, ncols, 4)`` Q-table with zeros at the goal cell."""
        # Use the instance dimensions rather than same-named notebook globals.
        Q = np.random.rand(self.nrows, self.ncols, 4)
        Q[self.pos_goal[0], self.pos_goal[1], :] = 0.0
        return Q

    def get_available_actions(self, state):
        """Return the actions that keep the agent inside the grid.

        The result is ordered UP, RIGHT, DOWN, LEFT. It may be empty on a
        1x1 grid.

        Raises
        ------
        AssertionError
            If ``state`` lies outside the grid.
        """
        assert state[0] >= 0 and state[0] < self.nrows, "Row index excedded"
        assert state[1] >= 0 and state[1] < self.ncols, "Column index excedded"

        row, col = state[0], state[1]
        available = []
        if row < self.nrows - 1:  # UP moves to row + 1
            available.append(self.UP)
        if col < self.ncols - 1:
            available.append(self.RIGHT)
        if row > 0:  # DOWN moves to row - 1
            available.append(self.DOWN)
        if col > 0:
            available.append(self.LEFT)
        return available

    def _action_probabilities(self, state, epsilon):
        """Return the selection probability of each of the four actions at ``state``."""
        n_actions = 4
        greedy_action = np.argmax(self.Q[state[0], state[1]])

        greedy_prob = epsilon + (1.0 - epsilon) / n_actions
        rem_prob = (1.0 - greedy_prob) / (n_actions - 1)

        probs = [rem_prob] * n_actions
        probs[greedy_action] = greedy_prob
        return probs

    def choose_action(self, state, epsilon):
        """Sample one of the four actions, ignoring grid borders.

        The action with the largest Q-value at ``state`` gets probability
        ``epsilon + (1 - epsilon) / 4``; the other three share the rest.
        The returned action may lead off the grid; ``step`` handles that.
        """
        actions = [self.UP, self.RIGHT, self.DOWN, self.LEFT]
        return np.random.choice(actions, p=self._action_probabilities(state, epsilon))

    def step(self, state):
        """Sample a legal action at ``state`` and return the resulting ``(row, col)``.

        Legal actions are drawn with the ``choose_action`` probabilities
        renormalised over the moves that stay on the grid. This gives the
        same distribution as redrawing until a legal action appears, but it
        cannot loop forever when all probability mass sits on an illegal move.
        """
        legal = self.get_available_actions(state)
        if not legal:
            # Only possible on a 1x1 grid: there is nowhere to go.
            return (state[0], state[1])

        probs = self._action_probabilities(state, self.epsilon)
        weights = np.array([probs[a] for a in legal], dtype=float)
        total = weights.sum()
        if total > 0:
            weights = weights / total
        else:
            # The greedy action is illegal and carries all the mass
            # (epsilon == 1): fall back to a uniform choice among legal moves.
            weights = np.full(len(legal), 1.0 / len(legal))

        action = int(np.random.choice(legal, p=weights))

        deltas = {
            self.UP: (1, 0),
            self.RIGHT: (0, 1),
            self.DOWN: (-1, 0),
            self.LEFT: (0, -1),
        }
        d_row, d_col = deltas[action]
        return (state[0] + d_row, state[1] + d_col)

    def get_reward(self, state_1):
        """Return the reward for entering ``state_1``: 20 goal, -100 obstacle, -1 otherwise."""
        cell = self.grid[state_1[0], state_1[1]]
        if cell == 2:
            return 20
        elif cell == 3:
            return -100
        else:
            return -1

    def plot_grid(self):
        """Draw the grid with start and goal in the same colour, row 0 at the bottom."""
        # Paint on a copy: writing the goal code into ``self.grid`` at the
        # start cell would make ``get_reward`` treat the start as a goal.
        canvas = self.grid.copy()
        canvas[self.pos_start[0], self.pos_start[1]] = 2
        canvas[self.pos_goal[0], self.pos_goal[1]] = 2
        plt.imshow(np.flip(canvas, axis=0))

In [267]:
# World layout: 4 rows by 21 columns, with start and goal at the two ends of row 0.
nrows, ncols = 4, 21
pos_start = (0, 0)
pos_goal = (0, ncols - 1)

# Every cell of row 0 strictly between the start and goal columns is an obstacle.
pos_obstacles = np.column_stack(
    (np.zeros(ncols - 2, dtype=int), np.arange(1, ncols - 1))
)

grid = Grid(
    nrows,
    ncols,
    pos_start,
    pos_goal,
    pos_obstacles,
)

In [282]:
# `state` is set to the start cell but is not used by the call below.
state=pos_start
# Take one step from cell (row 3, col 20); the cell's output is the resulting (row, col).
# NOTE(review): (3, 20) is the last row/column only for the 4x21 layout — confirm the kernel state matches.
grid.step((3, 20))

(3, 19)

In [252]:
# Show the four action values stored for cell (row 3, col 20).
grid.Q[3, 20]

array([0.14387393, 0.28822011, 0.29245218, 0.98957469])