In [2]:
import numpy as np

In [3]:
# Every move an agent may attempt: Up, Down, Left, Right.
ACTION_SPACE = ("U", "D", "L", "R")

In [4]:
class GridWorld:
    def __init__(self, rows, cols, start_pos : tuple) :
        self.rows = rows
        self.cols = cols
        self.i = start_pos[0]
        self.j = start_pos[1]
        
    def set_rewards(self, rewards) :
        self.rewards = rewards
        
    def set_actions(self, actions):
        #actions : dict if (i, j) : A
        self.actions = actions
        
    def set_state(self, state) :
        self.i = state[0]
        self.j = state[1]
        
    def current_state(self) :
        return self.i, self.j
    
    def is_terminal(self, s) :
        #terminal state : state from where we cannot move from
        return s not in self.actions
    
    def get_next_state(self, s, a) :
        #action is here is just one of ['U', 'D', 'L', 'R']
        i, j = s[0], s[1]
        if a in self.actions[(i, j)] :
            if a == 'U' :
                i -= 1
            elif a == 'D' :
                i += 1
            elif a == 'L' :
                j -= 1
            elif a == 'R' :
                j += 1
        return i, j
    
    def move(self, a) :
        if a in self.actions[(self.i, self.j)] :
            if a == 'U' :
                self.i -= 1
            elif a == 'D' :
                self.i += 1
            elif a == 'L' :
                self.j -= 1
            elif a == 'R' :
                self.j += 1
        return self.rewards.get((self.i, self.j), 0) #default reward 0
    
    def undo_move(self, a) :
        if a == 'U' :
            self.i += 1
        elif a == 'D' :
            self.i -= 1
        elif a == 'L' :
            self.j += 1
        elif a == 'R' :
            self.j -= 1
        assert(self.current_state() in self.all_states())
        
    def game_over(self) :
        return (self.i, self.j) not in self.actions
    
    def all_states(self) :
        return set(self.actions.keys()) | set(self.rewards.keys())
    
def standard_grid():
    """Build the 3x4 deterministic grid world with the agent at (2, 0).

    (0, 3) carries reward +1 and (1, 3) carries reward -1; neither has an
    entry in the action table, so both are terminal. (1, 1) appears in
    neither table and no listed action leads into it.

    Returns:
        A ``GridWorld`` with its reward and action tables attached.
    """
    g = GridWorld(3, 4, (2, 0))
    rewards = {(0, 3): 1, (1, 3): -1}
    # Only 'U', 'D', 'L', 'R' are understood by GridWorld; the downward
    # move must therefore be spelled 'D' or it silently becomes a no-op.
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('R', 'L'),
        (0, 2): ('R', 'L', 'D'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'R', 'D'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'U', 'R'),
        (2, 3): ('L', 'U'),
    }
    g.set_rewards(rewards)
    g.set_actions(actions)
    return g

In [6]:
class WindyGrid:
    """Grid environment whose transitions are sampled from ``p(s' | s, a)``.

    A state is an ``(i, j)`` pair. The reward table, the per-state action
    table and the transition table are attached after construction via
    ``set_rewards``, ``set_actions`` and ``set_probs``.
    """

    def __init__(self, rows, cols, start_pos: tuple):
        self.rows = rows
        self.cols = cols
        self.i = start_pos[0]
        self.j = start_pos[1]

    def set_rewards(self, rewards):
        """Attach the reward table: a dict mapping ``(i, j)`` to a reward."""
        self.rewards = rewards

    def set_actions(self, actions):
        """Attach the action table: a dict mapping ``(i, j)`` to allowed actions."""
        self.actions = actions

    def set_probs(self, probs):
        """Attach the transition table: ``{(s, a): {s2: p(s2 | s, a)}}``."""
        self.probs = probs

    def set_state(self, state):
        """Place the agent at ``state``, an ``(i, j)`` pair."""
        self.i = state[0]
        self.j = state[1]

    def current_state(self):
        """Return the agent position as ``(i, j)``."""
        return self.i, self.j

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in the action table."""
        return s not in self.actions

    def move(self, a):
        """Sample the next state for action ``a`` and return its reward.

        The successor is drawn from ``self.probs[(s, a)]`` where ``s`` is
        the current state. States absent from the reward table yield 0.

        Raises:
            KeyError: if ``(s, a)`` has no entry in the transition table.
        """
        s = (self.i, self.j)
        next_state_probs = self.probs[(s, a)]
        next_states = list(next_state_probs.keys())
        next_probs = list(next_state_probs.values())
        # np.random.choice only accepts 1-D input, so sample an index and
        # look the (i, j) tuple up afterwards.
        idx = np.random.choice(len(next_states), p=next_probs)
        s2 = next_states[idx]
        self.i, self.j = s2

        return self.rewards.get(s2, 0)  # default reward 0

    def undo_move(self, a):
        """Shift the agent opposite to the nominal direction of ``a``.

        The transition table is not consulted, so this restores the
        previous state only if the preceding move actually went in the
        direction named by ``a``.
        """
        if a == 'U':
            self.i += 1
        elif a == 'D':
            self.i -= 1
        elif a == 'L':
            self.j += 1
        elif a == 'R':
            self.j -= 1
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True when the agent currently sits on a terminal state."""
        return (self.i, self.j) not in self.actions

    def all_states(self):
        """Return the set of states that appear in the action or reward table."""
        return set(self.actions.keys()) | set(self.rewards.keys())
    
def windy_grid():
    """Build the 3x4 windy grid world with the agent at (2, 0).

    (0, 3) carries reward +1 and (1, 3) carries reward -1; neither has an
    entry in the action table, so both are terminal. All transitions are
    deterministic except 'U' from (1, 2), which lands on (0, 2) or (1, 3)
    with probability 0.5 each.

    Returns:
        A ``WindyGrid`` with reward, action and transition tables attached.
    """
    g = WindyGrid(3, 4, (2, 0))
    rewards = {(0, 3): 1, (1, 3): -1}
    # Action names must match the keys of `probs` below, which only knows
    # 'U', 'D', 'L', 'R'.
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('R', 'L'),
        (0, 2): ('R', 'L', 'D'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'R', 'D'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'U', 'R'),
        (2, 3): ('L', 'U'),
    }
    # p(s' | s, a) represented as:
    # KEY: (s, a) --> VALUE: {s': p(s' | s, a)}
    probs = {
        ((2, 0), 'U'): {(1, 0): 1.0},
        ((2, 0), 'D'): {(2, 0): 1.0},
        ((2, 0), 'L'): {(2, 0): 1.0},
        ((2, 0), 'R'): {(2, 1): 1.0},
        ((1, 0), 'U'): {(0, 0): 1.0},
        ((1, 0), 'D'): {(2, 0): 1.0},
        ((1, 0), 'L'): {(1, 0): 1.0},
        ((1, 0), 'R'): {(1, 0): 1.0},
        ((0, 0), 'U'): {(0, 0): 1.0},
        ((0, 0), 'D'): {(1, 0): 1.0},
        ((0, 0), 'L'): {(0, 0): 1.0},
        ((0, 0), 'R'): {(0, 1): 1.0},
        ((0, 1), 'U'): {(0, 1): 1.0},
        ((0, 1), 'D'): {(0, 1): 1.0},
        ((0, 1), 'L'): {(0, 0): 1.0},
        ((0, 1), 'R'): {(0, 2): 1.0},
        ((0, 2), 'U'): {(0, 2): 1.0},
        ((0, 2), 'D'): {(1, 2): 1.0},
        ((0, 2), 'L'): {(0, 1): 1.0},
        ((0, 2), 'R'): {(0, 3): 1.0},
        ((2, 1), 'U'): {(2, 1): 1.0},
        ((2, 1), 'D'): {(2, 1): 1.0},
        ((2, 1), 'L'): {(2, 0): 1.0},
        ((2, 1), 'R'): {(2, 2): 1.0},
        ((2, 2), 'U'): {(1, 2): 1.0},
        ((2, 2), 'D'): {(2, 2): 1.0},
        ((2, 2), 'L'): {(2, 1): 1.0},
        ((2, 2), 'R'): {(2, 3): 1.0},
        ((2, 3), 'U'): {(1, 3): 1.0},
        ((2, 3), 'D'): {(2, 3): 1.0},
        ((2, 3), 'L'): {(2, 2): 1.0},
        ((2, 3), 'R'): {(2, 3): 1.0},
        # The "wind": going up from (1, 2) may blow the agent into (1, 3).
        ((1, 2), 'U'): {(0, 2): 0.5, (1, 3): 0.5},
        ((1, 2), 'D'): {(2, 2): 1.0},
        ((1, 2), 'L'): {(1, 2): 1.0},
        ((1, 2), 'R'): {(1, 3): 1.0},
    }
    g.set_rewards(rewards)
    g.set_actions(actions)
    g.set_probs(probs)
    return g

def windy_grid_penalized(step_cost=-0.1):
    """Build the 3x4 windy grid world where every non-terminal state costs a step.

    Same layout as the plain windy grid: the agent starts at (2, 0),
    (0, 3) carries reward +1 and (1, 3) carries reward -1, and 'U' from
    (1, 2) lands on (0, 2) or (1, 3) with probability 0.5 each.

    Args:
        step_cost: reward assigned to every state that has an entry in the
            action table (i.e. every non-terminal state).

    Returns:
        A ``WindyGrid`` with reward, action and transition tables attached.
    """
    g = WindyGrid(3, 4, (2, 0))
    # Action names must match the keys of `probs` below, which only knows
    # 'U', 'D', 'L', 'R'.
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('R', 'L'),
        (0, 2): ('R', 'L', 'D'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'R', 'D'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'U', 'R'),
        (2, 3): ('L', 'U'),
    }
    # Every state the agent can act from is charged the step cost; the two
    # terminal states keep their fixed payoffs.
    rewards = {state: step_cost for state in actions}
    rewards[(0, 3)] = 1
    rewards[(1, 3)] = -1
    # p(s' | s, a) represented as:
    # KEY: (s, a) --> VALUE: {s': p(s' | s, a)}
    probs = {
        ((2, 0), 'U'): {(1, 0): 1.0},
        ((2, 0), 'D'): {(2, 0): 1.0},
        ((2, 0), 'L'): {(2, 0): 1.0},
        ((2, 0), 'R'): {(2, 1): 1.0},
        ((1, 0), 'U'): {(0, 0): 1.0},
        ((1, 0), 'D'): {(2, 0): 1.0},
        ((1, 0), 'L'): {(1, 0): 1.0},
        ((1, 0), 'R'): {(1, 0): 1.0},
        ((0, 0), 'U'): {(0, 0): 1.0},
        ((0, 0), 'D'): {(1, 0): 1.0},
        ((0, 0), 'L'): {(0, 0): 1.0},
        ((0, 0), 'R'): {(0, 1): 1.0},
        ((0, 1), 'U'): {(0, 1): 1.0},
        ((0, 1), 'D'): {(0, 1): 1.0},
        ((0, 1), 'L'): {(0, 0): 1.0},
        ((0, 1), 'R'): {(0, 2): 1.0},
        ((0, 2), 'U'): {(0, 2): 1.0},
        ((0, 2), 'D'): {(1, 2): 1.0},
        ((0, 2), 'L'): {(0, 1): 1.0},
        ((0, 2), 'R'): {(0, 3): 1.0},
        ((2, 1), 'U'): {(2, 1): 1.0},
        ((2, 1), 'D'): {(2, 1): 1.0},
        ((2, 1), 'L'): {(2, 0): 1.0},
        ((2, 1), 'R'): {(2, 2): 1.0},
        ((2, 2), 'U'): {(1, 2): 1.0},
        ((2, 2), 'D'): {(2, 2): 1.0},
        ((2, 2), 'L'): {(2, 1): 1.0},
        ((2, 2), 'R'): {(2, 3): 1.0},
        ((2, 3), 'U'): {(1, 3): 1.0},
        ((2, 3), 'D'): {(2, 3): 1.0},
        ((2, 3), 'L'): {(2, 2): 1.0},
        ((2, 3), 'R'): {(2, 3): 1.0},
        # The "wind": going up from (1, 2) may blow the agent into (1, 3).
        ((1, 2), 'U'): {(0, 2): 0.5, (1, 3): 0.5},
        ((1, 2), 'D'): {(2, 2): 1.0},
        ((1, 2), 'L'): {(1, 2): 1.0},
        ((1, 2), 'R'): {(1, 3): 1.0},
    }
    g.set_rewards(rewards)
    g.set_actions(actions)
    g.set_probs(probs)
    return g