In [1]:
import numpy as np

## environment class

In [2]:
class Grid:
    """Grid-world environment that tracks a single agent position.

    A state is a ``(row, col)`` tuple.  ``rewards`` maps a state to the reward
    received on arriving there, and ``actions`` maps a state to the collection
    of actions that are legal from it.  A state with no entry in ``actions``
    is treated as terminal.
    """

    # (row, col) displacement produced by each action letter; shared by
    # move() and undo_move() so the two can never disagree
    _DELTAS = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    def __init__(self, rows, cols, start):
        """Create a ``rows`` x ``cols`` grid with the agent at ``start``.

        Args:
            rows: number of rows (used by the printing helpers).
            cols: number of columns (used by the printing helpers).
            start: ``(row, col)`` starting position of the agent.
        """
        self.rows = rows
        self.cols = cols
        self.i = start[0]
        self.j = start[1]
        # start empty so every query method works even before set() is called
        self.rewards = {}
        self.actions = {}

    def set(self, rewards, actions):
        """Install the reward table and the legal-action table.

        Args:
            rewards: dict of ``(row, col) -> reward``.
            actions: dict of ``(row, col) -> collection of legal actions``.
        """
        self.rewards = rewards
        self.actions = actions

    def set_state(self, s):
        """Place the agent at state ``s`` (a ``(row, col)`` pair)."""
        self.i = s[0]
        self.j = s[1]

    def current_state(self):
        """Return the agent's position as a ``(row, col)`` tuple."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True if no actions are defined for state ``s``."""
        return s not in self.actions

    def move(self, action):
        """Apply ``action`` if it is legal in the current state.

        An illegal action leaves the agent where it is.  Calling this from a
        terminal state (no entry in ``actions``) is also a no-op instead of
        raising ``KeyError``.

        Returns:
            The reward of the state the agent is in after the call
            (0 if that state has no reward entry).
        """
        # .get(): terminal states have no action entry at all
        legal_actions = self.actions.get((self.i, self.j), ())
        if action in legal_actions:
            # a legal but unknown action letter moves nowhere
            di, dj = self._DELTAS.get(action, (0, 0))
            self.i += di
            self.j += dj
        # return a reward (if any)
        return self.rewards.get((self.i, self.j), 0)

    def undo_move(self, action):
        """Reverse the displacement of ``action``.

        Raises:
            AssertionError: if the resulting position is not a known state.
        """
        # apply the opposite of what U/D/L/R normally do
        di, dj = self._DELTAS.get(action, (0, 0))
        self.i -= di
        self.j -= dj
        # raise an exception if we arrive somewhere we shouldn't be
        # should never happen
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True if the agent is in a state where no actions are possible."""
        return self.is_terminal(self.current_state())

    def all_states(self):
        """Return the set of all known states.

        Simple (and possibly incomplete) definition: every position that has
        possible next actions or that yields a reward.
        """
        return set(self.actions.keys()) | set(self.rewards.keys())


def standard_grid():
    """Build the 3x4 grid world with a +1 and a -1 terminal state.

    Layout (x = wall, s = start, number = reward on arrival):

        .  .  .  1
        .  x  . -1
        s  .  .  .

    Returns:
        A Grid whose agent starts at (2, 0), with the reward table and the
        legal actions per state already installed.
    """
    # reward for arriving at each terminal state
    terminal_rewards = {
        (0, 3): 1,
        (1, 3): -1,
    }

    # legal moves, listed row by row; the wall (1, 1) and the two terminal
    # cells deliberately have no entry
    legal_moves = dict([
        ((0, 0), ('D', 'R')),
        ((0, 1), ('L', 'R')),
        ((0, 2), ('L', 'D', 'R')),
        ((1, 0), ('U', 'D')),
        ((1, 2), ('U', 'D', 'R')),
        ((2, 0), ('U', 'R')),
        ((2, 1), ('L', 'R')),
        ((2, 2), ('L', 'R', 'U')),
        ((2, 3), ('L', 'U')),
    ])

    env = Grid(3, 4, (2, 0))
    env.set(terminal_rewards, legal_moves)
    return env


def negative_grid(step_cost=-0.1):
    """Return the standard grid with a penalty on every non-terminal state.

    Penalising each move encourages the agent to reach a terminal state in
    as few steps as possible.

    Args:
        step_cost: reward assigned to each of the nine non-terminal states.
    """
    penalised_states = (
        (0, 0), (0, 1), (0, 2),
        (1, 0), (1, 2),
        (2, 0), (2, 1), (2, 2), (2, 3),
    )
    env = standard_grid()
    # same cost for every listed state; terminal rewards are left untouched
    env.rewards.update(dict.fromkeys(penalised_states, step_cost))
    return env

In [3]:
def print_value(V, g):
    """Print the numbers in V as a text table with g.rows x g.cols cells.

    Args:
        V: dict of (row, col) -> number; missing cells are shown as 0.
        g: object exposing integer ``rows`` and ``cols`` attributes.
    """
    divider = '----------------------------'
    for row in range(g.rows):
        print(divider)
        for col in range(g.cols):
            value = V.get((row, col), 0)
            # a negative sign takes up the column otherwise used for padding
            template = '| %.2f ' if value >= 0 else '|%.2f '
            print(template % value, end='')
        print('|')
    print(divider)
    

def print_policy(P, g):
    """Print the actions in P as a text table with g.rows x g.cols cells.

    Args:
        P: dict of (row, col) -> action; missing cells are shown as a blank.
        g: object exposing integer ``rows`` and ``cols`` attributes.
    """
    divider = '-----------------'
    for row in range(g.rows):
        print(divider)
        for col in range(g.cols):
            action = P.get((row, col), ' ')
            print('| %s ' % action, end='')
        print('|')
    print(divider)

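## Temporal Difference learning: the TD(0) method
The advantage of this method is that it combines ideas from dynamic programming (bootstrapping from the current value estimates) and Monte Carlo methods (learning from sampled experience),
i.e. it overcomes the limitation of Monte Carlo, where we have to wait for an episode to finish before the value estimate (V or Q) can be updated: TD(0) updates the estimate after every step.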

## Fixed policy (one deterministic action per state, replaced by a random action with probability eps via `random_action`)

In [11]:
def random_action(a, eps=0.1, actions=None):
    """Return ``a`` most of the time and a uniformly random action otherwise.

    With probability ``1 - eps`` the given action is returned unchanged.
    Otherwise an action is drawn uniformly from ``actions``; because that
    draw may pick ``a`` again, ``a`` is returned with overall probability
    ``1 - eps + eps/len(actions)`` and every other action with probability
    ``eps/len(actions)``.

    Args:
        a: the preferred action.
        eps: probability of drawing a random action instead of ``a``.
        actions: sequence of actions to draw from.  Defaults to the
            notebook-level ``all_possible_actions`` list, which must be
            defined before the first exploratory draw.

    Returns:
        ``a`` itself, or an element of ``actions`` as returned by
        ``np.random.choice``.
    """
    if np.random.random() < (1 - eps):
        return a
    if actions is None:
        # looked up lazily so the function can be defined before the cell
        # that creates the global list has run
        actions = all_possible_actions
    return np.random.choice(actions)

In [10]:
# let's see how V(s) changes as we get further away from the reward
# TD(0) evaluation of a fixed policy on the standard grid
gamma = 0.9 # discount factor

# set different actions possible for a particular state
all_possible_actions = ['U', 'D', 'R', 'L']

# convergence threshold; kept for other cells, not used by the
# fixed-episode loop below
small_enough = 1e-3

alpha = 0.1 # TD learning rate

n_episodes = 500 # number of episodes to play
start_state = (2, 0) # every episode starts here

# seed the global NumPy RNG so that re-running this cell reproduces
# the same exploration sequence and therefore the same V
seed = 0
np.random.seed(seed)

# use the standard grid again (0 for every step) so that we can compare
# to iterative policy evaluation
grid = standard_grid()

# print rewards
print("rewards:")
print_value(grid.rewards, grid)

# state -> action
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}

# initialize V(s) = 0
states = grid.all_states()
V = {}
for s in states:
    V[s] = 0

# play the game for a fixed number of episodes
# (there is no convergence check)
for t in range(n_episodes):
    # set first state for starting position
    s = start_state
    grid.set_state(s)
    # loop until one episode is over
    while not grid.game_over():
        # follow the policy, but let random_action swap in a random action
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s_next = grid.current_state()
        # TD(0) update: V[s] = V[s] + alpha*(r + gamma*V[s'] - V[s])
        V[s] = V[s] + alpha*(r + gamma*V[s_next] - V[s])
        # set current state as next state
        s = s_next

print('policy:')
print_policy(policy, grid)

print("values:")
print_value(V, grid)

rewards:
----------------------------
| 0.00 | 0.00 | 0.00 | 1.00 |
----------------------------
| 0.00 | 0.00 | 0.00 |-1.00 |
----------------------------
| 0.00 | 0.00 | 0.00 | 0.00 |
----------------------------
policy:
-----------------
| R | R | R |   |
-----------------
| U |   | R |   |
-----------------
| U | R | R | U |
-----------------
values:
----------------------------
| 0.77 | 0.89 | 1.00 | 0.00 |
----------------------------
| 0.69 | 0.00 |-0.58 | 0.00 |
----------------------------
| 0.53 |-0.16 |-0.39 |-0.76 |
----------------------------
