In [1]:
import numpy as np

## environment class

In [8]:
class Grid:
    """Gridworld environment that tracks the position of a single agent.

    The position is kept as a row index ``i`` and a column index ``j``.
    Rewards and legal actions are not passed to the constructor; they are
    attached afterwards with :meth:`set`.
    """

    # (row delta, column delta) produced by each recognised action
    _STEP = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    def __init__(self, rows, cols, start):
        """Store the grid dimensions and place the agent at ``start``.

        rows, cols -- grid dimensions (stored, not otherwise used by this class)
        start      -- indexable (row, col) giving the initial position
        """
        self.rows, self.cols = rows, cols
        self.i, self.j = start[0], start[1]

    def set(self, rewards, actions):
        """Attach the reward table and the legal-action table.

        rewards -- dict mapping (row, col) -> reward for arriving there
        actions -- dict mapping (row, col) -> collection of legal actions
        """
        self.rewards, self.actions = rewards, actions

    def set_state(self, s):
        """Teleport the agent to position ``s`` given as (row, col)."""
        self.i, self.j = s[0], s[1]

    def current_state(self):
        """Return the agent position as a (row, col) tuple."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in the action table."""
        return s not in self.actions

    def move(self, action):
        """Apply ``action`` if it is legal here and return the resulting reward.

        An action that is not listed for the current position leaves the
        agent where it is. The returned value is the reward registered for
        the position the agent occupies after the call, or 0 if none is.
        """
        legal_here = self.actions[(self.i, self.j)]
        if action in legal_here:
            d_row, d_col = self._STEP.get(action, (0, 0))
            self.i += d_row
            self.j += d_col
        return self.rewards.get((self.i, self.j), 0)

    def undo_move(self, action):
        """Reverse the displacement of ``action`` (no legality check).

        Raises AssertionError if the agent ends up on a position that is
        not part of :meth:`all_states`.
        """
        d_row, d_col = self._STEP.get(action, (0, 0))
        # stepping back means applying the opposite displacement
        self.i -= d_row
        self.j -= d_col
        # landing outside the known states should never happen
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True when no action is available at the current position."""
        return self.is_terminal(self.current_state())

    def all_states(self):
        """Return the set of every position that has actions or a reward.

        Simple but possibly incomplete: a position is only known if it
        appears in the action table or in the reward table.
        """
        return set(self.actions).union(self.rewards)


def standard_grid():
    """Build the 3x4 gridworld with its terminal rewards and legal moves.

    Layout (x = wall, s = start, numbers = reward for arriving there):

        .  .  .  5
        .  x  . -5
        s  .  .  .

    Returns a Grid positioned at the start cell (2, 0).
    """
    # reward received on arriving at each terminal cell
    terminal_rewards = {(0, 3): 5, (1, 3): -5}

    # legal moves per non-terminal cell; cells missing here are terminal
    # (or the wall at (1, 1))
    legal_moves = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
    }

    world = Grid(3, 4, (2, 0))
    world.set(terminal_rewards, legal_moves)
    return world


def negative_grid(step_cost=-0.1):
    """Return the standard grid with ``step_cost`` as reward on every non-terminal cell.

    Penalising each move pushes the agent towards reaching a terminal
    state in as few moves as possible.
    """
    world = standard_grid()
    non_terminal_cells = (
        (0, 0), (0, 1), (0, 2),
        (1, 0), (1, 2),
        (2, 0), (2, 1), (2, 2), (2, 3),
    )
    for cell in non_terminal_cells:
        world.rewards[cell] = step_cost
    return world

In [3]:
def print_value(V, g):
    """Print the value table ``V`` as a grid of ``g.rows`` x ``g.cols`` cells.

    V -- dict mapping (row, col) -> value; missing cells are shown as 0
    g -- object exposing ``rows`` and ``cols``
    """
    divider = '----------------------------'
    for row in range(g.rows):
        print(divider)
        cells = []
        for col in range(g.cols):
            value = V.get((row, col), 0)
            # a negative sign takes up the slot of the leading space
            template = '| %.2f ' if value >= 0 else '|%.2f '
            cells.append(template % value)
        print(''.join(cells) + '|')
    print(divider)
    

def print_policy(P, g):
    """Print the policy ``P`` as a grid of ``g.rows`` x ``g.cols`` cells.

    P -- dict mapping (row, col) -> action; missing cells are shown blank
    g -- object exposing ``rows`` and ``cols``
    """
    divider = '-----------------'
    for row in range(g.rows):
        print(divider)
        cells = []
        for col in range(g.cols):
            chosen = P.get((row, col), ' ')
            cells.append('| %s ' % chosen)
        print(''.join(cells) + '|')
    print(divider)

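## Non-deterministic (windy) approach
Here the environment does not always let the agent carry out the action chosen by its policy: because of external factors, a different action may be executed instead. This is called the windy gridworld problem. In the code below, the chosen action is executed with probability 0.5 and each of the other three actions with probability 0.5/3. <br>
We will also work on improving the policy here.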

In [13]:
# Convergence threshold for policy evaluation: the evaluation loop stops once
# the largest change of V(s) within one sweep drops below this value.
SMALL_ENOUGH = 1e-3

# discount factor; lets us see how V(s) shrinks further away from the reward
gamma = 0.9

# every action the agent may attempt in a state
all_possible_actions = ['U', 'D', 'R', 'L']

# Use the grid in which every non-terminal state carries a negative reward.
# With a step cost of -1 and terminal rewards of only +1 / -1, the agent can
# prefer ending quickly in the losing state, because that yields a higher
# total reward than the longer walk to the winning state. The terminal
# rewards therefore have to dominate the step cost, which is why
# standard_grid uses +5 / -5.
grid = negative_grid(step_cost=-1)

# States are positions (i, j): there is a single "game piece" that occupies
# exactly one position at a time.
states = grid.all_states()


def _windy_action_value(grid, V, s, chosen_action):
    """Return the expected value of choosing ``chosen_action`` in state ``s``.

    The environment is windy: the chosen action is executed with
    probability 0.5 and each of the other actions with probability 0.5/3.
    For every action that may actually be executed, the agent is placed on
    ``s``, moved, and the reward plus the discounted value of the state it
    lands on is accumulated, weighted by that probability.

    Reads the module-level ``all_possible_actions`` and ``gamma``. Leaves
    ``grid`` positioned on the successor of the last action tried.
    """
    expected = 0
    for executed_action in all_possible_actions:
        if executed_action == chosen_action:
            p = 0.5
        else:
            p = 0.5/3
        grid.set_state(s)
        # move with the action that is actually executed, not the chosen one
        r = grid.move(executed_action)
        expected += p*(r + gamma * V[grid.current_state()])
    return expected


# start from a random policy for every non-terminal state
policy = {}
for s in grid.actions.keys():
    policy[s] = np.random.choice(all_possible_actions)

print('initial policy:')
print_policy(policy, grid)

# initialize V(s) = 0
V = {}
for s in states:
    V[s] = 0

print("values for initial policy:")
print_value(V, grid)

# Policy iteration: alternate evaluation and improvement until the policy
# stops changing; once it does, V is constant as well.
while True:
    # policy evaluation: sweep the states until V converges for this policy
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]
            # terminal states are not in the policy and keep V(s) = 0
            if s in policy:
                V[s] = _windy_action_value(grid, V, s, policy[s])
                biggest_change = max(biggest_change, np.abs(old_v-V[s]))

        if biggest_change < SMALL_ENOUGH:
            break

    # policy improvement: pick, per state, the action with the best expected
    # value under the same windy dynamics that were used for evaluation
    is_policy_converged = True
    for s in states:
        if s in policy:
            old_a = policy[s]
            new_a = None
            best_v = float('-inf')
            for action in all_possible_actions:
                v = _windy_action_value(grid, V, s, action)
                # strict comparison: on ties the earliest action wins
                if v > best_v:
                    best_v = v
                    new_a = action
            policy[s] = new_a
            if new_a != old_a:
                is_policy_converged = False
    if is_policy_converged:
        break

print('optimized policy:')
print_policy(policy, grid)

print("values for optimized policy:")
print_value(V, grid)


initial policy:
-----------------
| U | U | D |   |
-----------------
| D |   | R |   |
-----------------
| R | D | D | L |
-----------------
values for initial policy:
----------------------------
| 0.00 | 0.00 | 0.00 | 0.00 |
----------------------------
| 0.00 | 0.00 | 0.00 | 0.00 |
----------------------------
| 0.00 | 0.00 | 0.00 | 0.00 |
----------------------------
optimized policy:
-----------------
| R | R | R |   |
-----------------
| U |   | U |   |
-----------------
| U | R | U | L |
-----------------
values for optimized policy:
----------------------------
|-2.90 |-0.81 | 1.93 | 0.00 |
----------------------------
|-4.44 | 0.00 |-1.60 | 0.00 |
----------------------------
|-5.35 |-4.99 |-3.75 |-4.79 |
----------------------------
