# Model: GridWorld

In [4]:
# What this grid world looks like?
from __future__ import print_function

### Disadvantage: No exploration

In [5]:
print('-------------------------')
print('|     |     |     | +1  |')
print('-------------------------')
print('|     |//// |     | -1  |')
print('-------------------------')
print('|start|     |     |     |')
print('-------------------------')

-------------------------
|     |     |     | +1  |
-------------------------
|     |//// |     | -1  |
-------------------------
|start|     |     |     |
-------------------------


## Case one:  Windy GridWorld

### stocahstic transition

In [7]:
import numpy as np
from grid_world import standard_grid, negative_grid, print_values, print_policy

In [8]:
# Set parameters
SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

#### rewards

In [22]:
grid = negative_grid()
print("rewards:")
print_values(grid.rewards, grid)

rewards:
---------------------------
-0.10| -0.10| -0.10|  1.00| 
---------------------------
-0.10|  0.00| -0.10| -1.00| 
---------------------------
-0.10| -0.10| -0.10| -0.10| 


#### initializer

In [23]:
# state -> action
policy = {}
for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

# initial policy
print ("initial policy:")
print_policy(policy, grid)

# initialize V(s)
V = {}
states = grid.all_states()
for s in states:
    # V[s] = 0
    if s in grid.actions:
        V[s] = np.random.random()
    else:
        # terminal state
        V[s] = 0

initial policy:
---------------------------
  L  |   L  |   L  |      | 
---------------------------
  D  |      |   D  |      | 
---------------------------
  R  |   R  |   R  |   U  | 


#### policy iteration: Policy evaluation + policy improvement

In [25]:
while True:
    # policy evaluation step
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]
            # V(s) only has value if it's not a terminal state
            new_v = 0
            if s in policy:
                for a in ALL_POSSIBLE_ACTIONS:
                    if a == policy[s]:
                        p = 0.5
                    else:
                        p = 0.5/3
                    grid.set_state(s)
                    r = grid.move(a)
                    new_v += p*(r + GAMMA * V[grid.current_state()])
                V[s] = new_v
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < SMALL_ENOUGH:
            break

    # policy improvement step
    is_policy_converged = True
    for s in states:
        if s in policy:
            old_a = policy[s]
            new_a = None
            best_value = float('-inf')
            # loop through all possible actions to find the best current action
            for a in ALL_POSSIBLE_ACTIONS: # chosen action
                v = 0
                for a2 in ALL_POSSIBLE_ACTIONS: # resulting action
                    if a == a2:
                        p = 0.5
                    else:
                        p = 0.5/3
                    grid.set_state(s)
                    r = grid.move(a2)
                    v += p*(r + GAMMA * V[grid.current_state()])
                if v > best_value:
                    best_value = v
                    new_a = a
            policy[s] = new_a
            if new_a != old_a:
                is_policy_converged = False

    if is_policy_converged:
        break
        
print ("values:")
print_values(V, grid)
print ("policy:")
print_policy(policy, grid)

values:
---------------------------
-0.08|  0.20|  0.55|  0.00| 
---------------------------
-0.28|  0.00| -0.06|  0.00| 
---------------------------
-0.42| -0.44| -0.33| -0.57| 
policy:
---------------------------
  R  |   R  |   R  |      | 
---------------------------
  U  |      |   U  |      | 
---------------------------
  U  |   R  |   U  |   L  | 


#### conclusion: perform bad

## Case Two: General Grid World

#### rewards

In [18]:
grid = negative_grid()
print("rewards:")
print_values(grid.rewards, grid)

rewards:
---------------------------
-0.10| -0.10| -0.10|  1.00| 
---------------------------
-0.10|  0.00| -0.10| -1.00| 
---------------------------
-0.10| -0.10| -0.10| -0.10| 


#### initializer

In [19]:
policy = {}
for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

# initial policy
print ("initial policy:")
print_policy(policy, grid)

# initialize V(s)
V = {}
states = grid.all_states()
for s in states:
    # V[s] = 0
    if s in grid.actions:
        V[s] = np.random.random()
    else:
        # terminal state
        V[s] = 0

initial policy:
---------------------------
  D  |   U  |   L  |      | 
---------------------------
  U  |      |   R  |      | 
---------------------------
  R  |   D  |   R  |   U  | 


#### Policy Iteration

In [20]:
while True:
    # policy evaluation step
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]
            # V(s) only has value if it's not a terminal state
            new_v = 0
            if s in policy:
                for a in ALL_POSSIBLE_ACTIONS:
                    if a == policy[s]:
                        p = 0.5
                    else:
                        p = 0.5/3
                    grid.set_state(s)
                    r = grid.move(a)
                    new_v += p*(r + GAMMA * V[grid.current_state()])
                V[s] = new_v
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < SMALL_ENOUGH:
            break

    # policy improvement step
    is_policy_converged = True
    for s in states:
        if s in policy:
            old_a = policy[s]
            new_a = None
            best_value = float('-inf')
            # loop through all possible actions to find the best current action
            for a in ALL_POSSIBLE_ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state()]
                if v > best_value:
                    best_value = v
                    new_a = a
            policy[s] = new_a
            if new_a != old_a:
                is_policy_converged = False

    if is_policy_converged:
        break
        
print ("values:")
print_values(V, grid)
print ("policy:")
print_policy(policy, grid)

values:
---------------------------
-0.08|  0.20|  0.55|  0.00| 
---------------------------
-0.28|  0.00| -0.06|  0.00| 
---------------------------
-0.42| -0.44| -0.33| -0.57| 
policy:
---------------------------
  R  |   R  |   R  |      | 
---------------------------
  U  |      |   U  |      | 
---------------------------
  U  |   R  |   U  |   L  | 


#### conclusion: much better