In [1]:
import numpy as np

import import_ipynb
from grid_world import ACTION_SPACE, windy_grid
from tabulate import tabulate

importing Jupyter notebook from grid_world.ipynb


In [6]:
# Convergence threshold: the evaluation loop below stops once the largest
# change in any V(s) during a sweep drops below this value.
SMALL_ENOUGH = 1e-3

In [7]:
def print_values(V, g) :
    """Print a value function laid out as a grid table.

    V maps (row, col) tuples to values; g supplies the table dimensions
    through its `rows` and `cols` attributes. Positions that have no entry
    in V are shown as 0. Floats are rendered with three decimals.
    """
    value_rows = [
        [V.get((r, c), 0) for c in range(g.cols)]
        for r in range(g.rows)
    ]
    print(tabulate(value_rows, tablefmt="grid", floatfmt=".3f"))
        
def print_policy(P, g) :
    """Print a policy laid out as a grid table.

    P maps (row, col) tuples to whatever should be displayed for that state
    (e.g. an action or an action-probability dict); g supplies the table
    dimensions through its `rows` and `cols` attributes. Positions that have
    no entry in P are shown as a single space.
    """
    layout = []
    for r in range(g.rows) :
        layout.append([P.get((r, c), ' ') for c in range(g.cols)])
    print(tabulate(layout, tablefmt="grid"))

In [9]:
if __name__ == '__main__' :
    # Iterative policy evaluation: repeatedly apply the Bellman expectation
    # update for a fixed stochastic policy until V stops changing.

    # key is (s, a, s')
    # transition_probs[(s, a, s')] = p(s' | s, a)
    # any key not present will be considered to have a prob of 0
    transition_probs = {}

    # deterministic rewards, keyed like the transitions:
    # rewards[(s, a, s')] = grid.rewards entry for the landing state s' (0 if absent)
    rewards = {}
    grid = windy_grid()
    for (s, a), v in grid.probs.items() : # v = {s2 : p}
        for s2, p in v.items() :
            transition_probs[(s, a, s2)] = p
            rewards[(s, a, s2)] = grid.rewards.get(s2, 0)

    ### PROBABILISTIC POLICY ###
    # policy[s] = {action : probability of taking that action in s};
    # actions not listed for a state are treated as probability 0 below
    policy = {
        (2, 0) : {'U' : 0.5, 'R' : 0.5},
        (1, 0) : {'U' : 1.0},
        (0, 0) : {'R' : 1.0},
        (0, 1) : {'R' : 1.0},
        (0, 2) : {'R' : 1.0},
        (1, 2) : {'U' : 1.0},
        (2, 1) : {'R' : 1.0},
        (2, 2) : {'U' : 1.0},
        (2, 3) : {'L' : 1.0},
    }
    print_policy(policy, grid)

    # initialize V(s) = 0
    V = {}
    for s in grid.all_states() :
        V[s] = 0

    gamma = 0.9 # discount factor
    it = 0
    # repeat until convergence
    while True:
        biggest_change = 0
        # every s iterated here must have an entry in `policy` (policy[s] is indexed directly)
        # NOTE(review): grid.actions presumably holds only non-terminal states -- confirm in grid_world
        for s in grid.actions :
            V_old = V[s]
            V_new = 0 # accumulates the expected return of s under the policy
            for a in ACTION_SPACE :
                # pi(a | s) does not depend on s2, so look it up once per action
                # instead of once per (action, next state) pair
                action_prob = policy[s].get(a, 0)
                for s2 in grid.all_states() :
                    # reward for this transition (0 when the transition is not defined)
                    r = rewards.get((s, a, s2), 0)

                    V_new += action_prob * transition_probs.get((s, a, s2), 0) * (r + gamma * V[s2])
            # V is updated in place, so states visited later in this same sweep
            # already see the new value of s
            V[s] = V_new
            biggest_change = max(biggest_change, np.abs(V_old - V_new))
        print(f'Iter {it}, biggest change {biggest_change}')
        print_values(V, grid)
        it += 1
        # stop once no state changed by more than the threshold during the sweep
        if biggest_change < SMALL_ENOUGH :
            break
    print('\n\n')

+----------------------+------------+------------+------------+
| {'R': 1.0}           | {'R': 1.0} | {'R': 1.0} |            |
+----------------------+------------+------------+------------+
| {'U': 1.0}           |            | {'U': 1.0} |            |
+----------------------+------------+------------+------------+
| {'U': 0.5, 'R': 0.5} | {'R': 1.0} | {'U': 1.0} | {'L': 1.0} |
+----------------------+------------+------------+------------+
Iter 0, biggest change 1.0
+-------+-------+--------+--------+
| 0.000 | 0.000 |  1.000 |  0.000 |
+-------+-------+--------+--------+
| 0.000 | 0.000 | -0.050 |  0.000 |
+-------+-------+--------+--------+
| 0.000 | 0.000 | -0.045 | -0.040 |
+-------+-------+--------+--------+
Iter 1, biggest change 0.9
+-------+--------+--------+--------+
| 0.000 |  0.900 |  1.000 |  0.000 |
+-------+--------+--------+--------+
| 0.000 |  0.000 | -0.050 |  0.000 |
+-------+--------+--------+--------+
| 0.000 | -0.040 | -0.045 | -0.040 |
+-------+--------+------