In [14]:
import numpy as np

import import_ipynb
from grid_world import ACTION_SPACE, standard_grid
from tabulate import tabulate

In [2]:
# Convergence threshold: the evaluation loop stops once the largest per-state
# value change observed in a sweep falls below this number.
SMALL_ENOUGH = 0.001

In [26]:
def print_values(V, g):
    """Print the state values as a grid-formatted table.

    Args:
        V: mapping from (row, col) state tuples to values; positions with no
           entry are shown as 0.
        g: grid object exposing integer ``rows`` and ``cols`` attributes.

    The table is rendered by ``tabulate`` with ``tablefmt="grid"`` and
    ``floatfmt=".3f"`` and written to stdout.
    """
    # One inner list per grid row, one cell per column.
    cells = [
        [V.get((r, c), 0) for c in range(g.cols)]
        for r in range(g.rows)
    ]
    print(tabulate(cells, tablefmt="grid", floatfmt=".3f"))
        
def print_policy(P, g):
    """Print the policy as a grid-formatted table.

    Args:
        P: mapping from (row, col) state tuples to an action label; positions
           with no entry are shown as a single space.
        g: grid object exposing integer ``rows`` and ``cols`` attributes.

    The table is rendered by ``tabulate`` with ``tablefmt="grid"`` and
    written to stdout.
    """
    # One inner list per grid row, one cell per column.
    cells = [
        [P.get((r, c), ' ') for c in range(g.cols)]
        for r in range(g.rows)
    ]
    print(tabulate(cells, tablefmt="grid"))

In [29]:
if __name__ == '__main__':
    # Build the environment first; the model below is tabulated from it.
    grid = standard_grid()

    # Sparse model of the environment:
    #   transition_probs[(s, a, s')] = p(s' | s, a)   (absent key => probability 0)
    #   rewards[(s, a, s')]          = deterministic reward for that transition,
    #                                  copied from grid.rewards[s']
    #                                  (absent key => reward 0)
    transition_probs = {}
    rewards = {}
    for row in range(grid.rows):
        for col in range(grid.cols):
            s = (row, col)
            if grid.is_terminal(s):
                # states the grid reports as terminal get no outgoing entries
                continue
            for a in ACTION_SPACE:
                s2 = grid.get_next_state(s, a)
                step = (s, a, s2)
                # each recorded (s, a) leads to exactly one s', with probability 1
                transition_probs[step] = 1
                if s2 in grid.rewards:
                    rewards[step] = grid.rewards[s2]

    ### FIXED POLICY ###
    # Maps a state (row, col) to the single action taken in that state.
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        (2, 1): 'R',
        (2, 2): 'U',
        (2, 3): 'L',
    }
    print_policy(policy, grid)

    # Start every state's value estimate at 0.
    V = {state: 0 for state in grid.all_states()}

    gamma = 0.9  # discount factor
    it = 0       # number of completed sweeps

    # Iterative policy evaluation: sweep over the states in grid.actions,
    # updating V in place, until the largest change in a sweep is below
    # SMALL_ENOUGH.
    converged = False
    while not converged:
        biggest_change = 0
        for s in grid.actions:
            V_old = V[s]
            chosen_action = policy.get(s)
            V_new = 0  # accumulates the expected value of s under the policy
            for a in ACTION_SPACE:
                # the policy is deterministic: probability 1 for its action, else 0
                action_prob = 1 if chosen_action == a else 0
                for s2 in grid.all_states():
                    step = (s, a, s2)
                    r = rewards.get(step, 0)
                    weight = action_prob * transition_probs.get(step, 0)
                    V_new += weight * (r + gamma * V[s2])
            V[s] = V_new

            # track the largest absolute update seen in this sweep
            delta = np.abs(V_old - V_new)
            if delta > biggest_change:
                biggest_change = delta

        print(f'Iter {it}, biggest change {biggest_change}')
        print_values(V, grid)
        it += 1
        converged = biggest_change < SMALL_ENOUGH
    print('\n\n')

+---+---+---+---+
| R | R | R |   |
+---+---+---+---+
| U |   | U |   |
+---+---+---+---+
| U | R | U | L |
+---+---+---+---+
Iter 0, biggest change 1.0
+-------+-------+-------+-------+
| 0.000 | 0.000 | 1.000 | 0.000 |
+-------+-------+-------+-------+
| 0.000 | 0.000 | 0.900 | 0.000 |
+-------+-------+-------+-------+
| 0.000 | 0.000 | 0.810 | 0.729 |
+-------+-------+-------+-------+
Iter 1, biggest change 0.9
+-------+-------+-------+-------+
| 0.000 | 0.900 | 1.000 | 0.000 |
+-------+-------+-------+-------+
| 0.000 | 0.000 | 0.900 | 0.000 |
+-------+-------+-------+-------+
| 0.000 | 0.729 | 0.810 | 0.729 |
+-------+-------+-------+-------+
Iter 2, biggest change 0.81
+-------+-------+-------+-------+
| 0.810 | 0.900 | 1.000 | 0.000 |
+-------+-------+-------+-------+
| 0.729 | 0.000 | 0.900 | 0.000 |
+-------+-------+-------+-------+
| 0.656 | 0.729 | 0.810 | 0.729 |
+-------+-------+-------+-------+
Iter 3, biggest change 0
+-------+-------+-------+-------+
| 0.810 | 0.900 | 1