In [1]:
import numpy as np
import grid_world_3d
import utils

In [2]:
ACTION_SPACE = (0, 1, 2, 3, 4, 5)

np.random.seed(123)

def generate_random_policy(grid):
    P = {}
    for s in grid.non_terminal_states():
        P[s] = np.random.choice(ACTION_SPACE)
    return P

def random_policy_iteration(grid, threshold = 0):
    np.random.seed(123)
    policy = {}
    while True:
        grid.set_state([2, 0, 0])
        policy = generate_random_policy(grid)
        total_reward = 0
        nsteps = 0
        while not (grid.game_over() or nsteps > 40):
            total_reward += grid.move(policy[grid.current_state()])
            nsteps += 1
        if total_reward > threshold:
            break
    return policy, total_reward

In [3]:
def best_action_value(grid, V, s, gamma = 0.9):
    action = None
    val = float('-inf')
    grid.set_state(s)
    
    for a in ACTION_SPACE:
        transitions = grid.get_transition_probs(a)
        v = 0
        r = 0
        for (P, rew, state) in transitions:
            r += P * rew
            v += P * V[state]
        v = r + gamma * v
        if v > val:
            val = v
            action = a
    
    return action, val

def value_iteration(grid, threshold = 1e-3, gamma = 0.9):
    V = {}
    for s in grid.all_states():
        V[s] = 0
    
    while True:
        delta = 0
        for s in grid.non_terminal_states():
            val = V[s]
            _, V[s] = best_action_value(grid, V, s, gamma)
            delta = max(delta, abs(val - V[s]))
            
        if delta < threshold:
            break
    
    policy = generate_random_policy(grid)
    for s in policy.keys():
        grid.set_state(s)
        policy[s], _ = best_action_value(grid, V, s, gamma)
    
    return V, policy

def run_episode(grid, policy):
    total_reward = 0
    grid.set_state((2, 0, 0))
    while not grid.game_over():
        total_reward += grid.move(policy[grid.current_state()])
        print(grid.current_state())
    return total_reward

In [4]:
myenv = grid_world_3d.standard_grid3D(obey_prob=1.0,step_cost=None)
good_policy, reward = random_policy_iteration(myenv)
print("Best Policy: {0} reward".format(reward))
print(run_episode(myenv, good_policy))

Best Policy: 1 reward
(1, 0, 0)
(1, 0, 1)
(1, 0, 2)
(1, 1, 2)
(0, 1, 2)
(0, 2, 2)
1


In [5]:
best_value, best_policy = value_iteration(myenv)
print(run_episode(myenv, best_policy))

(1, 0, 0)
(1, 1, 0)
(0, 1, 0)
(0, 2, 0)
(0, 2, 1)
(0, 2, 2)
1


In [6]:
mydrunkenenv = grid_world_3d.standard_grid3D(obey_prob=0.8,step_cost=-0.1)
good_policy, reward = random_policy_iteration(mydrunkenenv)
print("Best Policy: {0} reward".format(reward))
print(run_episode(mydrunkenenv, good_policy))

Best Policy: 1.1102230246251565e-16 reward
(2, 0, 0)
(2, 0, 0)
(2, 0, 0)
(2, 0, 0)
(2, 0, 0)
(1, 0, 0)
(1, 0, 0)
(0, 0, 0)
-1.7


In [7]:
best_value, best_policy = value_iteration(mydrunkenenv)
print(run_episode(mydrunkenenv, best_policy))

(2, 0, 1)
(1, 0, 1)
(1, 0, 2)
(0, 0, 2)
(0, 1, 2)
(0, 2, 2)
0.5


In [8]:
mysuicideenv = grid_world_3d.standard_grid3D(obey_prob=0.8,step_cost=-2)
good_policy, reward = random_policy_iteration(mysuicideenv, threshold = -8)
print("Best Policy: {0} reward".format(reward))
print(run_episode(mysuicideenv, good_policy))

Best Policy: -5 reward
(2, 1, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 1, 0)
(2, 1, 1)
(2, 1, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 1, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 1, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 0)
(2, 2, 

In [9]:
best_value, best_policy = value_iteration(mysuicideenv)
print(run_episode(mysuicideenv, best_policy))

(1, 0, 0)
(0, 0, 0)
-3
