## What we implement

$$
v(s) = R_s + \gamma \sum_{s' \in S} P_{ss'} \, v(s')
$$

In [None]:


!pip install -e git+https://github.com/star-ai/rl-environments.git#egg=rlenvs
!pip install gym

In [31]:
!pip install -e git+https://github.com/star-ai/rl-environments.git#egg=rlenvs
!pip install gym

Obtaining rlenvs from git+https://github.com/star-ai/rl-environments.git#egg=rlenvs
  Cloning https://github.com/star-ai/rl-environments.git to ./src/rlenvs
  Running command git clone -q https://github.com/star-ai/rl-environments.git /Users/alekseyleshchankin/Code/src/rlenvs
  Resolved https://github.com/star-ai/rl-environments.git to commit 0ae7fca7685ff5694a94db97798b9f0259eb4c53
Installing collected packages: rlenvs
  Running setup.py develop for rlenvs
Successfully installed rlenvs-0.1
You should consider upgrading via the '/Users/alekseyleshchankin/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
from IPython.core.debugger import set_trace
import numpy as np
import pprint

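# NOTE: the import below relies on the editable pip install having cloned rlenvs
# into ./src/rlenvs; it can break unexpectedly, e.g. if that checkout is missing.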
from src.rlenvs.rlenvs.envs.gridworld import GridworldEnv

In [2]:
# Create the gridworld environment and a pretty-printer for displaying arrays.
env = GridworldEnv()
pp = pprint.PrettyPrinter(indent=2)
# Uncomment to inspect the transition table (env.P[state][action] -> transitions):
# print(env.P)

In [8]:
#  Calculate state value given policy, state, and current state value function.
def calculate_state_value(policy, state, env, V, discount_factor):
    """Compute the one-step backed-up value of `state` under `policy`.

    For every action index of ``policy[state]`` and every
    ``(prob, next_state, reward, done)`` entry in ``env.P[state][action]``,
    accumulates ``action_prob * prob * (reward + discount_factor * V[next_state])``.

    Args:
        policy: indexable by state; ``policy[state]`` iterates over the
            per-action probabilities.
        state: index of the state being evaluated.
        env: object exposing ``P[state][action]`` as an iterable of 4-tuples.
        V: current value estimates, indexed by next state.
        discount_factor: weight applied to the successor state's value.

    Returns:
        The accumulated expected value; 0 when ``policy[state]`` is empty.
    """
    expected_return = 0
    for action, pi_a in enumerate(policy[state]):
        # The `done` flag of each transition is not used by the backup.
        for p_next, successor, reward, _done in env.P[state][action]:
            backup_target = reward + discount_factor * V[successor]
            expected_return += pi_a * p_next * backup_target
    return expected_return

In [9]:
# Run a full sweep over states.
def run_full_sweep(policy, env, V, discount_factor):
    """Back up every state once, reading only from `V`.

    Each state's new value is computed with ``calculate_state_value`` from the
    unchanged input `V` and written into a fresh array, so updates made during
    the sweep do not influence other states in the same sweep.

    Args:
        policy: per-state action probabilities, indexed as ``policy[state]``.
        env: environment exposing ``nS`` (number of states) and ``P``.
        V: value estimates from the previous sweep, indexed by state.
        discount_factor: weight applied to successor state values.

    Returns:
        A tuple ``(new_V, delta)``: the freshly computed values as an array of
        length ``env.nS``, and the largest absolute change of any state value
        relative to `V`.
    """
    state_count = env.nS
    refreshed = np.zeros(state_count)
    largest_change = 0

    for state in range(state_count):
        backed_up = calculate_state_value(policy, state, env, V, discount_factor)
        change = np.abs(backed_up - V[state])
        # Track the running maximum; only a strictly larger change replaces it.
        if change > largest_change:
            largest_change = change
        refreshed[state] = backed_up

    return refreshed, largest_change

In [10]:
# Evaluate a policy
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """Evaluate `policy` by repeating full sweeps until the values settle.

    Starts from an all-zero value function and calls ``run_full_sweep``
    repeatedly, replacing the estimates with each sweep's result.

    Args:
        policy: per-state action probabilities, indexed as ``policy[state]``.
        env: environment exposing ``nS`` (number of states) and ``P``.
        discount_factor: weight applied to successor state values.
        theta: stop once the largest per-state change in a sweep is below this.

    Returns:
        An array of length ``env.nS`` holding the estimated state values.
    """
    values = np.zeros(env.nS)
    converged = False

    while not converged:
        values, largest_change = run_full_sweep(policy, env, values, discount_factor)
        converged = largest_change < theta

    return np.array(values)

In [11]:
# Uniform random policy: one row per state, every action equally likely.
random_policy = np.ones((env.nS, env.nA)) / env.nA

In [12]:
# Evaluate the uniform random policy with the default discount factor and threshold.
v = policy_eval(policy=random_policy, env=env)

In [13]:
# Show the evaluated state values laid out on the environment's grid.
pp = pprint.PrettyPrinter(indent=2)
print("Value Function:")
# Reshape with the environment's own grid shape rather than a hard-coded (4, 4).
pp.pprint(np.reshape(v, env.shape))

Value Function:
array([[  0.        , -13.99989315, -19.99984167, -21.99982282],
       [-13.99989315, -17.99986052, -19.99984273, -19.99984167],
       [-19.99984167, -19.99984273, -17.99986052, -13.99989315],
       [-21.99982282, -19.99984167, -13.99989315,   0.        ]])


In [14]:
# Test: the evaluated values must match the expected values for the random policy.
expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14,
                       -22, -20, -14, 0])
print('Expected')
pp.pprint(np.reshape(expected_v, (4,4)))
print()
# assert_allclose is NumPy's recommended float comparison; a purely absolute
# tolerance of 1.5e-2 corresponds to agreement to two decimal places.
np.testing.assert_allclose(v, expected_v, rtol=0, atol=1.5e-2)
print('Test passed')

Expected
array([[  0, -14, -20, -22],
       [-14, -18, -20, -20],
       [-20, -20, -18, -14],
       [-22, -20, -14,   0]])

Test passed


In [None]:
# Alternative: the same policy evaluation written as one function that updates V in place

In [46]:
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """Evaluate `policy` with in-place sweeps over all states.

    Starts from an all-zero value function. In every sweep each state's value
    is recomputed from the current estimates and written back immediately, so
    states visited later in the same sweep already see the updated values.

    Args:
        policy: per-state action probabilities; ``policy[s]`` iterates over
            the probability of each action index in state ``s``.
        env: environment exposing ``nS`` (number of states) and
            ``P[s][a]`` as an iterable of ``(prob, next_state, reward, done)``.
        discount_factor: weight applied to successor state values.
        theta: stop once the largest per-state change in a sweep is below this.

    Returns:
        An array of length ``env.nS`` holding the estimated state values.
    """
    values = np.zeros(env.nS)
    converged = False

    while not converged:
        # Largest absolute change of any state value during this sweep.
        max_change = 0
        for state in range(env.nS):
            # Expected one-step return over all actions and their transitions.
            backup = 0
            for action, pi_a in enumerate(policy[state]):
                for p_next, successor, reward, _done in env.P[state][action]:
                    backup += pi_a * p_next * (reward + discount_factor * values[successor])
            max_change = max(max_change, np.abs(backup - values[state]))
            # Write back right away: the update is in place.
            values[state] = backup
        converged = max_change < theta

    return np.array(values)

In [47]:
# Build the uniform random policy (every action equally likely in every state)
# and evaluate it with the policy_eval currently in scope.
random_policy = np.ones((env.nS, env.nA)) / env.nA
v = policy_eval(policy=random_policy, env=env)

In [48]:
# Print the value function twice: as a flat vector, then reshaped to the
# environment's grid. Each section ends with an empty line.
print("Value Function:", v, "", sep="\n")

print("Reshaped Grid Value Function:", v.reshape(env.shape), "", sep="\n")

Value Function:
[  0.         -13.99993529 -19.99990698 -21.99989761 -13.99993529
 -17.9999206  -19.99991379 -19.99991477 -19.99990698 -19.99991379
 -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569
   0.        ]

Reshaped Grid Value Function:
[[  0.         -13.99993529 -19.99990698 -21.99989761]
 [-13.99993529 -17.9999206  -19.99991379 -19.99991477]
 [-19.99990698 -19.99991379 -17.99992725 -13.99994569]
 [-21.99989761 -19.99991477 -13.99994569   0.        ]]

