In [1]:
%load_ext autoreload
%autoreload 2

from policy_evaluation import *
from policy_improvement import *
import numpy as np

# Day 5 - Dynamic Programming

Dynamic programming algorithms use a model of the environment to compute optimal policies. This is usually not possible in practical RL problems, but provides a solid theoretical foundation which more realistic methods approximate. DP algorithms are obtained by turning Bellman equations into update rules for value estimates.

## Policy Evaluation (Prediction)

* $Policy\ evaluation$ approximates the value function $v_\pi(s)$ of all states under a given policy $\pi$
* It achieves this through iterative updates of all the state values, using the Bellman equations as update rules:

$$
\begin{align}
v_{k+1}(s)&\doteq\mathbb E_\pi\left[R_{t+1}+\gamma v_k(S_{t+1})|S_t=s\right] \\
&=\sum_a\pi(a|s)\sum_{s',r}p(s',r|s,a)\left[r+\gamma v_k(s')\right]
\end{align}
$$
* $\{v_k\}$ can be shown to converge to $v_\pi$ under this $iterative\ policy\ evaluation$
* In practice, updating the state values in place, allowing updates of other states to use newer values, speeds up convergence
* As it only converges in the limit, a real implementation has to be halted, for example once $\underset{s\in\mathcal{S}}{\operatorname\max}|v_{k+1}(s)-v_k(s)|$ is smaller than some algorithm parameter $\theta$

### $Exercise\ \mathcal{4.1}$

#### In Example 4.1, if $\pi$ is the equiprobable random policy, what is $q_\pi(11,down)$? What is $q_\pi(7, down)$?

$$
\begin{align}
q_\pi(11, down)&=-1+v(terminal)&&=-1 \\
q_\pi(7, down)&=-1+v(11)=-1+(-14)&&=-15
\end{align}
$$

### $Exercise\ \mathcal{4.2}$

#### In Example 4.1, suppose a new state 15 is added to the gridworld just below state 13, and its actions, $\mathtt{left}$, $\mathtt{up}$, $\mathtt{right}$, and $\mathtt{down}$, take the agent to states 12, 13, 14, and 15, respectively. Assume that the transitions from the original states are unchanged. What, then, is $v_\pi(15)$ for the equiprobable random policy?

$$
\begin{align}
v_\pi(15)&=-1+\frac{v_\pi(12)+v_\pi(13)+v_\pi(14)+v_\pi(15)}{4} \\
\frac{3}{4}v_\pi(15)&=\frac{-4+v_\pi(12)+v_\pi(13)+v_\pi(14)}{4} \\
v_\pi(15)&=\frac{-4+v_\pi(12)+v_\pi(13)+v_\pi(14)}{3} \\
&=\frac{-4-22-20-14}{3} \\
&=-20
\end{align}
$$

#### Now suppose the dynamics of state 13 are also changed, such that action down from state 13 takes the agent to the new state 15. What is $v_\pi(15)$ for the equiprobable random policy in this case?

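Before the change of the dynamics, $v_\pi(15)=v_\pi(13)=-20$. Previously the $\mathtt{down}$ action in state 13 led back to state 13 (value $-20$); now it leads to state 15, which has the same value, so the value of the $\mathtt{down}$ action is unchanged. The previous values therefore still satisfy the Bellman equations for every state, so $v_\pi(13)$ remains unchanged and $v_\pi(15)=-20$.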

### $Exercise\ \mathcal{4.3}$

#### What are the equations analogous to (4.3), (4.4), and (4.5), but for action-value functions instead of state-value functions?

$$
\begin{align}
q_\pi(s,a)&\doteq\mathbb E_\pi\left[G_t|S_t=s,A_t=a\right] \\
&=\mathbb E_\pi\left[R_{t+1}+\gamma q_\pi(S_{t+1},A_{t+1})|S_t=s,A_t=a\right] \\
&=\sum_{s',r}p(s',r|s,a)\left[r+\gamma\sum_{a'}\pi(a'|s')q_\pi(s',a')\right] \\
q_{k+1}(s,a)&\doteq\mathbb E_\pi\left[R_{t+1}+\gamma q_k(S_{t+1},A_{t+1})|S_t=s,A_t=a\right] \\
&=\sum_{s',r}p(s',r|s,a)\left[r+\gamma\sum_{a'}\pi(a'|s')q_k(s',a')\right]
\end{align}
$$

In [2]:
# Example 4.1 gridworld: 4x4 grid in which state 0 stands in for both terminal
# corners, evaluated under the equiprobable random policy.
num_states = 15
num_actions = 4
up, right, down, left = range(num_actions)  # action index along the last axis

# successors[s, a] is the state reached by taking action a in state s.
# Start from plain grid arithmetic, then patch the borders and the terminal state.
states = np.arange(num_states)
successors = np.stack([states - 4, states + 1, states + 4, states - 1], axis=1)
successors[0] = 0                                # state 0 only loops to itself
successors[[1, 2, 3], up] = [1, 2, 3]            # top border: moving up stays put
successors[[3, 7, 11], right] = [3, 7, 11]       # right border
successors[[12, 13, 14], down] = [12, 13, 14]    # bottom border
successors[[4, 8, 12], left] = [4, 8, 12]        # left border
successors[14, right] = 0                        # bottom-right corner is folded into state 0
successors[11, down] = 0

# transitions[s, s', a] = 1 for the single successor s' of every (s, a) pair.
transitions = np.zeros((num_states, num_states, num_actions))
transitions[states[:, None], successors, np.arange(num_actions)] = 1

# Reward array has the same layout as `transitions`: -1 everywhere, except 0
# for every entry whose first index is state 0.
rewards = np.full((num_states, num_states, num_actions), -1.0)
rewards[0] = 0.0
discount = 1.0
policy = np.full((num_states, num_actions), 0.25)  # Order: up, right, down, left

evaluator = PolicyEvaluation(transitions, rewards, discount, policy)
evaluator.evaluate(min_delta=1 / 10**100)

# Lay the state values out on the 4x4 grid; the last cell repeats the value of
# state 0 because that grid position is the second terminal corner.
values = np.zeros((num_states + 1, 1))
values[:-1] = evaluator.values
values[-1] = evaluator.values[0]
values.reshape(4, 4)

array([[  0., -14., -20., -22.],
       [-14., -18., -20., -20.],
       [-20., -20., -18., -14.],
       [-22., -20., -14.,   0.]])

In [3]:
# Exercise 4.2 gridworld: the 4x4 grid of Example 4.1 plus an extra state 15
# below state 13, with `down` from state 13 leading to state 15.
# State 0 stands in for both terminal corners.
num_states = 16
num_actions = 4
up, right, down, left = range(num_actions)  # action index along the last axis

# successors[s, a] is the state reached by taking action a in state s.
# Start from plain grid arithmetic, then patch the borders and special states.
states = np.arange(num_states)
successors = np.stack([states - 4, states + 1, states + 4, states - 1], axis=1)
successors[0] = 0                                # state 0 only loops to itself
successors[[1, 2, 3], up] = [1, 2, 3]            # top border: moving up stays put
successors[[3, 7, 11], right] = [3, 7, 11]       # right border
successors[[12, 13, 14], down] = [12, 13, 14]    # bottom border
successors[[4, 8, 12], left] = [4, 8, 12]        # left border
successors[14, right] = 0                        # bottom-right corner is folded into state 0
successors[11, down] = 0
successors[13, down] = 15                        # changed dynamics: 13 now drops into 15
successors[15] = [13, 14, 15, 12]                # new state: up, right, down, left

# transitions[s, s', a] = 1 for the single successor s' of every (s, a) pair.
transitions = np.zeros((num_states, num_states, num_actions))
transitions[states[:, None], successors, np.arange(num_actions)] = 1

# Reward array has the same layout as `transitions`: -1 everywhere, except 0
# for every entry whose first index is state 0.
rewards = np.full((num_states, num_states, num_actions), -1.0)
rewards[0] = 0.0
discount = 1.0
policy = np.full((num_states, num_actions), 0.25)  # Order: up, right, down, left

evaluator = PolicyEvaluation(transitions, rewards, discount, policy)
evaluator.evaluate(min_delta=1 / 10**100)

# Display on a 5x4 grid. Grid cell 15 is the terminal corner, so it shows the
# value of state 0; the extra state 15 is drawn at cell 17, directly below
# state 13. The remaining cells of the fifth row are padding and stay 0.
values = np.zeros((num_states + 4, 1))
values[:num_states] = evaluator.values
values[num_states - 1] = evaluator.values[0]
values[17] = evaluator.values[15]
values.reshape(5, 4)

array([[  0., -14., -20., -22.],
       [-14., -18., -20., -20.],
       [-20., -20., -18., -14.],
       [-22., -20., -14.,   0.],
       [  0., -20.,   0.,   0.]])

## Policy Improvement

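* $Policy\ improvement$ constructs a new policy $\pi'$ that acts greedily with respect to the value function of the current policy: $\pi'(s)\doteq\underset{a}{\operatorname{argmax}}\sum_{s',r}p(s',r|s,a)\left[r+\gamma v_\pi(s')\right]$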

In [4]:
# Same extended gridworld as in Exercise 4.2 (extra state 15 below state 13,
# state 0 standing in for both terminal corners), this time run through one
# evaluate -> improve -> evaluate cycle starting from the equiprobable policy.
num_states = 16
num_actions = 4
up, right, down, left = range(num_actions)  # action index along the last axis

# successors[s, a] is the state reached by taking action a in state s.
# Start from plain grid arithmetic, then patch the borders and special states.
states = np.arange(num_states)
successors = np.stack([states - 4, states + 1, states + 4, states - 1], axis=1)
successors[0] = 0                                # state 0 only loops to itself
successors[[1, 2, 3], up] = [1, 2, 3]            # top border: moving up stays put
successors[[3, 7, 11], right] = [3, 7, 11]       # right border
successors[[12, 13, 14], down] = [12, 13, 14]    # bottom border
successors[[4, 8, 12], left] = [4, 8, 12]        # left border
successors[14, right] = 0                        # bottom-right corner is folded into state 0
successors[11, down] = 0
successors[13, down] = 15                        # 13 drops into the extra state
successors[15] = [13, 14, 15, 12]                # extra state: up, right, down, left

# transitions[s, s', a] = 1 for the single successor s' of every (s, a) pair.
transitions = np.zeros((num_states, num_states, num_actions))
transitions[states[:, None], successors, np.arange(num_actions)] = 1

# Reward array has the same layout as `transitions`: -1 everywhere, except 0
# for every entry whose first index is state 0.
rewards = np.full((num_states, num_states, num_actions), -1.0)
rewards[0] = 0.0
discount = 1.0
policy = np.full((num_states, num_actions), 0.25)  # Order: up, right, down, left

evaluator = PolicyImprovement(transitions, rewards, discount, policy)

# Evaluate the initial policy, apply a single improvement step, then evaluate
# the resulting policy. NOTE(review): improve_policy() presumably makes the
# policy greedy w.r.t. the current values -- confirm in policy_improvement.py.
tolerance = 1 / 10**100
evaluator.evaluate(min_delta=tolerance)
evaluator.improve_policy()
evaluator.evaluate(min_delta=tolerance)

# Display on a 5x4 grid. Grid cell 15 is the terminal corner, so it shows the
# value of state 0; the extra state 15 is drawn at cell 17, directly below
# state 13. The remaining cells of the fifth row are padding and stay 0.
values = np.zeros((num_states + 4, 1))
values[:num_states] = evaluator.values
values[num_states - 1] = evaluator.values[0]
values[17] = evaluator.values[15]
values.reshape(5, 4)

array([[ 0., -1., -2., -3.],
       [-1., -2., -3., -2.],
       [-2., -3., -2., -1.],
       [-3., -2., -1.,  0.],
       [ 0., -2.,  0.,  0.]])