<div style="display:float">
  <h1 style="margin-top: 36px; font-size: 32px; float:left">Reinforcement Learning: Value Iteration</h1>
    <img style="vertical-align:middle; float:right" src="http://gntlearning.com/wp-content/uploads/2021/05/Small_Logo-278x64.jpg" width=250px>
</div>

In [1]:
import numpy as np
import gym

In [2]:
# Create the FrozenLake environment and draw its starting grid.
env_id = "FrozenLake-v0"
env = gym.make(env_id)
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


## AI Gym Basics

In [3]:
# Take up to `max_steps` uniformly random actions, rendering the grid after
# each one, and stop early as soon as the environment reports the episode is over.
max_steps = 5
env.reset()
done, steps_taken = False, 0
while not done and steps_taken < max_steps:
    action = env.action_space.sample()
    # step() returns (next state, reward, done flag, info); only `done` is used here.
    next_state, reward, done, info = env.step(action)
    env.render()
    steps_taken += 1

  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [9]:
# Look up the transition table entry for one (state, action) pair.
# Each tuple is unpacked later in this notebook as
# (probability, next state, reward, flag); the flag is a boolean,
# presumably the episode-termination marker.
state, action = 1, 1
env.P[state][action]

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 5, 0.0, True),
 (0.3333333333333333, 2, 0.0, False)]

## Value Iteration

$s$ is a state

$a$ is an action taken from state $s$

$s'$ is the next state

$a'$ is an action taken from state $s'$  (not used today, just common notation)

$V(s)$ is the value of a state

$T(s,a,s')$ is the probability of going to state $s'$ when taking action $a$ from state $s$

$R(s,a,s')$ is the reward for going from $s$ to $s'$ via $a$

### $V(s) = \max_a \sum_{s'} T(s,a,s')\left[ R(s,a,s') + \gamma  V(s')  \right]$

In [10]:
# Problem dimensions are read from the environment's discrete spaces.
num_states, num_actions = env.observation_space.n, env.action_space.n
# Discount factor (the gamma in the value-iteration equation above).
gamma = 0.9

In [12]:
# Value iteration with a fixed number of sweeps, updating V in place so that
# states later in a sweep already see the values refreshed earlier in the same sweep.
#
# env.P[s][a] lists the possible outcomes of taking action a in state s as
# (probability, next state, reward, flag) tuples, so summing over that list is
# the sum over s' of T(s,a,s') * (R(s,a,s') + gamma * V(s')) from the equation above.
n_sweeps = 20
V = np.zeros(num_states)
for sweep in range(n_sweeps):
    for s in range(num_states):
        # Expected return of every action available in state s.
        q = [
            sum(prob * (rew + gamma * V[nxt]) for prob, nxt, rew, _flag in env.P[s][a])
            for a in range(num_actions)
        ]
        # Bellman optimality backup: keep the best action's value.
        V[s] = max(q)
V.reshape(4,4)

array([[0.06091716, 0.05563938, 0.07077423, 0.05193238],
       [0.08565174, 0.        , 0.11056674, 0.        ],
       [0.14111297, 0.24505789, 0.29811122, 0.        ],
       [0.        , 0.37833073, 0.63825638, 0.        ]])

In [14]:
# Value iteration run until convergence: every sweep backs up all states from a
# frozen copy of the previous values (V_old), and sweeping stops once the
# largest change in any state's value is no more than `tol`.
# The greedy action found during each backup is recorded in `policy`.
tol = 1e-3
V = np.zeros(num_states)
policy = np.zeros(num_states)
delta = np.inf  # guarantees that at least one sweep is performed
while delta > tol:
    V_old = V.copy()
    for s in range(num_states):
        # Expected return of each action, evaluated against the previous sweep's values.
        q = np.array([
            sum(prob * (rew + gamma * V_old[nxt]) for prob, nxt, rew, _flag in env.P[s][a])
            for a in range(num_actions)
        ])
        best = np.argmax(q)  # first maximiser wins ties
        policy[s] = best
        V[s] = q[best]
    delta = np.abs(V - V_old).max()

print(policy.reshape(4,4))
V.reshape(4,4)

[[0. 3. 0. 3.]
 [0. 0. 0. 0.]
 [3. 1. 0. 0.]
 [0. 2. 1. 0.]]


array([[0.06253203, 0.05606969, 0.07051342, 0.05159888],
       [0.08603505, 0.        , 0.11007757, 0.        ],
       [0.14066756, 0.24422494, 0.29731176, 0.        ],
       [0.        , 0.3775274 , 0.6377414 , 0.        ]])

In [15]:
# Glyph for each action index 0..3 (presumably FrozenLake's Left, Down, Right, Up
# ordering - confirm against the gym documentation).
action_names = ['<', 'v', '>', '^']
env.render()
# policy stores action indices as floats; cast to int and use them to index the glyphs.
np.array(action_names)[policy.astype(int)].reshape(4,4)

  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


array([['<', '^', '<', '^'],
       ['<', '<', '<', '<'],
       ['^', 'v', '<', '<'],
       ['<', '>', 'v', '<']], dtype='<U1')