# Setup

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

# Shared environment instance; the agent class and the cells below read it as a global.
env = gym.make(id='CliffWalking-v0')

# Value Iteration Agent

In [3]:
class ValueIteration:
  """Value-iteration solver for a tabular environment with deterministic transitions.

  The environment must expose ``P[state][action]`` as a list of
  ``(prob, next_state, reward, done)`` tuples and ``observation_space.n``.
  Only the first tuple of each list is read, i.e. every state-action pair is
  assumed to lead to exactly one next state. The last state index is treated
  as terminal: its value is pinned to 0.

  Args:
    gamma: discount factor applied to the value of the next state.
    theta: approximation error threshold; sweeps stop once no state value
      changes by more than this amount.
    environment: environment to solve. When omitted, the notebook-level
      ``env`` global is used, so ``ValueIteration()`` keeps working.
  """

  def __init__(self, gamma=1, theta=0.0001, environment=None):
    self.gamma = gamma
    #theta is an approximation error threshold
    self.theta = theta
    self._environment = environment

  @property
  def environment(self):
    """Environment being solved: the injected one, else the notebook-level ``env``."""
    # Resolved lazily so that rebinding the global ``env`` is still picked up,
    # exactly as when the methods referenced ``env`` directly.
    return self._environment if self._environment is not None else env

  def _action_values(self, V, state):
    """Return the one-step lookahead value of every action available in ``state``."""
    transitions = self.environment.P[state]
    values = []
    for action in range(len(transitions)):
      # as the next state is deterministic for a state-action pair,
      # only the first (and only) transition tuple is needed
      _, next_state, reward, _ = transitions[action][0]
      values.append(reward + self.gamma * V[next_state])
    return values

  def best_action(self, V, state):
    """Return the index of the greedy action in ``state`` under values ``V``.

    Ties are broken in favour of the lowest action index (``np.argmax``).
    """
    return np.argmax(self._action_values(V, state))

  def update_V(self, V, state):
    """Return the Bellman-optimal backup for ``state``: the value of its greedy action."""
    action_values = self._action_values(V, state)
    return action_values[np.argmax(action_values)]

  def learning(self, max_iterations=100):
    """Run synchronous value-iteration sweeps and return the state values.

    Args:
      max_iterations: upper bound on the number of sweeps over all states.

    Returns:
      A list with one value per state; the last (terminal) entry is 0.
    """
    n_states = self.environment.observation_space.n
    # Keep V a plain list from the start so the return type does not depend
    # on how many sweeps were executed.
    V = np.random.rand(n_states).tolist()
    V[-1] = 0

    for _ in range(max_iterations):
      # Every backup reads the previous sweep's values (synchronous update).
      V_new = [self.update_V(V, state) for state in range(n_states)]
      # Pin the terminal state *before* measuring the change; otherwise its
      # raw backup always differs from 0 and convergence is never detected.
      V_new[-1] = 0
      delta = max(abs(new - old) for new, old in zip(V_new, V))
      V = V_new
      if delta <= self.theta:
        break
    return V

  def get_policy(self, V):
    """Return the greedy policy for ``V`` as a list of action indices, one per state."""
    n_states = self.environment.observation_space.n
    return [int(self.best_action(V, state)) for state in range(n_states)]

# Training and extracting the best policy

In [4]:
# Train with the agent's default hyperparameters, spelled out here for readability.
v1 = ValueIteration(gamma=1, theta=0.0001)
V = v1.learning(max_iterations=100)


In [5]:
# Show the state values returned by learning(), one entry per state index.
V

[-14,
 -13,
 -12,
 -11,
 -10,
 -9,
 -8,
 -7,
 -6,
 -5,
 -4,
 -3,
 -13,
 -12,
 -11,
 -10,
 -9,
 -8,
 -7,
 -6,
 -5,
 -4,
 -3,
 -2,
 -12,
 -11,
 -10,
 -9,
 -8,
 -7,
 -6,
 -5,
 -4,
 -3,
 -2,
 -1,
 -13,
 -12,
 -11,
 -10,
 -9,
 -8,
 -7,
 -6,
 -5,
 -4,
 -1,
 0]

In [6]:
# Greedy action for every state. Reusing the agent's own one-step lookahead
# keeps the discount factor and tie-breaking identical to training, instead of
# re-implementing the backup here with a hard-coded gamma.
best_action_list = [
  v1.best_action(V, state)
  for state in range(env.observation_space.n)
]
    

In [7]:
import pandas as pd
# Glyph for each action index, in order: up, right, down, left.
actions = ["^", ">", "v", "<"]
policy_arrows = list(map(actions.__getitem__, best_action_list))
# Lay the per-state arrows out on the 4 x 12 grid for display.
pd.DataFrame(np.asarray(policy_arrows).reshape((4, 12)))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,>,>,>,>,>,>,>,>,>,>,>,v
1,>,>,>,>,>,>,>,>,>,>,>,v
2,>,>,>,>,>,>,>,>,>,>,>,v
3,^,^,^,^,^,^,^,^,^,^,>,>
