# Set up

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('CliffWalking-v0')

In [6]:
env.P[1][1]

[(1.0, 2, -1, False)]

# Value Iteration Agent

In [2]:
class ValueIteration:
  def __init__(self,gamma=1,theta=0.0001):
    self.gamma = gamma
    #theta is an approximation error threshold
    self.theta = theta

  def best_action(self, V, state):
    action_value_list = []
    for action in range(len(env.P[state])):
      prob, next_state, r, _ = env.P[state][action][0] #as the next state is deterministic for a state-action pair
      action_value = r + self.gamma * V[next_state]
      action_value_list.append(action_value)
    best_action = np.argmax(action_value_list)
    return best_action

  def update_V(self, V, state):
    best_action = self.best_action(V, state)
    prob, next_state, r, _ = env.P[state][best_action][0]
    new_state_value = r + self.gamma * V[next_state]
    if np.abs(new_state_value - V[state]) < self.theta: #NEED TO UNDERSTAND MAX IN PSEUDO CODE
      V[state] = new_state_value
      V[-1] = 0
    return V
  
  def learning(self, max_iterations=100):
    V = np.random.rand(env.observation_space.n)
    V[-1] = 0 

    for i in range(max_iterations):
      for state in range(env.nS):
        V = self.update_V(V, state)

    return V
  
  #ADD GETTING BEST POLICY METHOD HERE

# Training and getting best policy

In [3]:
v1 = ValueIteration()
V = v1.learning()


In [6]:
V

array([0.44171836, 0.99730245, 0.95671824, 0.85381545, 0.20956625,
       0.68512131, 0.39925545, 0.12818931, 0.66283493, 0.0019761 ,
       0.65635002, 0.63798759, 0.42473372, 0.72875108, 0.87698964,
       0.88268443, 0.83334779, 0.15513089, 0.94874593, 0.50024069,
       0.12219702, 0.28564563, 0.29649485, 0.95859908, 0.45430731,
       0.45441739, 0.71811067, 0.73073689, 0.87797472, 0.13172178,
       0.60180999, 0.32522459, 0.75777332, 0.9349663 , 0.83380454,
       0.7881579 , 0.69176861, 0.89605017, 0.73675699, 0.70133571,
       0.54588036, 0.91683476, 0.29440449, 0.08115225, 0.13427836,
       0.74687797, 0.39734071, 0.        ])

In [8]:
best_action_list = []
for state in range(env.nS):
  action_value_list = []
  for action in range(len(env.P[state])):
    prob, next_state, r, _ = env.P[state][action][0] 
    action_value = r + 1 * V[next_state]
    action_value_list.append(action_value)
    best_action = np.argmax(action_value_list)
  best_action_list.append(best_action)
    

In [11]:
import pandas as pd
actions = ["^", ">", "v", "<"]
policy_arrows = [actions[i] for i in best_action_list]
pd.DataFrame(np.array(policy_arrows).reshape(4,12))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,>,^,<,<,<,^,v,>,^,<,^,v
1,>,^,^,<,<,>,v,<,v,v,>,>
2,v,^,^,^,^,<,^,>,>,>,<,^
3,v,<,^,^,^,^,^,^,^,^,^,^
