<a href="https://colab.research.google.com/github/DonRoboto/ReinforcementLearning_v1/blob/main/policy_iteration_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# %pip install gymnasium  # uncomment if gymnasium is not installed in this kernel (e.g. on Colab)

In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Build the FrozenLake environment on the built-in 4x4 map, non-slippery,
# with frames rendered as RGB arrays.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=False,
    render_mode="rgb_array",
)


In [4]:
# Keep module-level handles to the environment's action and observation spaces.
action_space, observation_space = env.action_space, env.observation_space

# Show both spaces, one per line.
print(action_space, observation_space, sep="\n")

Discrete(4)
Discrete(16)


In [5]:
# Reset the environment to the start of an episode; as the cell's last
# expression, the returned (observation, info) tuple is displayed.
env.reset()

(0, {'prob': 1})

In [6]:
# Human-readable names for the four integer action codes (0..3), keyed by code.
actions = dict(enumerate(('Left', 'Down', 'Right', 'Up')))

In [7]:
# Uniform starting policy: one row per state (16) and one column per action (4),
# so every action starts with probability 1/4.
policy_probs = np.full(shape=(16, 4), fill_value=1 / 4)
policy_probs

array([[0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25]])

In [8]:
def policy(state):
    """Return the row of the global ``policy_probs`` table for ``state``.

    The row holds one probability per action.
    """
    action_distribution = policy_probs[state]
    return action_distribution

In [9]:
# Print the policy's action distribution for state 0 (at most the first four entries).
action_probabilities = policy(0)
for action, prob in enumerate(action_probabilities[:4]):
    print(f"Probability of taking action {action}: {prob}")

Probability of taking action 0: 0.25
Probability of taking action 1: 0.25
Probability of taking action 2: 0.25
Probability of taking action 3: 0.25


In [10]:
# One value estimate per state (16 in total), all initialised to zero.
state_values = np.zeros(16)
# Disabled experiment: state_values[15]=100
state_values

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
def policy_evaluation(policy_probs, state_values, theta=1e-6, gamma=0.9, transitions=None):
    """Iterative policy evaluation that updates ``state_values`` in place.

    Repeatedly sweeps over all states, replacing each state's value with the
    expected one-step return under ``policy_probs``, until the largest change
    observed in a sweep is no greater than ``theta``.

    Args:
        policy_probs: table indexed ``[state][action]`` giving the probability
            of taking each action in each state.
        state_values: 1-D array with one value estimate per state; it is
            overwritten in place and also determines how many states are swept.
        theta: convergence threshold on the largest per-sweep value change.
        gamma: discount factor applied to the next state's value.
        transitions: optional transition model laid out like
            ``env.unwrapped.P``: ``transitions[state][action]`` is a list of
            4-tuples whose first three items are the transition probability,
            the next state and the reward. Defaults to the model of the
            global ``env``.

    Returns:
        None; the result is left in ``state_values``.
    """
    if transitions is None:
        transitions = env.unwrapped.P

    delta = float("inf")

    while delta > theta:
        delta = 0.0

        for state in range(len(state_values)):
            old_value = state_values[state]
            new_value = 0.0

            for action, action_prob in enumerate(policy_probs[state]):
                # Take the expectation over every listed outcome of the action,
                # weighted by its probability, so stochastic models are
                # evaluated correctly (not just the first outcome).
                for trans_prob, next_state, reward, _ in transitions[state][action]:
                    new_value += action_prob * trans_prob * (
                        reward + gamma * state_values[next_state]
                    )

            # In-place update: later states in this sweep see the new value.
            state_values[state] = new_value

            delta = max(delta, abs(old_value - new_value))


In [12]:
def policy_improvement(policy_probs, state_values, gamma=0.99, transitions=None):
    """Make ``policy_probs`` greedy with respect to ``state_values`` (in place).

    For every state, computes the expected one-step return of each action and
    rewrites that state's row as a one-hot distribution on the best action.
    Ties are broken in favour of the lowest action index.

    Args:
        policy_probs: 2-D NumPy array indexed ``[state, action]``; its shape
            determines the number of states and actions. Modified in place.
        state_values: per-state value estimates used to score the actions.
        gamma: discount factor applied to the next state's value.
        transitions: optional transition model laid out like
            ``env.unwrapped.P``: ``transitions[state][action]`` is a list of
            4-tuples whose first three items are the transition probability,
            the next state and the reward. Defaults to the model of the
            global ``env``.

    Returns:
        True if no state's greedy action changed, False otherwise.
    """
    if transitions is None:
        transitions = env.unwrapped.P

    n_states, n_actions = policy_probs.shape

    policy_stable = True
    for state in range(n_states):
        old_action = policy_probs[state].argmax()

        new_action = None
        max_qsa = float("-inf")

        for action in range(n_actions):
            # Expected return of the action over all of its listed outcomes.
            qsa = sum(
                trans_prob * (reward + gamma * state_values[next_state])
                for trans_prob, next_state, reward, _ in transitions[state][action]
            )
            if qsa > max_qsa:
                max_qsa = qsa
                new_action = action

        # Update the policy and check stability once per state, after all
        # actions have been compared. Doing this inside the action loop would
        # compare a partial maximum against the old action and report spurious
        # policy changes.
        action_probs = np.zeros(n_actions)
        action_probs[new_action] = 1.
        policy_probs[state] = action_probs

        if new_action != old_action:
            policy_stable = False

    return policy_stable

In [13]:
def policy_iteration(policy_probs, state_values, theta=1e-2, gamma=0.99, max_iterations=100):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Both ``policy_probs`` and ``state_values`` are updated in place by the
    evaluation and improvement steps.

    Args:
        policy_probs: table of action probabilities indexed ``[state][action]``.
        state_values: per-state value estimates.
        theta: convergence threshold forwarded to ``policy_evaluation``.
        gamma: discount factor forwarded to both steps so they agree.
        max_iterations: upper bound on evaluate/improve rounds, so the loop
            always terminates even if stability is never reported.

    Returns:
        None; results are left in ``policy_probs`` and ``state_values``.
    """
    for _ in range(max_iterations):
        policy_evaluation(policy_probs, state_values, theta, gamma)
        policy_stable = policy_improvement(policy_probs, state_values, gamma)

        # Stop as soon as an improvement pass leaves every greedy action unchanged.
        if policy_stable:
            break


In [14]:
# Run policy iteration with its default theta and gamma; policy_probs and
# state_values are updated in place.
policy_iteration(policy_probs, state_values)

In [15]:
# Display the state values left behind by policy iteration.
state_values

array([0.95099005, 0.96059601, 0.970299  , 0.96059601, 0.96059601,
       0.        , 0.9801    , 0.        , 0.970299  , 0.9801    ,
       0.99      , 0.        , 0.        , 0.99      , 1.        ,
       0.        ])

In [16]:
# Display the policy table after policy iteration; each row is one-hot on the
# action chosen for that state.
policy_probs

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [24]:
# Roll out one episode following the greedy action of the learned policy,
# printing each action name and the reward it produced.
state, _ = env.reset()
done = False

while not done:
  # int() turns the NumPy integer from argmax into a plain Python int.
  action = int(policy_probs[state].argmax())
  state, reward, terminated, truncated, _ = env.step(action)
  # End the episode on truncation as well as termination, so a policy that
  # never reaches a terminal state cannot loop forever.
  done = terminated or truncated
  print(actions[action], reward)


Down 0.0
Down 0.0
Right 0.0
Down 0.0
Right 0.0
Right 1.0
