<a href="https://colab.research.google.com/github/omaremad02/Markov-Decision-Process/blob/main/Value_iteration_and_Policy_Iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook includes implementations of the following algorithms:


*   Value Iteration Algorithm
*   Policy Iteration Algorithm

The notebook also includes a test gridworld game where the two algorithms are implemented to extract the optimal policy for the agent and the optimal value function for each state.

Below is the commented implementation where each section is in a seperate notebook.


In [1]:
import numpy as np

In [10]:
class gridworld:

  def __init__(self, grid_size):
    self.grid_size = grid_size
    rewards = -(np.ones((grid_size, grid_size)))
    rewards[0,2] = 10
    self.rewards = rewards
    self.actions = ["U", "D", "L", "R"]
    self.action_prob = {'U': (0.8, 0.1, 0.1), 'D': (0.8, 0.1, 0.1),
               'L': (0.8, 0.1, 0.1), 'R': (0.8, 0.1, 0.1)}

  def next_state(self, state, action):
    x, y = state
    if action == 'U':
        return [(x-1, y), (x, y-1), (x, y+1)]
    elif action == 'D':
        return [(x+1, y), (x, y-1), (x, y+1)]
    elif action == 'L':
        return [(x, y-1), (x-1, y), (x+1, y)]
    elif action == 'R':
        return [(x, y+1), (x-1, y), (x+1, y)]
    return [state, state, state]

  def is_valid(self, state):
      x,y = state
      return 0 <= x < self.grid_size and 0 <= y < self.grid_size

In [16]:
class agent_algorithms:
  def __init__(self, grid: gridworld):
    self.grid = grid
    self.discount_factor = 0.99 # takes the future highly in consideration.


  def value_iteration(self):
    state_values = np.zeros((3,3)) #initializng the value function to zero.
    while True:
      delta = 0
      for row in range(0,3):
        for col in range(0,3):
          max_value = float("-inf")
          for action in self.grid.actions:
            value = 0
            for prob, new_state in zip(self.grid.action_prob[action], self.grid.next_state((row,col), action)):
              x1,y1 = new_state
              if self.grid.is_valid(new_state):
                value += prob * self.discount_factor * state_values[x1,y1]
              else:
                value += prob * self.discount_factor * state_values[row,col]
            if value > max_value:
              state_values[row,col] = value
              delta = max(delta, abs(value - state_values[row, col]))
      if delta < 1e-4:
            break
    return state_values

Test

In [18]:
reward_list = [100,3,0,-3]
grid = gridworld(grid_size= 3)
for i in range(4):
  grid.rewards[0,0] = reward_list[i]
  agent = agent_algorithms(grid)
  result = agent.value_iteration()
  print(result)
  print()

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

