<a href="https://colab.research.google.com/github/4nands/Perceptron/blob/main/404_DP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import numpy as np

class GridWorld:
  """Small deterministic grid world stored as a 2-D object array.

  ``self.map`` holds one value per cell: ``1`` and ``-1`` are the two special
  cells, ``0`` is an ordinary cell and ``None`` marks a wall. ``reward``
  returns the cell value unchanged. States are ``[row, col]`` pairs and
  actions are ``[d_row, d_col]`` offsets; both may be lists, tuples or
  NumPy arrays.
  """

  # Coordinates of the cell that isGoal() treats as the goal.
  GOAL = (0, 3)

  def __init__(self):
    # dtype=object because the wall is encoded as None next to integers.
    self.map = np.array([
        [0, 0, 0, 1],
        [0, 0, 0, -1],
        [0, None, 0, 0],
        [0, 0, 0, 0],
    ], dtype=object)
    self.h, self.w = self.map.shape
    # Every [row, col] pair in row-major order (wall cell included).
    self.states = np.array(
        [[i, j] for i in range(self.h) for j in range(self.w)])
    # Offsets for up, down, left, right; self.direct[k] is the arrow that
    # corresponds to self.action[k].
    self.action = np.array([[-1, 0], [1, 0], [0, -1], [0, 1]])
    self.direct = ['↑', '↓', '←', '→']

  def reward(self, s):
    """Return the value stored in the map at state ``s`` (``None`` for a wall)."""
    # tuple(s) indexes a single cell and, unlike ``self.map[*s]``, does not
    # require the Python 3.11 subscript-unpacking syntax.
    return self.map[tuple(s)]

  def isGoal(self, s):
    """Return True when ``s`` is the goal cell ``GOAL``."""
    # array_equal accepts lists and arrays alike; ``all(s == [0, 3])`` raised
    # TypeError for a plain list because the comparison collapsed to a bool.
    return bool(np.array_equal(s, self.GOAL))

  def isWall(self, s):
    """Return True when the cell at ``s`` is a wall (stored as ``None``)."""
    return self.reward(s) is None

  def move(self, s, a):
    """Return the state reached from ``s`` by action ``a``.

    The move is rejected, and ``s`` itself is returned, when the target
    lies outside the grid or is a wall. Otherwise a new array ``s + a`` is
    returned.
    """
    # asarray makes the addition element-wise even if both are plain lists.
    target = np.asarray(s) + np.asarray(a)
    i, j = target

    # Bounds must be checked before isWall(): a negative index would
    # otherwise wrap around to the opposite edge of the array.
    if not (0 <= i < self.h and 0 <= j < self.w):
      return s
    if self.isWall(target):
      return s
    return target

  def get_map(self):
    """Return an object array shaped like the map with one symbol per cell.

    ``1`` -> '〇', ``-1`` -> '×', wall (``None``) -> '■', anything else -> '□'.
    """
    symbols = {1: '〇', -1: '×', None: '■'}
    m = np.empty_like(self.map)

    for (i, j), n in np.ndenumerate(self.map):
      m[i, j] = symbols.get(n, '□')

    return m

def disp_map(map):
  """Print a 2-D grid of strings: one output line per row, cells joined with no separator."""
  for row in map:
    print(''.join(row))

class DPAgent:
  """Plans on a grid environment with value iteration and a greedy policy.

  The environment passed in must provide ``map`` (only its ``shape`` is
  used), ``states``, ``action``, ``move(s, a)``, ``reward(s)`` and
  ``isGoal(s)``. All lookups go through ``self.env``; no module-level
  ``env`` variable is required.

  Attributes:
    V: float array of state values, same shape as ``env.map``.
    pi: int array holding, per state, an index into ``env.action``.
    gamma: discount factor applied to the value of the next state.
  """

  def __init__(self, env, gamma=0.9):
    self.env = env
    self.gamma = gamma
    # Explicit numeric dtypes: zeros_like(env.map) would copy the map's
    # dtype, which is `object` when the map contains None.
    shape = self.env.map.shape
    self.V = np.zeros(shape, dtype=float)
    self.pi = np.zeros(shape, dtype=int)

  def Q(self, s, a):
    """Return the reward of the state reached by ``a`` from ``s`` plus its discounted value."""
    s_ = self.env.move(s, a)

    return self.env.reward(s_) + self.gamma * self.V[tuple(s_)]

  def _action_values(self, s):
    """Return Q(s, a) for every action, in the order of ``env.action``."""
    return [self.Q(s, a) for a in self.env.action]

  def value_iter(self, theta=1e-4):
    """Sweep all states until the largest change in ``V`` is at most ``theta``.

    ``V`` is updated in place, so later states in a sweep already see the
    values written earlier in that sweep. The goal state is pinned to 0.
    """
    delta = float('inf')

    while delta > theta:
      previous = self.V.copy()

      for s in self.env.states:
        if self.env.isGoal(s):
          self.V[tuple(s)] = 0
          continue
        self.V[tuple(s)] = float(np.max(self._action_values(s)))

      delta = np.max(np.abs(self.V - previous))

  def greedy_policy(self):
    """Store in ``pi`` the index of the highest-valued action for each state.

    Ties resolve to the lowest action index (``np.argmax`` behaviour).
    """
    for s in self.env.states:
      self.pi[tuple(s)] = int(np.argmax(self._action_values(s)))

  def policy(self, s):
    """Return the stored action index for state ``s`` as a plain ``int``."""
    return int(self.pi[tuple(s)])

# Build the environment and plan on it: value iteration first, then the
# greedy policy derived from the resulting values.
env = GridWorld()
agent = DPAgent(env)
agent.value_iter()
agent.greedy_policy()

# Symbol grid on which the route is drawn. Deliberately not called `map`,
# so the builtin of that name stays usable.
route_map = env.get_map()

ini = [3, 0]
s = np.array(ini)

# The policy is deterministic, so a route that has not reached the goal after
# as many moves as there are cells must be revisiting a state and would never
# terminate; stop with an error instead of looping forever.
max_steps = env.h * env.w
steps = 0

while not env.isGoal(s):
  if steps >= max_steps:
    raise RuntimeError(
        f'goal not reached from {ini} within {max_steps} steps')
  p = agent.policy(s)
  d = env.direct[p]
  # Mark the current cell with the arrow of the action taken from it.
  route_map[tuple(s)] = d
  a = env.action[p]
  s = env.move(s, a)
  steps += 1

disp_map(route_map)




→→→〇
↑□□×
↑■□□
↑□□□
