<a href="https://colab.research.google.com/github/4nands/Perceptron/blob/main/405_MC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
import numpy as np
from collections import defaultdict

class GridWorld:
    """A 3x4 grid world with one wall cell, a +1 goal cell and a -1 cell.

    States are ``(row, col)`` tuples. Actions are the integers 0-3 and move
    the agent up, down, left and right respectively. A move that would leave
    the grid or enter the wall keeps the agent where it is.
    """

    def __init__(self):
        self.action_space = [0, 1, 2, 3]

        # Each cell stores the reward received on entering it; None marks the
        # wall. The None entry forces an object array, so say so explicitly.
        self.map = np.array(
            [[0, 0, 0, 1.0],
             [0, None, 0, -1.0],
             [0, 0, 0, 0]],
            dtype=object,
        )
        self.goal_state = (0, 3)
        self.wall_state = (1, 1)
        self.start_state = (2, 0)
        self.agent_state = self.start_state
        self.h = self.map.shape[0]
        self.w = self.map.shape[1]
        self.states = [(i, j) for i in range(self.h) for j in range(self.w)]

    def Goal(self, s):
        """Return True if state ``s`` is the goal state."""
        return s == self.goal_state

    def Wall(self, s):
        """Return True if state ``s`` is a wall cell (its map entry is None).

        ``s`` must lie inside the grid.
        """
        return self.reward(s) is None

    def Out(self, s):
        """Return True if state ``s`` lies outside the grid."""
        i, j = s
        return not (0 <= i < self.h and 0 <= j < self.w)

    def next_state(self, s, a):
        """Return the state reached by taking action ``a`` in state ``s``.

        The agent stays in ``s`` when the move would leave the grid or hit
        the wall.
        """
        moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right
        di, dj = moves[a]
        s_ = (s[0] + di, s[1] + dj)

        # Out() is tested first so Wall() never indexes the map out of range
        # (a negative index would otherwise silently wrap around).
        if self.Out(s_) or self.Wall(s_):
            return s
        return s_

    def reward(self, s):
        """Return the map entry for state ``s`` (None for the wall cell)."""
        return self.map[s]

    def reset(self):
        """Move the agent back to the start state and return that state."""
        self.agent_state = self.start_state
        return self.agent_state

    def step(self, a):
        """Apply action ``a`` to the agent.

        Returns:
            ``(next_state, reward, done)`` where ``done`` is True only when
            the goal state has been reached.
        """
        s_ = self.next_state(self.agent_state, a)
        r = self.reward(s_)
        done = self.Goal(s_)

        self.agent_state = s_
        return s_, r, done

    def get_map(self):
        """Return an object array of one-character symbols for the grid.

        '〇' marks the +1 cell, '×' the -1 cell, '■' the wall and '□' any
        other cell.
        """
        m = np.zeros_like(self.map)

        for i in range(m.shape[0]):
            for j in range(m.shape[1]):
                n = self.map[i, j]

                if n is None:
                    c = '■'
                elif n == 1:
                    c = '〇'
                elif n == -1:
                    c = '×'
                else:
                    c = '□'

                m[i, j] = c

        return m

def greedy_probs(Q, s, eps, size=4):
    """Build an epsilon-greedy action distribution for state ``s``.

    Every action in ``range(size)`` gets probability ``eps / size``; the
    action with the largest ``Q[(s, a)]`` (the first one on ties) gets an
    additional ``1 - eps``.

    Args:
        Q: Mapping from ``(state, action)`` to an action value.
        s: State whose action values are compared.
        eps: Exploration rate.
        size: Number of actions.

    Returns:
        Dict mapping each action to its probability.
    """
    action_values = [Q[(s, action)] for action in range(size)]
    best = np.argmax(action_values)

    base = eps / size
    return {
        action: base + (1 - eps) if action == best else base
        for action in range(size)
    }


class McAgent:
    """Monte Carlo control agent with an epsilon-greedy policy.

    The agent records ``(state, action, reward)`` steps during an episode and,
    when ``update`` is called, walks them backwards to accumulate discounted
    returns and move each ``Q[(state, action)]`` toward its return.

    Args:
        gamma: Discount factor applied to the return.
        alpha: Step size used when moving Q toward the return.
        epsilon: Exploration rate of the epsilon-greedy policy.
        action_size: Number of actions (actions are ``0 .. action_size - 1``).
    """

    def __init__(self, gamma=0.9, alpha=0.1, epsilon=0.1, action_size=4):
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.action_size = action_size

        uniform = {a: 1.0 / action_size for a in range(action_size)}
        # Give every unseen state its own copy so that mutating one state's
        # distribution can never leak into the others.
        self.pi = defaultdict(lambda: dict(uniform))
        self.Q = defaultdict(lambda: 0)
        self.memory = []

    def get_action(self, state):
        """Sample an action for ``state`` from the current policy."""
        pdf = self.pi[state]
        actions = list(pdf.keys())
        probs = list(pdf.values())
        return np.random.choice(actions, p=probs)

    def add(self, s, a, r):
        """Record one ``(state, action, reward)`` step of the episode."""
        self.memory.append((s, a, r))

    def reset(self):
        """Forget the recorded steps (call at the start of an episode)."""
        self.memory.clear()

    def update(self):
        """Update Q and the policy from the recorded episode.

        Steps are processed last-to-first so the discounted return ``G`` can
        be accumulated incrementally.
        """
        G = 0
        for s, a, r in reversed(self.memory):
            G = self.gamma * G + r
            k = (s, a)
            self.Q[k] += (G - self.Q[k]) * self.alpha
            self.pi[s] = greedy_probs(self.Q, s, self.epsilon, self.action_size)

def disp_map(map):
    """Print a grid of strings, one concatenated line per row."""
    for row in map:
        print(''.join(row))

# Train the Monte Carlo agent: play 100 episodes, recording every step and
# updating Q and the policy once each episode reaches the goal.
env = GridWorld()
agent = McAgent()

for _ in range(100):

    s = env.reset()
    agent.reset()

    done = False
    while not done:
        a = agent.get_action(s)
        s_, r, done = env.step(a)

        # Store the state the action was taken from, not the one it led to.
        agent.add(s, a, r)
        if not done:
            s = s_

    # The episode is complete; learn from the recorded steps.
    agent.update()
# Follow the greedy policy (highest Q at each state) from the start state and
# draw the chosen direction onto the rendered map.
s = env.start_state
m = env.get_map()

size = 4
direct = {0:'↑',1:'↓',2:'←',3:'→'}
visited = set()
while not env.Goal(s):
    if s in visited:
        # The greedy choice and next_state are both deterministic, so coming
        # back to a state (including bumping into a wall and staying put)
        # means this path can never reach the goal. Stop instead of hanging.
        print('greedy path revisits', s, '- stopping before the goal')
        break
    visited.add(s)

    qs = [agent.Q[(s, a)] for a in range(size)]
    a = np.argmax(qs)
    d = direct[a]
    i,j = s
    m[i,j] = d
    s = env.next_state(s,a)

disp_map(m)

□□→〇
□■↑×
→→↑□
