<a href="https://colab.research.google.com/github/4nands/Perceptron/blob/main/QLearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
from collections import defaultdict,deque

class GridWorld:
  """A 3x4 grid world with one blocked cell, a +1 goal cell and a -1 cell.

  States are ``(row, col)`` tuples. Actions are the integers 0..3, which
  shift the agent by (-1, 0), (1, 0), (0, -1) and (0, 1) respectively.
  A move that would leave the grid or enter the blocked cell leaves the
  agent where it is.
  """

  # (row, col) offset applied by each action id.
  _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

  def __init__(self):
    self.actions = list(range(4))
    # Each entry is the reward for entering that cell. ``None`` marks the
    # blocked cell; mixing it with ints makes NumPy store Python objects,
    # which is what lets ``isWall`` test the entry with ``is None``.
    self.map = np.array([
        [0, 0, 0, 1],
        [0, None, 0, -1],
        [0, 0, 0, 0],
    ])
    self.h, self.w = self.map.shape
    self.states = [(row, col) for row in range(self.h) for col in range(self.w)]
    self.goal = (0, 3)
    self.start = (2, 0)
    self.current = self.start

  def reward(self, s):
    """Return the map entry stored at state ``s``."""
    return self.map[s]

  def isOut(self, s):
    """Return True when ``s`` lies outside the grid."""
    row, col = s
    inside = 0 <= row < self.h and 0 <= col < self.w
    return not inside

  def isGoal(self, s):
    """Return True when ``s`` is the goal cell."""
    return s == self.goal

  def isWall(self, s):
    """Return True when the map entry at ``s`` is ``None`` (blocked cell)."""
    return self.reward(s) is None

  def next(self, s, a):
    """Return the state reached from ``s`` by action ``a``.

    The agent stays at ``s`` when the move would leave the grid or land
    on the blocked cell. This does not modify ``self.current``.
    """
    row, col = s
    d_row, d_col = self._MOVES[a]
    candidate = (row + d_row, col + d_col)

    # Bounds are checked first so the map is never indexed out of range.
    if self.isOut(candidate) or self.isWall(candidate):
      return s

    return candidate

  def reset(self):
    """Move the agent back to the start cell and return that state."""
    self.current = self.start
    return self.current

  def step(self, a):
    """Apply action ``a`` to the current state.

    Returns:
      A ``(next_state, reward, done)`` tuple, where ``reward`` is the map
      entry of the cell the agent ends up in and ``done`` is True only
      when that cell is the goal.
    """
    arrived = self.next(self.current, a)
    gained = self.reward(arrived)
    self.current = arrived
    return arrived, gained, self.isGoal(arrived)

def get_map(env):
  """Build a character grid describing ``env.map``.

  Reads ``env.map``, ``env.h`` and ``env.w``. The result is a new array
  created with ``np.zeros_like(env.map)`` in which every cell holds one
  character: ``'*'`` for -1, ``'1'`` for 1, ``' '`` for ``None`` and
  ``'0'`` for anything else.
  """

  def glyph(cell):
    # The order of the checks matches the value-to-character table above.
    if cell == -1:
      return '*'
    if cell == 1:
      return '1'
    if cell is None:
      return ' '
    return '0'

  grid = np.zeros_like(env.map)

  for row in range(env.h):
    for col in range(env.w):
      grid[row, col] = glyph(env.map[row, col])

  return grid

def disp_map(map):
  """Print ``map`` one row per line, with each row's cells concatenated.

  Every cell must be a string; no separator is inserted between cells.
  """
  for row in map:
    print(''.join(row))


class QLearn:
  """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

  Q-values are stored in ``self.Q``, keyed by ``(state, action)`` and
  defaulting to 0 for pairs that have never been updated.

  Args:
    epsilon: Probability of picking a uniformly random action in
      ``get_action``.
    gamma: Discount factor applied to the best next-state Q-value.
    alpha: Learning rate (step size) of the Q-value update.
    action_size: Number of discrete actions, numbered ``0..action_size-1``.
  """

  def __init__(self, epsilon=0.1, gamma=0.9, alpha=0.8, action_size=4):

    self.action_size = action_size
    self.epsilon = epsilon
    self.gamma = gamma
    self.alpha = alpha
    self.Q = defaultdict(lambda: 0)
    # Created here so reset() and memo() work on a fresh agent; previously
    # the attribute was never initialised and both raised AttributeError.
    self.memory = []

  def get_qs(self, s):
    """Return the Q-values of every action in state ``s`` as an array."""
    return np.array([self.Q[s, a] for a in range(self.action_size)])

  def get_action(self, s):
    """Return an epsilon-greedy action for state ``s``."""
    if np.random.rand() < self.epsilon:
      return np.random.choice(self.action_size)

    return self.action(s)

  def action(self, s):
    """Return the greedy action for ``s`` (lowest index wins on ties)."""
    return np.argmax(self.get_qs(s))

  def reset(self):
    """Discard all transitions recorded with ``memo``."""
    self.memory.clear()

  def memo(self, s, a, r):
    """Record a ``(state, action, reward)`` triple in ``self.memory``."""
    self.memory.append((s, a, r))

  def update(self, s, a, r, s_, done):
    """Apply one Q-learning update for the transition ``(s, a, r, s_)``.

    When ``done`` is true the next state contributes nothing to the
    target, so no value is bootstrapped from a terminal state.
    """
    if done:
      best_next = 0
    else:
      best_next = np.max(self.get_qs(s_))

    target = r + self.gamma * best_next

    # Move the current estimate a fraction ``alpha`` of the way to the target.
    self.Q[s, a] += (target - self.Q[s, a]) * self.alpha


# Build the environment and the learning agent.
env = GridWorld()
agent = QLearn()


# Training: 10000 episodes, each restarted from the environment's start cell.
for _ in range(10000):

  s = env.reset()

  while True:

    # Choose an action with the agent's exploring policy, apply it to the
    # environment, and update the Q-table from the observed transition.
    a = agent.get_action(s)
    s_,r,done = env.step(a)
    agent.update(s,a,r,s_,done)

    # env.step reports done only when the goal cell has been reached.
    if done:
      break

    s = s_

# NOTE(review): this module-level name shadows the builtin `map` for any
# code that runs after this point.
map=get_map(env)

# Replay the greedy policy from (2,0), the environment's start cell,
# marking every visited cell with '1'.
s = (2,0)

# Capped at 20 steps so a policy that never reaches the goal cannot loop forever.
for _ in range(20):

  map[s] = '1'
  a = agent.action(s)
  s_ = env.next(s,a)

  # The goal cell itself is not marked here; get_map already renders it as '1'.
  if env.isGoal(s_):
    break
  s = s_

disp_map(map)


1111
1 0*
1000
