In [1]:
import numpy as np
import random

In [11]:
# from typing import MutableMapping
class Env:
  def __init__(self, action_size=4, map_size=4, max_steps=99):
    self.action_size = action_size
    self.map_size = 4
    self.state_size=map_size^2   # state size 는 맵 크기의 제곱     
    self.max_steps=max_steps
    self.map=np.array(
            [['s','f','f','f'],
             ['f','h','f','h'],
             ['f','f','f','h'],
             ['h','f','f','g']]
    )
    self.goal=(3,3)    # goal지점 좌표
    self.stepNum=0
    self.reset()
    self.lastAction='None'        
    

  # Qtable 반환하기  
  def getQtable(self):
    return(np.zeros((self.state_size, self.action_size)))

  def reset(self):
    self.x=0
    self.y=0
    self.stepNum=0
    self.gameSet=False
    return self.getState()

  def getState(self):
    return self.x+self.y*self.action_size

  # status 출력
  def render(self):
    aMap=self.map.copy()
    s=aMap[self.y][self.x]   # s는 status의 좌표 (0,0)을 의미

    aMap[self.y][self.x]=s.upper()   # 위치를 대문자로 표시

    print('# Step = ',self.stepNum,', Last Action = ',self.lastAction)   # 움직인 횟수와 움직인 방향(액션) 출력
    for r in range(int(self.map_size)):
      str = ''
      for c in range(self.map_size):
        str+=aMap[r][c] + " "
      print(str)
    print('----')

  # 이동
  def step(self, action):
    if self.gameSet:    
      pass
    
    else:
      self.stepNum+=1    # 이동 횟수에 1 더하기
      
      if action == 0:
        if self.x > 0:
          self.x-=1    # 왼쪽으로 이동하는 거니까 x좌표를 -1 하는 것
          self.lastAction='(Left)'
      
      elif action == 1:
        if self.y + 1 < self.map_size:
          self.y+=1
          self.lastAction='(Down)'

      elif action == 2:
        if self.x+1 < self.map_size:
          self.x+=1    
          self.lastAction='(Right)'
      
      elif action == 3:
        if self.y > 0:
          self.y-=1
          self.lastAction='(Up)'

      else :
        print('---error---')

      new_state=self.getState()
      reward = 1 if self.goal == (self.x, self.y) else 0    # 골 위치에 있다면 리워드 1을 반환 아니면 0
      terminated = 1 if self.map[self.y][self.x] == 'h' or self.gameSet else 0 # h(홀)위치에 있다면 terminated = 1을 반환
      truncated = 1 if self.map[self.y][self.x] == 'g' or self.stepNum > self.max_steps or self.gameSet else 0
      self.gameSet = 1 if terminated or truncated else 0

      info = ''
      self.new_state = new_state
      self.reward = reward
      self.terminated=terminated
      self.truncated=truncated
      self.info=info
    return(self.new_state, self.reward, self.terminated, self.truncated, self.info)

  def action_space_sample(self):
    return int(random.uniform(0, self.action_size))         

In [12]:
# test
env = Env()

env.render()
print(env.step(0))
print(env.render())
print(env.step(1))
print(env.render())
print(env.step(2))
print(env.render())
print(env.step(2))
print(env.render())
print(env.step(2))

# Step =  0 , Last Action =  None
S f f f 
f h f h 
f f f h 
h f f g 
----
(0, 0, 0, 0, '')
# Step =  1 , Last Action =  None
S f f f 
f h f h 
f f f h 
h f f g 
----
None
(4, 0, 0, 0, '')
# Step =  2 , Last Action =  (Down)
s f f f 
F h f h 
f f f h 
h f f g 
----
None
(5, 0, 1, 0, '')
# Step =  3 , Last Action =  (Right)
s f f f 
f H f h 
f f f h 
h f f g 
----
None
(5, 0, 1, 0, '')
# Step =  3 , Last Action =  (Right)
s f f f 
f H f h 
f f f h 
h f f g 
----
None
(5, 0, 1, 0, '')


# Step 1: Create the environment

In [13]:
env = Env()

# Step 2: Create the Q-table and initialize it

In [14]:
action_size = 4
state_size = 16
env=Env(action_size, state_size)
qtable = env.getQtable()     #Qtable 만들기  

# Step 3: Create the hyperparameters

In [15]:
total_episodes = 20000
learning_rate = 0.7
max_steps = 99
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

# Step 4: The Q learning algorithm

In [16]:
# the list of rewards
rewards = []

## 2 For life or until learning is stopped

In [8]:
# 학습 진행
for episode in range(total_episodes):
  if episode%1000 == 0:
    print('episode=',episode)
  # Reset the environment
  state = env.reset()

  step = 0
  done = False
  total_rewards = 0

  for step in range(max_steps):
    
    # 우선 숫자를 랜덤하게 정한다
    exp_exp_tradeoff = random.uniform(0,1)

    # 만약 이 랜덤한 수가 epsilon보다 클 때 현 상태에서 가장 큰 q값을 찾아 action을 정한다
    if exp_exp_tradeoff > epsilon:
      action = np.argmax(qtable[state,:])

    # 그 외의 경우에는 랜덤하게 action을 선택한다
    else:
      action = env.action_space_sample()

    # action을 취하고 결과와 보상을 관찰한다
    new_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])

    total_rewards += reward

    # 현 상태는 새로운 상태
    state = new_state

    # 죽으면 episode 끝내기
    if done == True:
      break

  epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
  rewards.append(total_rewards)   # 총 보상을 rewards list에 추가하기

print("Score over time: " + str(sum(rewards)/total_episodes))


episode= 0
episode= 1000
episode= 2000
episode= 3000
episode= 4000
episode= 5000
episode= 6000
episode= 7000
episode= 8000
episode= 9000
episode= 10000
episode= 11000
episode= 12000
episode= 13000
episode= 14000
episode= 15000
episode= 16000
episode= 17000
episode= 18000
episode= 19000
Score over time: 0.0


## Qtable Show

In [9]:
# 호출하는 번호에 따라 이동 방향 출력
def getAction(actionNum):
  actionMap = ['<','V','>','^']
  return actionMap[actionNum]

def getActionByProb(actions,isPrintByNum=False):
  actionNum = np.argmax(actions)     # qtable에서 가장 높은 값의 인덱스 받아옴
  if isPrintByNum : 
    return actionNum
  else :
    return getAction(actionNum)

for i in range(0,16,4):
  print(getActionByProb(qtable[i]),getActionByProb(qtable[i+1]),getActionByProb(qtable[i+2]),getActionByProb(qtable[i+3]))

print(qtable)

< < < <
< < < <
< < < <
< < < <
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# Step 5: Use our Q-table to play FrozenLake!

In [10]:
env.reset()

for episode in range(1):
  state = env.reset()
  step = 0
  done = False
  print("***************************************************************")
  print("EPISODE", episode)

for step in range(max_steps):
  # 가장 reward가 가장 큰 값을 가진 인덱스의 action 실행
  state=state
  action = np.argmax(qtable[state,:])

  new_state, reward, terminated, truncated, info = env.step(action)
  done = terminated or truncated

  print('Step',step,'---------------------')

  print(env.render())
  if done:
    print(env.render())
    if new_state == 15:
      print("We reached our Goal 🏆")

    else:
      print("We fell into a hole ☠️")

    print("Number of steps", step)

    break
  state = new_state

***************************************************************
EPISODE 0
Step 0 ---------------------
# Step =  1 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 1 ---------------------
# Step =  2 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 2 ---------------------
# Step =  3 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 3 ---------------------
# Step =  4 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 4 ---------------------
# Step =  5 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 5 ---------------------
# Step =  6 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 6 ---------------------
# Step =  7 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 7 ---------------------
# Step =  8 , Last Action =  (Left)
S f f f 
f h f h 
f f f h 
h f f g 
----
None
Step 8 ---------------------
# Step = 