# Aprendizagem por Reforço com Q-Learning

### Código adaptado de: https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

## Entendimento do ambiente

In [2]:
!pip install cmake 'gym[atari]' scipy

Collecting cmake
  Downloading cmake-3.23.3-py2.py3-none-macosx_10_10_universal2.macosx_10_10_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl (76.2 MB)
[K     |████████████████████████████████| 76.2 MB 102.3 MB/s eta 0:00:01
[?25hCollecting gym[atari]
  Downloading gym-0.25.1.tar.gz (732 kB)
[K     |████████████████████████████████| 732 kB 2.0 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.7-py3-none-any.whl (2.7 kB)
Collecting ale-py~=0.7.5
  Downloading ale_py-0.7.5-cp39-cp39-macosx_10_15_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.9 MB/s eta 0:00:01
[?25hCollecting importlib-resources
  Downloading importlib_resources-5.9.0-py3-none-any.whl (33 kB)
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517) ... [?25ldone
[?25h  Created wh

In [9]:
import gym
import random

In [10]:
# Create the Taxi-v3 environment and keep the object exposed by the `.env` attribute of what
# gym.make() returns.
env = gym.make('Taxi-v3').env
# NOTE(review): this only binds a notebook variable named `new_step_api`; it is not passed to
# gym.make(), so it has no effect on `env`. Later cells unpack four values from env.step().
new_step_api=True

In [12]:
# Start a new episode and draw the current state of the environment.
env.reset()
env.render()

In [13]:
# Action ids: 0 = south, 1 = north, 2 = east, 3 = west, 4 = pickup, 5 = dropoff
# Print the action space object of the environment.
print(env.action_space)

Discrete(6)


In [14]:
# Size of the state space worked out by hand (500). The last factor is the 4 destinations;
# presumably the three 5s are taxi row, taxi column and passenger location — TODO confirm
# against the Taxi-v3 documentation.
5*5*5*4

500

In [15]:
# 4 destinations
# Print the observation space object of the environment.
print(env.observation_space)

Discrete(500)


In [16]:
# Number of entries in env.P (500 here, the same as the number of states).
len(env.P)

500

In [17]:
# Entry of env.P for state 484: a dict keyed by action id whose values are lists of 4-tuples —
# presumably (probability, next_state, reward, done); TODO confirm against the Gym docs.
env.P[484]

{0: [(1.0, 484, -1, False)],
 1: [(1.0, 384, -1, False)],
 2: [(1.0, 484, -1, False)],
 3: [(1.0, 464, -1, False)],
 4: [(1.0, 484, -10, False)],
 5: [(1.0, 484, -10, False)]}

## Treinamento

In [19]:
# Draw a random float between 0 and 1; the training loop compares such a draw with epsilon
# to decide between exploring and exploiting.
random.uniform(0, 1)

0.8703849227769659

In [20]:
# Inspect the action space object again; its size gives the number of Q-table columns below.
env.action_space

Discrete(6)

In [21]:
import numpy as np

# Q-table: one row per state and one column per action, every entry starting at zero.
n_estados = env.observation_space.n
n_acoes = env.action_space.n
q_table = np.zeros((n_estados, n_acoes))
q_table.shape

(500, 6)

In [22]:
# Display the Q-table as it stands at this point of the notebook (all zeros before training).
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [23]:
# np.argmax returns the index of the largest value — here 1, because 5 > 3.
np.argmax(np.array([3, 5]))

1

In [25]:
# Q-learning training loop with an epsilon-greedy policy.
# With epsilon = 0.1 the agent explores (random action) on roughly 10% of the steps and
# exploits (best known action) on the remaining ones.
# Action ids: 0 = south, 1 = north, 2 = east, 3 = west, 4 = pickup, 5 = dropoff

#%%time

from IPython.display import clear_output

alpha = 0.1    # learning rate: weight given to the new estimate
gamma = 0.6    # discount factor applied to the best value of the next state
epsilon = 0.1  # probability of taking a random action

for i in range(100000):
  # Every episode starts from a fresh state and runs until the environment reports done.
  estado = env.reset()
  penalidades, recompensa = 0, 0
  done = False

  while not done:
    # The random draw happens first, and the environment is only sampled when exploring.
    explorar = random.uniform(0, 1) < epsilon
    acao = env.action_space.sample() if explorar else np.argmax(q_table[estado])

    proximo_estado, recompensa, done, info = env.step(acao)

    # Blend the old estimate with the observed reward plus the discounted best next value.
    q_antigo = q_table[estado, acao]
    proximo_maximo = np.max(q_table[proximo_estado])
    alvo = recompensa + gamma * proximo_maximo
    q_novo = (1 - alpha) * q_antigo + alpha * alvo
    q_table[estado, acao] = q_novo

    # A reward of -10 is counted as a penalty for this episode.
    if recompensa == -10:
      penalidades += 1

    estado = proximo_estado

  # Refresh the progress line once every 100 episodes.
  if not i % 100:
    clear_output(wait=True)
    print('Episódio: ', i)

print('Treinamento concluído')

Episódio:  99900
Treinamento concluído


In [26]:
# Action ids: 0 = south, 1 = north, 2 = east, 3 = west, 4 = pickup, 5 = dropoff
# Q-values stored for state 346, one value per action in the order listed above.
q_table[346]

array([ -2.49215326,  -2.48236806,  -2.48727486,  -2.4926773 ,
       -10.78175204, -11.11991141])

In [27]:
# Start a new episode and draw it, as a starting point for the manual steps that follow.
env.reset()
env.render()

In [28]:
# Apply action 1 (north, per the action legend used in this notebook) and draw the result.
env.step(1)
env.render()

In [29]:
# Apply action 1 (north) once more and draw the result.
env.step(1)
env.render()

In [30]:
# Convert four state components into the single integer state index (346 for these values).
# Presumably the arguments are (taxi_row, taxi_col, passenger_location, destination) —
# TODO confirm against the Taxi-v3 documentation.
env.encode(3, 2, 1, 2)

346

## Avaliação

In [31]:
# Evaluate the learned policy: always take the action with the highest Q-value (no exploration,
# no Q-table updates), counting penalties and recording every step for the replay further down.
total_penalidades = 0
episodios = 50
# Safety cap on the number of steps per episode. The loop below otherwise ends only when the
# environment reports done, and a greedy policy that cycles between states would never get
# there. NOTE(review): `env` was taken from `.env`, which presumably bypasses Gym's own step
# limit; 200 is believed to be the limit registered for Taxi-v3 — confirm against the docs.
max_passos = 200
episodios_truncados = 0
frames = []

for _ in range(episodios):
  estado = env.reset()
  penalidades, recompensa = 0, 0
  done = False
  passos = 0
  while not done and passos < max_passos:
    # Greedy action: index of the largest Q-value for the current state.
    acao = np.argmax(q_table[estado])
    estado, recompensa, done, info = env.step(acao)
    passos += 1

    # A reward of -10 is counted as a penalty.
    if recompensa == -10:
      penalidades += 1

    # Keep the text rendering together with the state reached, the action and the reward.
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': estado,
        'action': acao,
        'reward': recompensa
    })

  # The episode was cut by the step cap instead of finishing normally.
  if not done:
    episodios_truncados += 1

  total_penalidades += penalidades

print('Episódios', episodios)
print('Penalidades', total_penalidades)
# Only reported when the cap was actually hit, so a normal run prints the same two lines as before.
if episodios_truncados:
  print('Episódios interrompidos', episodios_truncados)

Episódios 50
Penalidades 0


See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


In [32]:
# Inspect the first recorded step: a dict with the keys 'frame', 'state', 'action' and 'reward'.
frames[0]

{'frame': '+---------+\n|\x1b[34;1mR\x1b[0m: | : :\x1b[35mG\x1b[0m|\n| : | : :\x1b[43m \x1b[0m|\n| : : : : |\n| | : | : |\n|Y| : |B: |\n+---------+\n  (South)\n',
 'state': 181,
 'action': 0,
 'reward': -1}

In [33]:
from time import sleep

# Replay the recorded evaluation steps as a text animation: each step replaces the previous
# output and stays on screen for half a second.
campos = (('Estado', 'state'), ('Ação', 'action'), ('Recompensa', 'reward'))
for quadro in frames:
  clear_output(wait=True)
  print(quadro['frame'])
  for rotulo, chave in campos:
    print(rotulo, quadro[chave])
  sleep(.5)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Estado 475
Ação 5
Recompensa 20
