In [15]:
import numpy as np
import gym
import time
import random

In [16]:
# Placeholder dimensions for the generic Q-table illustration in the next cell.
state_size, action_size = 0, 0

# Exploration probability used for the illustration; the training cell further
# down assigns its own value to `epsilon`.
epsilon = 0.2

In [17]:
# Generic Q-table: one row per state, one column per action, all zeros.
Q = np.zeros(shape=(state_size, action_size))

Here are the 3 basic steps:
1. Agent starts in a state (<b>s1</b>), takes an action (<b>a1</b>), and receives a reward (<b>r1</b>).
2. Agent selects its next action either by looking up the highest value (<b>max</b>) in the <b>Q-table</b> for the current state OR at random (with probability <b>epsilon</b>, <b>ε</b>)
3. Update <b>q-values</b>

<b>Learning Rate</b>: <b>lr or learning rate</b>, often referred to as alpha or α, can simply be defined as how much you accept the new value vs the old value. The update rule is $Q(s,a) \leftarrow Q(s,a) + \alpha\,\bigl[r + \gamma \max_{a'} Q(s',a') - Q(s,a)\bigr]$: we take the difference between the new and old estimates and multiply it by the learning rate. This value then gets added to our previous q-value, which moves it in the direction of our latest update.

<b>Gamma</b>: <b>gamma</b> or <b>γ</b> is a discount factor. It’s used to balance immediate and future reward. In the Q-learning update, the discount is applied to the future reward, i.e. to the best q-value of the next state. Typically this value can range anywhere from 0.8 to 0.99.

<b>Reward</b>: <b>reward</b> is the value received after completing a certain action at a given state. A reward can happen at any given time step or only at the terminal time step.

<b>Max</b>: <b>np.max()</b> comes from the numpy library. It takes the maximum q-value available in the next state (the best estimated future reward), which is then discounted and added to the reward for the current step. This lets the possible future reward influence the current action. This is the beauty of q-learning: we’re allocating future reward to current actions to help the agent select the highest-return action at any given state.

In [18]:
import gym

In [19]:
# Create the Taxi-v2 environment and keep the object exposed by its `.env` attribute.
# NOTE(review): `.env` presumably returns the environment underneath gym's
# wrapper (which would drop any wrapper-imposed step limit) — confirm.
env = gym.make("Taxi-v2").env

In [21]:
# Reset the environment, then draw its current state as a text grid.
env.reset()
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |B: |
+---------+



In [22]:
# Report the environment's action space and observation (state) space.
for template, space in (
    ("Action Space {}", env.action_space),
    ("State Space {}", env.observation_space),
):
    print(template.format(space))

Action Space Discrete(6)
State Space Discrete(500)


In [23]:
# Encode four components into a single integer state index and print it.
# NOTE(review): the arguments are presumably (taxi row, taxi column, passenger
# location, destination) — confirm against the Taxi environment's `encode`.
state = env.encode(3, 2, 2, 0)
print("State:", state)

# Assign the encoded index to the environment's `s` attribute, then draw the
# environment so the chosen state is visible.
env.s = state
env.render()

State: 348
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+



In [27]:
# Q-table for the Taxi environment: rows are observations, columns are actions,
# and every entry starts at zero.
q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n)
)

In [28]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
lr = 0.01      # learning rate (alpha): weight given to the new estimate
gamma = 0.9    # discount factor applied to the best next-state value
epsilon = 0.4  # probability of picking a random (exploratory) action

num_episodes = 100000  # number of training episodes
progress_every = 100   # refresh the progress line once per this many episodes

# For plotting metrics: one entry is appended per finished episode.
all_epochs = []
all_penalties = []

for i in range(1, num_episodes + 1):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:

        # Epsilon-greedy choice: with probability `epsilon` take a random
        # action, otherwise take the action with the highest learned value.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        # next_state: the state reached after taking `action`.
        # reward: negative or positive score for the action taken.
        # done: boolean flag telling whether the trip has finished.
        next_state, reward, done, info = env.step(action)

        # Q-value currently stored for the (state, action) pair just used.
        old_value = q_table[state, action]

        # Highest Q-value available from the next state.
        next_max = np.max(q_table[next_state])

        # Q-learning update: blend the old value with the new estimate.
        new_value = (1 - lr) * old_value + lr * (reward + gamma * next_max)

        # Write the updated value back into the Q-table.
        q_table[state, action] = new_value

        # A reward of -10 is what this notebook counts as a penalty.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode's metrics so they can be plotted afterwards.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Report progress once per episode (not once per step) to avoid
    # clearing and re-printing the cell output on every environment step.
    if i % progress_every == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 1min 13s, sys: 9.77 s, total: 1min 23s
Wall time: 1min 13s


In [None]:
%%time
# Evaluate the learned greedy policy (no exploration) over several episodes.
total_epochs, total_penalties = 0, 0
episodes = 1000

render_frames = True  # set to False to evaluate without drawing every step
frame_delay = 0.1     # seconds to pause before each step while rendering

# Nothing in this loop bounds the episode length, so a greedy policy that
# cycles between states would never reach `done`. Cap each episode and count
# how many were cut short instead of hanging the kernel.
max_steps_per_episode = 1000
truncated_episodes = 0

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done and epochs < max_steps_per_episode:
        if render_frames:
            time.sleep(frame_delay)

        # Always exploit: pick the action with the highest learned value.
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 is what this notebook counts as a penalty.
        if reward == -10:
            penalties += 1

        if render_frames:
            clear_output(wait=True)
            env.render()

        epochs += 1

    if not done:
        truncated_episodes += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
if truncated_episodes:
    print(f"Episodes stopped at the {max_steps_per_episode}-step cap: {truncated_episodes}")