# Q-Learning
https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

In [16]:
import gym
import time
from IPython.display import clear_output

env = gym.make("Taxi-v3",render_mode='ansi')    

In [17]:
env.reset()
print(env.render())

+---------+
|[35mR[0m: | : :[34;1mG[0m|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+




In [18]:
env.reset() # reset environment to a new, random state
print(env.render())

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))


+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+


Action Space Discrete(6)
State Space Discrete(500)


In [19]:
env.P[123]   # Reward Table

{0: [(1.0, 223, -1, False)],
 1: [(1.0, 23, -1, False)],
 2: [(1.0, 123, -1, False)],
 3: [(1.0, 103, -1, False)],
 4: [(1.0, 123, -10, False)],
 5: [(1.0, 123, -10, False)]}

# Solution without RL Algorithm
Taking random actions from each state

## Action Space

0. south
1. north
2. east
3. west
4. pickup
5. dropoff

In [20]:
action = env.action_space.sample()
print(action)

1


In [21]:
state, reward, done, info, actionMask = env.step(action)
env.step(action)

(92,
 -1,
 False,
 False,
 {'prob': 1.0, 'action_mask': array([1, 0, 0, 1, 0, 0], dtype=int8)})

In [22]:
epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info, actionMask = env.step(action)

    if reward == -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(),
        'episode': '0',
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 1273
Penalties incurred: 403


## Printing frames

In [23]:
def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Episode: {frame['episode']}")
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        time.sleep(1)

In [24]:
print_frames(frames[1:5])

+---------+
|[35mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)

Episode: 0
Timestep: 4
State: 72
Action: 3
Reward: -1


# Reinforcement Learning using Q-Learning

In [25]:
import numpy as np
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [26]:
# Get Initial State
state = env.reset()
print(state)

# Get Exploration or Exploitation Action
action = env.action_space.sample()
print(action)
# Perform Step
next_state, reward, done, info, actionMask = env.step(action) 
print(env.step(action))



(231, {'prob': 1.0, 'action_mask': array([1, 1, 1, 1, 0, 0], dtype=int8)})
0
(431, -1, False, False, {'prob': 1.0, 'action_mask': array([0, 1, 1, 0, 0, 0], dtype=int8)})


In [27]:
q_table[123]

array([0., 0., 0., 0., 0., 0.])

In [63]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1 # 10% exploration

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()
    state = state[0]

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info, actionMask = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 39.3 s, sys: 9.07 s, total: 48.4 s
Wall time: 38.2 s


## Reward Table
{action: [(probability, nextstate, reward, done)]}

## Q Table
Rows = Number of States
Cols = Number of Actions

q_table[328] = array([ -2.4116061 ,  -2.27325184,  -2.40676554,  -2.35447935,
       -11.09694326, -10.59867765])

In [65]:
q_table[328]

array([ -2.4116061 ,  -2.27325184,  -2.40676554,  -2.35447935,
       -11.09694326, -10.59867765])

# Evaluate agent's performance after Q-learning

In [69]:
total_epochs, total_penalties = 0, 0
episodes = 100
frames = []

for ep in range(episodes):
    state = env.reset()
    state = state[0]
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info, attentionMask = env.step(action)

        if reward == -10:
            penalties += 1
        
        # Put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(),
            'episode': ep, 
            'state': state,
            'action': action,
            'reward': reward
            }
        )
        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 12.84
Average penalties per episode: 0.0


# Visualization

In [72]:
print_frames(frames[:10])

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (South)

Episode: 0
Timestep: 10
State: 418
Action: 0
Reward: -1
