https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

```
Буквы - возможные остановки
Фиолетовая буква - пассажир
Синяя буква - куда пассажира доставить
| - стена
: - проезд
```

In [3]:
import gym

env = gym.make("Taxi-v3").env

env.render()

+---------+
|R: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [4]:
env.reset() # reset environment to a new, random state
env.render()

### Возможные actions
# 0 = south
# 1 = north
# 2 = east
# 3 = west
# 4 = pickup
# 5 = dropoff
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35m[43mY[0m[0m| : |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [6]:
### Восстановление состояния
state = env.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [7]:
### Reward table for state 328
env.P[328]
# - 0: move south ↓
# - 1: move north ↑
# - 2: move east →
# - 3: move west ←
# - 4: pickup passenger
# - 5: dropoff passenger

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [34]:
### Solve without RL
env.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    if reward == -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 2386
Penalties incurred: 744


In [35]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, timeout):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(timeout)
        
print_frames(frames, .02)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 2386
State: 0
Action: 5
Reward: 20


### Q-learning
```
Q(state,action)←(1−α)Q(state,action)+α(reward+γmaxaQ(next state,all actions))

α (0≤γ≤1) - learning rate
γ (0≤γ≤1) - важность будущих наград. Чем больше значение, тем долгосрочнее решение
```
#### Q-table
```
state, actions[0], actions[1], ..., actions[5]
0|...
1|...
```

In [148]:
"""Training the agent"""

import random
from IPython.display import clear_output
import numpy as np

class QLearning():
    def __init__(self, alpha = 0.1, gamma = 0.6, epsilon = 0.1, n_train_epochs=100000):
        self._alpha = alpha
        self._gamma = gamma
        self._epsilon = epsilon
        self._n_train_epochs = n_train_epochs
    
    def fit(self, env):
        self.q_table = np.zeros([env.observation_space.n, env.action_space.n])
        
        # For plotting metrics
        all_epochs = []
        all_penalties = []

        for i in range(1, self._n_train_epochs + 1):
            state = env.reset()

            epochs, penalties, reward, = 0, 0, 0
            done = False

            while not done:
                if random.uniform(0, 1) < self._epsilon:
                    action = env.action_space.sample() # Explore action space
                else:
                    action = np.argmax(self.q_table[state]) # Exploit learned values

                next_state, reward, done, info = env.step(action) 

                old_value = self.q_table[state, action]
                next_max = np.max(self.q_table[next_state])

                new_value = (1 - self._alpha) * old_value + self._alpha * (reward + self._gamma * next_max)
                self.q_table[state, action] = new_value

                if reward == -10:
                    penalties += 1

                state = next_state
                epochs += 1

            all_epochs.append(epochs)
            all_penalties.append(penalties)
            if i % 100 == 0:
                clear_output(wait=True)
                print(f"Episode: {i}")
                print(f"Mean epochs: {np.mean(epochs)}")
                print(f"Mean penalties: {np.mean(penalties)}")

        self.all_epochs = all_epochs
        self.all_penalties = all_penalties
        print("Training finished.\n")
    
    def predict(self, state):
        return np.argmax(self.q_table[state])

In [149]:
alg = QLearning(alpha = 0.1, gamma = 0.6, epsilon = 0.1, n_train_epochs=100000)
alg.fit(env)

Episode: 100000
Mean epochs: 21.0
Mean penalties: 1.0
Training finished.



In [140]:
(alg.q_table == 0).sum() / (alg.q_table.shape[0] * alg.q_table.shape[1])

0.2

In [None]:
import matplotlib.pyplot as plt
import numpy as np

step = 1000
all_epochs = alg.all_epochs
all_penalties = alg.all_penalties
x = np.arange(0, len(all_epochs))
new_x = np.arange(0, len(all_epochs), step)
downsample_epochs = np.interp(new_x, x, all_epochs)
downsample_penalties = np.interp(new_x, x, all_penalties)

plt.figure(figsize=(9, 3))
plt.subplot(121)
plt.plot(new_x, downsample_epochs)
plt.ylabel('Mean epochs')
plt.xlabel('Epochs')

plt.subplot(122)
plt.plot(new_x, downsample_penalties)
plt.ylabel('Mean penalties')
plt.xlabel('Epochs')

plt.show()

In [122]:
alg.q_table[328]

array([ -2.38184385,  -2.27325184,  -2.3940772 ,  -2.35613418,
       -10.20135252, -11.1688543 ])

In [132]:
"""Solve with Q-learning"""
env.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = alg.predict(state)
    state, reward, done, info = env.step(action)

    if reward == -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 12
Penalties incurred: 0


In [142]:
print_frames(frames, .4)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 12
State: 0
Action: 5
Reward: 20
