In [1]:
import gym
import numpy as np
np.set_printoptions(threshold=np.inf)
import matplotlib.pylab as plt
import random

In [2]:
env = gym.make("Taxi-v2")

In [3]:
env.action_space.n

6

In [4]:
env.action_space.sample()

4

In [5]:
env.observation_space

Discrete(500)

In [6]:
env.reset()
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m:[43m [0m|
+---------+



In [7]:
print(env.step(5))
env.render('human')

(493, -10, False, {'prob': 1.0})
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m:[43m [0m|
+---------+
  (Dropoff)


In [8]:
env.step(4)

(493, -10, False, {'prob': 1.0})

In [9]:
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m:[43m [0m|
+---------+
  (Pickup)


In [10]:
class QLearner:

    def __init__(self, env):
        self.env = env
        self.Q = -np.ones((env.observation_space.n, env.action_space.n))
        self.possible_actions = list(range(env.action_space.n))
        self.epsilon = 0.1
        self.alpha = 0.2
        self.gamma = 0.8
        
    def calcRandomAction(self):
        return env.action_space.sample()
    
    def calcBestAction(self, state):
        return np.argmax(self.Q[state])
    
    def calcMaxQ(self, state):
        return np.max(self.Q[state])    
        
    def calcNextAction(self, state):
        if np.max(self.Q[state]) == np.min(self.Q[state]):
            return self.calcRandomAction()
        #if np.max(self.Q[state]) < -0.5:
        #    return self.calcRandomAction()   
        if random.random() < self.epsilon:
            return self.calcRandomAction()
        else:
            return self.calcBestAction(state)
        
    def update(self, state, action, nextState, reward):
        #if reward > 0:
        #    print("update", state, action, nextState, reward)
        oldQ = self.Q[state, action]
        maxQ = self.calcMaxQ(state)
        updatedQ = oldQ + self.alpha * (reward + self.gamma * maxQ - oldQ)
        #if updatedQ > 0:
        #    print("Hallilujah")
        self.Q[state, action] = updatedQ
        
    def cutRepetitives(self, states, actions, rewards):
        lastState = states[-1]
        history = list(zip(states[:-1], actions, rewards))
        lastOcc = {}
        for i in range(len(history)):
            lastOcc[history[i][0]] = i
        i = 0
        nhistory = []
        while i < len(history):
            if i < lastOcc[history[i][0]]:
                i = lastOcc[history[i][0]]
            nhistory.append(history[i])
            i += 1
        states, actions, rewards = map(list, zip(*nhistory))
        states.append(lastState)
        return states, actions, rewards
        
        
        
    def play(self, maxSteps=None):
        env = self.env
        state = env.reset()
        states = [state]
        actions = []
        rewards = []
        #for i in range(maxSteps):
        while(True):
            action = self.calcNextAction(states[-1])
            actions.append(action)
            state, reward, isDone, info = env.step(action)
            rewards.append(reward)
            states.append(state)
            if isDone:
                self.Q[state] = np.ones_like(self.Q[state]) * reward
                break
                
        self.cutRepetitives(states, actions, rewards)
                
        if rewards[-1] > 0: 
            rew = rewards[-1]
            for i in reversed(range(0, len(states) - 1)):
                rew = rew * 0.9 + rewards[i] * 0.1
                self.update(states[i], actions[i], states[i + 1], rew)
        #else:
            #for i in reversed(range(0, len(states) - 1)):
            #    self.update(states[i], actions[i], states[i + 1], rewards[i] * 0.3)
        #if rewards[-1] > 0:
        #    print(rewards)
        return rewards[-1]

In [11]:
ql = QLearner(env)

In [12]:
rewsum = 0.0
rewc = 0
while True:
    #random.seed(0)
    rew = ql.play()
    rewsum += rew
    rewc += 1
    if random.random() < 0.001:
        ql.epsilon *= 0.98
        print((ql.Q > 0).sum(), rew, rewsum / rewc)

148 -10 -1.8502673796791445
186 -10 -1.4886293792255685
190 -10 -1.3455497382198953
253 20 -0.8204633204633205
257 -10 -0.7774647887323943
296 -10 -0.2523006134969325
751 20 4.389147564469914
949 20 6.813619061046102
1005 20 7.534623879433545
954 20 7.710216483099127
874 20 7.955483422211917
898 20 8.438461538461539
994 -1 9.14783331793403
1014 20 9.228994459078935
1041 -1 9.447460595446586
1107 20 9.886483371748437
1173 -1 11.195682872705266
1146 -1 11.248416468725257
1364 20 11.872783021703842
1298 20 12.028196147110332
1337 20 12.218400542189089
1381 20 12.339982220246695
1389 20 12.429461942257218
1350 -1 12.586484909563465
1325 20 12.767477203647417
1332 -1 12.876448450034664
1377 20 13.325841172214183
1435 20 13.696872493985566
1442 20 13.703514834794335
1229 20 13.706381409074142
1173 20 13.67717772139664
1193 -10 13.70071827613727
1196 20 13.725763087142179
1300 20 13.81877962991283
1333 20 13.90840122580163
1397 20 14.026765146806099
1444 20 14.171764286985935
1472 20 14.24810

KeyboardInterrupt: 

In [13]:
n = 1000
success = 0
for i in range(1000):
    if ql.play() == 20:
        success += 1
print(success / n)

1.0


В чём соль решения и почему всё так хорошо вышло:
В процессе обучения снижается коэффициент рандома и отбрасываются неуспешные попытки. Поэтому мы можем заметить, что уже после 1000 прохождений был выбран оптимальный путь, по которому игра и "игралась", что приводило к постоянному увеличению процента успеха. Уже обученный алгоритм без проблем прошёл 1000 игр по одному и тому же пути. 