In [36]:
import gym
import numpy as np

#create a single game instance
env = gym.make("Taxi-v2")
#start new game
start = env.reset()
# display the game state
env.render()
n_states = env.observation_space.n
n_actions = env.action_space.n

+---------+
|R: | : :[35mG[0m|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [44]:
print("Number of states =", n_states)
print("Number of actions =", n_actions)
print("initial observation code:", start)
print('printing observation:')
env.render()
print("observations:", env.observation_space, 'n =', env.observation_space.n)
print("actions:", env.action_space, 'n =', env.action_space.n)

Number of states = 500
Number of actions = 6
initial observation code: 133
printing observation:
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
observations: Discrete(500) n = 500
actions: Discrete(6) n = 6


In [39]:
print("taking action 2 (East)")
new_obs, reward, is_done, _ = env.step(2)
print("new observation code:", new_obs)
print("reward:", reward)
print("is game over?:", is_done)
print("printing new state:")
env.render()

taking action 2 (East)
new observation code: 173
reward: -1
is game over?: False
printing new state:
+---------+
|R: | : :[35mG[0m|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)


In [40]:
action_to_i = {
    'South':0,
    'North':1,
    'East':2,
    'West':3,
    "Pickup":4,
    "Dropoff":5
}

In [42]:
s,r,done,_=env.step(action_to_i['South'])
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)


In [43]:
done

False

In [45]:
def get_random_policy():
    """
    Build a numpy array representing agent policy.
    This array must have one element per each of 16 environment states.
    Element must be an integer from 0 to 3, representing action
    to take from that state.
    """
    return np.random.randint(0, n_actions, n_states)

In [46]:
np.random.seed(42)
policies = [get_random_policy() for i in range(10**4)]
assert all([len(p) == n_states for p in policies]), 'policy length should always be n_states'
assert np.min(policies) == 0, 'minimal action id should be 0'
assert np.max(policies) == n_actions-1, 'maximal action id should match n_actions-1'
action_probas = np.unique(policies, return_counts=True)[-1] /10**4. /n_states
print("Action frequencies over 10^4 samples:",action_probas)
assert np.allclose(action_probas, [1. / n_actions] * n_actions, atol=0.05), "The policies aren't uniformly random (maybe it's just an extremely bad luck)"
print("Seems fine!")

Action frequencies over 10^4 samples: [ 0.1664472  0.166829   0.1666948  0.1666484  0.1666524  0.1667282]
Seems fine!


In [47]:
def sample_reward(env, policy, t_max=100):
    """
    Interact with an environment, return sum of all rewards.
    If game doesn't end on t_max (e.g. agent walks into a wall), 
    force end the game and return whatever reward you got so far.
    Tip: see signature of env.step(...) method above.
    """
    s = env.reset()
    total_reward = 0
    
    for _ in range(t_max):
        action = policy[s]
        s, r, done, _ =  env.step(action)
        total_reward += r
        if done: 
            break
            
    return total_reward

In [49]:
print("generating 10^3 sessions...")
rewards = [sample_reward(env,get_random_policy()) for _ in range(10**3)]
assert all([type(r) in (int, float) for r in rewards]), 'sample_reward must return a single number'
print("Looks good!")

generating 10^3 sessions...
Looks good!


In [50]:
def evaluate(policy, n_times=100):
    """Run several evaluations and average the score the policy gets."""
    rewards = [sample_reward(env, policy) for _ in range(n_times)]
    return float(np.mean(rewards))

In [52]:
best_policy = None
best_score = -float('inf')

from tqdm import tqdm
for i in tqdm(range(10000)):
    policy = get_random_policy()
    score = evaluate(policy)
    if score > best_score:
        best_score = score
        best_policy = policy
        print("New best score:", score)


  0%|          | 0/10000 [00:00<?, ?it/s][A
  0%|          | 1/10000 [00:00<37:43,  4.42it/s][A

New best score: -521.02



  0%|          | 2/10000 [00:00<37:39,  4.42it/s]Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

  0%|          | 23/10000 [00:04<33:11,  5.01it/s]

New best score: -511.57


  0%|          | 38/10000 [00:07<34:07,  4.87it/s]

New best score: -502.66


  1%|          | 54/10000 [00:11<33:57,  4.88it/s]

New best score: -494.11


  1%|          | 59/10000 [00:12<33:52,  4.89it/s]

New best score: -492.76


  1%|          | 106/10000 [00:21<33:31,  4.92it/s]

New best score: -492.67


  1%|▏         | 129/10000 [00:26<33:09,  4.96it/s]

New best score: -467.11


  3%|▎         | 320/10000 [01:04<32:40,  4.94it/s]

New best score: -439.3


  8%|▊         | 784/10000 [02:37<30:54,  4.97it/s]

New best score: -413.11


 12%|█▏        | 1178/10000 [03:57<29:36,  4.97it/s]

New best score: -386.2


100%|██████████| 10000/10000 [35:27<00:00,  4.70it/s]


Заметим, что простой перебор работает просто отвратительно:( Попытаемся генерировать генетические алгоритмы

In [53]:
# мутация
# бежим по состояниям и рандомно выбираем 
# между стратегиями с вероятностью p
def crossover(policy1, policy2, p=0.5):
    """
    for each state, with probability p take action from policy1, else policy2
    """
    return np.array(
        [np.random.choice([policy1[i], policy2[i]], p=[p, 1 - p]) for i in range(n_states)]
        )

In [54]:
def mutation(policy, p=0.1):
    """
    for each state, with probability p replace action with random action
    Tip: mutation can be written as crossover with random policy
    """
    return crossover(policy, get_random_policy(),p=1-p)
    

In [55]:

n_epochs = 100 #how many cycles to make
pool_size = 100 #how many policies to maintain
n_crossovers = 50 #how many crossovers to make on each step
n_mutations = 50 #how many mutations to make on each tick


In [56]:
print("initializing...")
pool = [get_random_policy() for _ in range(pool_size)]
pool_scores = [evaluate(p) for p in pool]

initializing...


In [57]:
#main loop
from random import choice
for epoch in tqdm(range(n_epochs)):
    print("Epoch %s:"%epoch)
    
    crossovered = [crossover(choice(pool), choice(pool)) for _ in range(n_crossovers)]
    mutated = [mutation(choice(pool))]
    
    assert type(crossovered) == type(mutated) == list
    
    #add new policies to the pool
    pool = pool + crossovered + mutated
    pool_scores = [evaluate(p) for p in pool]
    
    #select pool_size best policies
    selected_indices = np.argsort(pool_scores)[-pool_size:]
    pool = [pool[i] for i in selected_indices]
    pool_scores = [pool_scores[i] for i in selected_indices]

    #print the best policy so far (last in ascending score order)
    print("best score:", pool_scores[-1])
    #print_policy(pool[-1])

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 0:


  1%|          | 1/100 [00:33<54:36, 33.10s/it]

best score: -457.66
Epoch 1:


  2%|▏         | 2/100 [01:05<53:41, 32.87s/it]

best score: -457.48
Epoch 2:


  3%|▎         | 3/100 [01:36<52:12, 32.30s/it]

best score: -422.47
Epoch 3:


  4%|▍         | 4/100 [02:06<50:41, 31.68s/it]

best score: -405.01
Epoch 4:


  5%|▌         | 5/100 [02:38<50:17, 31.76s/it]

best score: -351.28
Epoch 5:


  6%|▌         | 6/100 [03:09<49:30, 31.61s/it]

best score: -395.29
Epoch 6:


  7%|▋         | 7/100 [03:41<49:07, 31.70s/it]

best score: -378.1
Epoch 7:


  8%|▊         | 8/100 [04:12<48:28, 31.61s/it]

best score: -359.2
Epoch 8:


  9%|▉         | 9/100 [04:45<48:07, 31.74s/it]

best score: -404.83
Epoch 9:


 10%|█         | 10/100 [05:19<47:55, 31.95s/it]

best score: -342.1
Epoch 10:


 11%|█         | 11/100 [05:49<47:07, 31.77s/it]

best score: -359.83
Epoch 11:


 12%|█▏        | 12/100 [06:20<46:29, 31.69s/it]

best score: -395.29
Epoch 12:


 13%|█▎        | 13/100 [06:51<45:52, 31.64s/it]

best score: -350.56
Epoch 13:


 14%|█▍        | 14/100 [07:20<45:08, 31.50s/it]

best score: -368.38
Epoch 14:


 15%|█▌        | 15/100 [07:51<44:30, 31.42s/it]

best score: -368.65
Epoch 15:


 16%|█▌        | 16/100 [08:23<44:04, 31.48s/it]

best score: -368.2
Epoch 16:


 17%|█▋        | 17/100 [08:56<43:41, 31.58s/it]

best score: -341.92
Epoch 17:


 18%|█▊        | 18/100 [09:27<43:03, 31.50s/it]

best score: -350.2
Epoch 18:


 19%|█▉        | 19/100 [09:56<42:23, 31.40s/it]

best score: -314.56
Epoch 19:


 20%|██        | 20/100 [10:26<41:44, 31.31s/it]

best score: -323.38
Epoch 20:


 21%|██        | 21/100 [10:57<41:13, 31.30s/it]

best score: -269.83
Epoch 21:


 22%|██▏       | 22/100 [11:27<40:37, 31.25s/it]

best score: -324.46
Epoch 22:


 23%|██▎       | 23/100 [11:58<40:03, 31.22s/it]

best score: -333.19
Epoch 23:


 24%|██▍       | 24/100 [12:29<39:32, 31.22s/it]

best score: -287.74
Epoch 24:


 25%|██▌       | 25/100 [12:59<38:58, 31.18s/it]

best score: -261.28
Epoch 25:


 26%|██▌       | 26/100 [13:29<38:23, 31.13s/it]

best score: -288.1
Epoch 26:


 27%|██▋       | 27/100 [13:58<37:46, 31.05s/it]

best score: -323.65
Epoch 27:


 28%|██▊       | 28/100 [14:28<37:13, 31.03s/it]

best score: -288.19
Epoch 28:


 29%|██▉       | 29/100 [14:59<36:42, 31.02s/it]

best score: -270.19
Epoch 29:


 30%|███       | 30/100 [15:29<36:09, 31.00s/it]

best score: -288.01
Epoch 30:


 31%|███       | 31/100 [16:02<35:42, 31.05s/it]

best score: -306.01
Epoch 31:


 32%|███▏      | 32/100 [16:34<35:12, 31.07s/it]

best score: -269.38
Epoch 32:


 33%|███▎      | 33/100 [17:04<34:39, 31.04s/it]

best score: -296.83
Epoch 33:


 34%|███▍      | 34/100 [17:33<34:04, 30.98s/it]

best score: -260.11
Epoch 34:


 35%|███▌      | 35/100 [18:03<33:32, 30.95s/it]

best score: -269.47
Epoch 35:


 36%|███▌      | 36/100 [18:34<33:01, 30.96s/it]

best score: -278.92
Epoch 36:


 37%|███▋      | 37/100 [19:06<32:32, 30.99s/it]

best score: -252.19
Epoch 37:


 38%|███▊      | 38/100 [19:39<32:03, 31.03s/it]

best score: -261.46
Epoch 38:


 39%|███▉      | 39/100 [20:11<31:35, 31.08s/it]

best score: -243.19
Epoch 39:


 40%|████      | 40/100 [20:43<31:05, 31.09s/it]

best score: -252.55
Epoch 40:


 41%|████      | 41/100 [21:14<30:33, 31.08s/it]

best score: -261.1
Epoch 41:


 42%|████▏     | 42/100 [21:44<30:01, 31.07s/it]

best score: -225.46
Epoch 42:


 43%|████▎     | 43/100 [22:15<29:30, 31.07s/it]

best score: -225.55
Epoch 43:


 44%|████▍     | 44/100 [22:45<28:58, 31.04s/it]

best score: -225.28
Epoch 44:


 45%|████▌     | 45/100 [23:15<28:25, 31.01s/it]

best score: -198.73
Epoch 45:


 46%|████▌     | 46/100 [23:45<27:53, 30.99s/it]

best score: -207.73
Epoch 46:


 47%|████▋     | 47/100 [24:16<27:22, 30.98s/it]

best score: -198.37
Epoch 47:


 48%|████▊     | 48/100 [24:47<26:51, 30.99s/it]

best score: -207.37
Epoch 48:


 49%|████▉     | 49/100 [25:19<26:22, 31.02s/it]

best score: -162.01
Epoch 49:


 50%|█████     | 50/100 [25:50<25:50, 31.01s/it]

best score: -207.28
Epoch 50:


 51%|█████     | 51/100 [26:22<25:20, 31.03s/it]

best score: -180.73
Epoch 51:


 52%|█████▏    | 52/100 [26:54<24:50, 31.06s/it]

best score: -162.73
Epoch 52:


 53%|█████▎    | 53/100 [27:25<24:19, 31.05s/it]

best score: -179.92
Epoch 53:


 54%|█████▍    | 54/100 [27:55<23:47, 31.02s/it]

best score: -162.73
Epoch 54:


 55%|█████▌    | 55/100 [28:26<23:15, 31.02s/it]

best score: -162.55
Epoch 55:


 56%|█████▌    | 56/100 [28:56<22:44, 31.00s/it]

best score: -162.64
Epoch 56:


 57%|█████▋    | 57/100 [29:26<22:12, 30.99s/it]

best score: -162.82
Epoch 57:


 58%|█████▊    | 58/100 [29:56<21:41, 30.98s/it]

best score: -162.82
Epoch 58:


 59%|█████▉    | 59/100 [30:30<21:11, 31.02s/it]

best score: -144.73
Epoch 59:


 60%|██████    | 60/100 [31:02<20:41, 31.03s/it]

best score: -162.64
Epoch 60:


 61%|██████    | 61/100 [31:33<20:10, 31.04s/it]

best score: -162.55
Epoch 61:


 62%|██████▏   | 62/100 [32:04<19:39, 31.05s/it]

best score: -153.64
Epoch 62:


 63%|██████▎   | 63/100 [32:34<19:07, 31.02s/it]

best score: -135.73
Epoch 63:


 64%|██████▍   | 64/100 [33:04<18:36, 31.00s/it]

best score: -135.91
Epoch 64:


 65%|██████▌   | 65/100 [33:35<18:05, 31.01s/it]

best score: -144.82
Epoch 65:


 66%|██████▌   | 66/100 [34:06<17:34, 31.01s/it]

best score: -144.64
Epoch 66:


 67%|██████▋   | 67/100 [34:36<17:02, 31.00s/it]

best score: -126.64
Epoch 67:


 68%|██████▊   | 68/100 [35:06<16:31, 30.97s/it]

best score: -145.0
Epoch 68:


 69%|██████▉   | 69/100 [35:36<16:00, 30.97s/it]

best score: -127.0
Epoch 69:


 70%|███████   | 70/100 [36:08<15:29, 30.98s/it]

best score: -135.91
Epoch 70:


 71%|███████   | 71/100 [36:40<14:58, 31.00s/it]

best score: -144.73
Epoch 71:


 72%|███████▏  | 72/100 [37:11<14:27, 30.99s/it]

best score: -126.73
Epoch 72:


 73%|███████▎  | 73/100 [37:41<13:56, 30.98s/it]

best score: -127.0
Epoch 73:


 74%|███████▍  | 74/100 [38:10<13:24, 30.95s/it]

best score: -127.0
Epoch 74:


 75%|███████▌  | 75/100 [38:39<12:53, 30.93s/it]

best score: -117.91
Epoch 75:


 76%|███████▌  | 76/100 [39:10<12:22, 30.92s/it]

best score: -118.0
Epoch 76:


 77%|███████▋  | 77/100 [39:40<11:51, 30.92s/it]

best score: -117.91
Epoch 77:


 78%|███████▊  | 78/100 [40:11<11:20, 30.92s/it]

best score: -126.73
Epoch 78:


 79%|███████▉  | 79/100 [40:42<10:49, 30.92s/it]

best score: -117.91
Epoch 79:


 80%|████████  | 80/100 [41:12<10:18, 30.91s/it]

best score: -108.91
Epoch 80:


 81%|████████  | 81/100 [41:44<09:47, 30.91s/it]

best score: -117.91
Epoch 81:


 82%|████████▏ | 82/100 [42:14<09:16, 30.91s/it]

best score: -109.0
Epoch 82:


 83%|████████▎ | 83/100 [42:43<08:45, 30.89s/it]

best score: -117.82
Epoch 83:


 84%|████████▍ | 84/100 [43:16<08:14, 30.91s/it]

best score: -108.91
Epoch 84:


 85%|████████▌ | 85/100 [43:47<07:43, 30.91s/it]

best score: -100.0
Epoch 85:


 86%|████████▌ | 86/100 [44:19<07:13, 30.93s/it]

best score: -100.0
Epoch 86:


 87%|████████▋ | 87/100 [44:50<06:42, 30.93s/it]

best score: -100.0
Epoch 87:


 88%|████████▊ | 88/100 [45:21<06:11, 30.93s/it]

best score: -100.0
Epoch 88:


 89%|████████▉ | 89/100 [45:52<05:40, 30.93s/it]

best score: -100.0
Epoch 89:


 90%|█████████ | 90/100 [46:24<05:09, 30.94s/it]

best score: -100.0
Epoch 90:


 91%|█████████ | 91/100 [46:55<04:38, 30.94s/it]

best score: -100.0
Epoch 91:


 92%|█████████▏| 92/100 [47:27<04:07, 30.95s/it]

best score: -100.0
Epoch 92:


 93%|█████████▎| 93/100 [48:00<03:36, 30.97s/it]

best score: -100.0
Epoch 93:


 94%|█████████▍| 94/100 [48:32<03:05, 30.99s/it]

best score: -100.0
Epoch 94:


 95%|█████████▌| 95/100 [49:06<02:35, 31.01s/it]

best score: -100.0
Epoch 95:


 96%|█████████▌| 96/100 [49:36<02:04, 31.01s/it]

best score: -100.0
Epoch 96:


 97%|█████████▋| 97/100 [50:07<01:33, 31.00s/it]

best score: -100.0
Epoch 97:


 98%|█████████▊| 98/100 [50:36<01:01, 30.98s/it]

best score: -100.0
Epoch 98:


 99%|█████████▉| 99/100 [51:10<00:31, 31.01s/it]

best score: -100.0
Epoch 99:


100%|██████████| 100/100 [51:43<00:00, 31.03s/it]

best score: -100.0



