# Use Off-Policy VPG to Play CartPole-v0

TensorFlow version

In [1]:
%matplotlib inline

import sys
import logging
import imp
import itertools

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow.keras import layers
from tensorflow.keras import losses

# Reload the logging module before configuring it -- presumably so that
# basicConfig() takes effect even if handlers were already installed in this
# kernel (TODO confirm). All records from DEBUG upwards go to stdout.
imp.reload(logging)
logging.basicConfig(
        stream=sys.stdout,
        level=logging.DEBUG,
        datefmt='%H:%M:%S',
        format='%(asctime)s [%(levelname)s] %(message)s')

In [2]:
# Create the CartPole-v0 environment and seed it with 0.
env = gym.make('CartPole-v0')
env.seed(0)
# Log every instance attribute of the environment object, one per line.
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)

22:31:54 [INFO] env: <CartPoleEnv<CartPole-v0>>
22:31:54 [INFO] action_space: Discrete(2)
22:31:54 [INFO] observation_space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
22:31:54 [INFO] reward_range: (-inf, inf)
22:31:54 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
22:31:54 [INFO] _max_episode_steps: 200
22:31:54 [INFO] _elapsed_steps: None


In [3]:
class OffPolicyVPGAgent:
    """Off-policy vanilla policy gradient (VPG) agent for discrete actions.

    In ``'train'`` mode the agent behaves uniformly at random, records the
    trajectory, and performs one importance-weighted policy update when
    ``close()`` is called. In any other mode it samples actions from the
    probabilities produced by the policy network and does not learn.
    """

    def __init__(self, env, gamma=0.99, learning_rate=0.06):
        """Create the agent and its policy network.

        Args:
            env: environment; only ``env.action_space.n`` (the number of
                discrete actions) is read.
            gamma: discount factor used when computing returns.
            learning_rate: learning rate of the policy network's optimizer.
        """
        self.action_n = env.action_space.n
        self.gamma = gamma
        # Defined up front so that step()/close() do not raise AttributeError
        # when they are called before the first reset().
        self.mode = None
        self.trajectory = []

        def dot(y_true, y_pred):
            # Negated dot product of target and prediction. With a one-hot
            # target this is minus the probability assigned to the taken
            # action, so minimizing it raises that probability.
            return -tf.reduce_sum(y_true * y_pred, axis=-1)

        # No hidden layers: the policy is a single softmax layer.
        self.policy_net = self.build_net(hidden_sizes=[],
                output_size=self.action_n,
                output_activation=nn.softmax,
                loss=dot, learning_rate=learning_rate)

    def build_net(self, hidden_sizes, output_size,
            activation=nn.relu, output_activation=None,
            use_bias=False, loss=losses.mse, learning_rate=0.001):
        """Build and compile a fully connected Keras model.

        Args:
            hidden_sizes: iterable with the number of units of each hidden
                layer; may be empty.
            output_size: number of units of the output layer.
            activation: activation of the hidden layers.
            output_activation: activation of the output layer.
            use_bias: whether the Dense layers use a bias term.
            loss: loss passed to ``model.compile``.
            learning_rate: learning rate of the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential()
        for hidden_size in hidden_sizes:
            model.add(layers.Dense(units=hidden_size,
                    activation=activation, use_bias=use_bias))
        model.add(layers.Dense(units=output_size,
                activation=output_activation, use_bias=use_bias))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        """Start a new episode; in 'train' mode also clear the trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, done):
        """Choose an action for ``observation``.

        In 'train' mode the action is drawn uniformly at random and the
        (observation, reward, done, action) tuple is appended to the
        trajectory. Otherwise the action is sampled from the policy network's
        output probabilities.

        Returns:
            The index of the chosen action.
        """
        if self.mode == 'train':
            action = np.random.choice(self.action_n)  # use random policy
            # Stored flat; learn() reshapes the list into rows of four.
            self.trajectory += [observation, reward, done, action]
        else:
            # verbose=0: predict() is called once per step, so any progress
            # output would flood the notebook.
            probs = self.policy_net.predict(observation[np.newaxis],
                    verbose=0)[0]
            action = np.random.choice(self.action_n, p=probs)
        return action

    def close(self):
        """Finish the episode; in 'train' mode update the policy."""
        if self.mode == 'train':
            self.learn()

    def learn(self):
        """Run one policy update from the recorded trajectory.

        Each recorded step is weighted by its discounted return divided by
        the behavior policy's action probability (uniform, 1 / action_n).
        Does nothing when no step has been recorded.
        """
        if not self.trajectory:
            # Nothing to learn from; np.stack below would fail on empty input.
            return
        df = pd.DataFrame(
                np.array(self.trajectory, dtype=object).reshape(-1, 4),
                columns=['state', 'reward', 'done', 'action'])
        df['discount'] = self.gamma ** df.index.to_series()
        df['discounted_reward'] = df['discount'] * df['reward'].astype(float)
        # Reversed cumulative sum; the assignment realigns on the index, so
        # row t holds the sum of discounted rewards from row t to the end.
        df['discounted_return'] = df['discounted_reward'][::-1].cumsum()
        states = np.stack(df['state'])
        actions = np.eye(self.action_n)[df['action'].astype(int)]  # one-hot
        df['behavior_prob'] = 1. / self.action_n
        df['sample_weight'] = df['discounted_return'] / df['behavior_prob']
        sample_weight = df[['sample_weight',]].values
        self.policy_net.fit(states, actions, sample_weight=sample_weight,
                verbose=0)


# Build the agent for the CartPole environment created in the earlier cell.
agent = OffPolicyVPGAgent(env)

In [None]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of `agent` interacting with `env`.

    The agent is reset with `mode`, asked for an action on every observation
    (including the final one, where `done` is True) and closed at the end.

    Args:
        env: environment providing `reset()`, `step(action)` and `render()`.
        agent: agent providing `reset(mode=...)`, `step(...)` and `close()`.
        max_episode_steps: optional cap on the number of environment steps;
            a falsy value means no cap.
        mode: value forwarded to `agent.reset`.
        render: whether to call `env.render()` after each agent decision.

    Returns:
        Tuple `(episode_reward, elapsed_steps)`.
    """
    observation = env.reset()
    reward = 0.
    done = False
    agent.reset(mode=mode)

    episode_reward = 0.
    elapsed_steps = 0
    step_limit_reached = False
    while not step_limit_reached:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            # The agent has already seen the terminal observation.
            break
        observation, reward, done, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
        step_limit_reached = (bool(max_episode_steps)
                and elapsed_steps >= max_episode_steps)

    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train & verify ====')
episode_rewards = []
episode = 0
while True:
    # Training episode on the unwrapped environment, with the step cap
    # applied explicitly through max_episode_steps.
    play_episode(env.unwrapped, agent, mode='train',
            max_episode_steps=env._max_episode_steps)
    # Verification episode using the learned policy (no learning).
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.debug('verify episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    # Stop once the mean of the most recent (up to 10) verification rewards
    # exceeds 195.
    if np.mean(episode_rewards[-10:]) > 195:
        break
    episode += 1
plt.plot(episode_rewards)


logging.info('==== test ====')
# Play 100 episodes with the learned policy and report mean and standard
# deviation of the episode rewards.
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    logging.debug('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    episode_rewards.append(episode_reward)
logging.info(
        'average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards),
        np.std(episode_rewards))

22:31:55 [INFO] ==== train & verify ====
22:31:55 [INFO] NumExpr defaulting to 8 threads.
22:32:05 [DEBUG] verify episode 0: reward = 34.00, steps = 34
22:32:08 [DEBUG] verify episode 1: reward = 16.00, steps = 16
22:32:12 [DEBUG] verify episode 2: reward = 20.00, steps = 20
22:32:15 [DEBUG] verify episode 3: reward = 17.00, steps = 17
22:32:21 [DEBUG] verify episode 4: reward = 26.00, steps = 26
22:32:23 [DEBUG] verify episode 5: reward = 12.00, steps = 12
22:32:29 [DEBUG] verify episode 6: reward = 25.00, steps = 25
22:32:32 [DEBUG] verify episode 7: reward = 19.00, steps = 19
22:32:41 [DEBUG] verify episode 8: reward = 37.00, steps = 37
22:32:47 [DEBUG] verify episode 9: reward = 27.00, steps = 27
22:32:50 [DEBUG] verify episode 10: reward = 16.00, steps = 16
22:32:54 [DEBUG] verify episode 11: reward = 23.00, steps = 23
22:32:55 [DEBUG] verify episode 12: reward = 12.00, steps = 12
22:32:59 [DEBUG] verify episode 13: reward = 24.00, steps = 24
22:33:01 [DEBUG] verify episode 14: re

22:48:53 [DEBUG] verify episode 129: reward = 59.00, steps = 59
22:49:00 [DEBUG] verify episode 130: reward = 53.00, steps = 53
22:49:20 [DEBUG] verify episode 131: reward = 145.00, steps = 145
22:49:32 [DEBUG] verify episode 132: reward = 93.00, steps = 93
22:49:43 [DEBUG] verify episode 133: reward = 85.00, steps = 85
22:49:54 [DEBUG] verify episode 134: reward = 81.00, steps = 81
22:49:59 [DEBUG] verify episode 135: reward = 40.00, steps = 40
22:50:08 [DEBUG] verify episode 136: reward = 56.00, steps = 56
22:50:18 [DEBUG] verify episode 137: reward = 59.00, steps = 59
22:50:26 [DEBUG] verify episode 138: reward = 52.00, steps = 52
22:50:31 [DEBUG] verify episode 139: reward = 39.00, steps = 39
22:50:53 [DEBUG] verify episode 140: reward = 143.00, steps = 143
22:51:11 [DEBUG] verify episode 141: reward = 104.00, steps = 104
22:51:22 [DEBUG] verify episode 142: reward = 65.00, steps = 65
22:51:30 [DEBUG] verify episode 143: reward = 50.00, steps = 50
22:51:41 [DEBUG] verify episode 14

23:13:32 [DEBUG] verify episode 257: reward = 121.00, steps = 121
23:13:41 [DEBUG] verify episode 258: reward = 75.00, steps = 75
23:13:47 [DEBUG] verify episode 259: reward = 42.00, steps = 42
23:13:51 [DEBUG] verify episode 260: reward = 41.00, steps = 41
23:14:06 [DEBUG] verify episode 261: reward = 123.00, steps = 123
23:14:13 [DEBUG] verify episode 262: reward = 58.00, steps = 58
23:14:20 [DEBUG] verify episode 263: reward = 58.00, steps = 58
23:14:32 [DEBUG] verify episode 264: reward = 105.00, steps = 105
23:14:40 [DEBUG] verify episode 265: reward = 61.00, steps = 61
23:14:47 [DEBUG] verify episode 266: reward = 63.00, steps = 63
23:14:56 [DEBUG] verify episode 267: reward = 74.00, steps = 74
23:15:05 [DEBUG] verify episode 268: reward = 71.00, steps = 71
23:15:12 [DEBUG] verify episode 269: reward = 56.00, steps = 56
23:15:26 [DEBUG] verify episode 270: reward = 113.00, steps = 113
23:15:33 [DEBUG] verify episode 271: reward = 60.00, steps = 60
23:15:43 [DEBUG] verify episode 

23:35:53 [DEBUG] verify episode 385: reward = 92.00, steps = 92
23:36:00 [DEBUG] verify episode 386: reward = 66.00, steps = 66
23:36:10 [DEBUG] verify episode 387: reward = 101.00, steps = 101
23:36:19 [DEBUG] verify episode 388: reward = 86.00, steps = 86
23:36:28 [DEBUG] verify episode 389: reward = 92.00, steps = 92
23:36:36 [DEBUG] verify episode 390: reward = 73.00, steps = 73
23:36:42 [DEBUG] verify episode 391: reward = 60.00, steps = 60
23:36:49 [DEBUG] verify episode 392: reward = 71.00, steps = 71
23:37:01 [DEBUG] verify episode 393: reward = 109.00, steps = 109
23:37:11 [DEBUG] verify episode 394: reward = 95.00, steps = 95
23:37:18 [DEBUG] verify episode 395: reward = 74.00, steps = 74
23:37:30 [DEBUG] verify episode 396: reward = 117.00, steps = 117
23:37:41 [DEBUG] verify episode 397: reward = 110.00, steps = 110
23:37:50 [DEBUG] verify episode 398: reward = 89.00, steps = 89
23:38:01 [DEBUG] verify episode 399: reward = 103.00, steps = 103
23:38:12 [DEBUG] verify episod

23:58:16 [DEBUG] verify episode 512: reward = 90.00, steps = 90
23:58:25 [DEBUG] verify episode 513: reward = 88.00, steps = 88
23:58:34 [DEBUG] verify episode 514: reward = 92.00, steps = 92
23:58:43 [DEBUG] verify episode 515: reward = 89.00, steps = 89
23:58:49 [DEBUG] verify episode 516: reward = 59.00, steps = 59
23:58:57 [DEBUG] verify episode 517: reward = 82.00, steps = 82
23:59:05 [DEBUG] verify episode 518: reward = 77.00, steps = 77
23:59:14 [DEBUG] verify episode 519: reward = 90.00, steps = 90
23:59:26 [DEBUG] verify episode 520: reward = 116.00, steps = 116
23:59:33 [DEBUG] verify episode 521: reward = 69.00, steps = 69
23:59:46 [DEBUG] verify episode 522: reward = 124.00, steps = 124
00:00:02 [DEBUG] verify episode 523: reward = 169.00, steps = 169
00:00:13 [DEBUG] verify episode 524: reward = 104.00, steps = 104
00:00:20 [DEBUG] verify episode 525: reward = 76.00, steps = 76
00:00:28 [DEBUG] verify episode 526: reward = 75.00, steps = 75
00:00:40 [DEBUG] verify episode 

00:21:37 [DEBUG] verify episode 638: reward = 125.00, steps = 125
00:21:46 [DEBUG] verify episode 639: reward = 87.00, steps = 87
00:21:55 [DEBUG] verify episode 640: reward = 95.00, steps = 95
00:22:03 [DEBUG] verify episode 641: reward = 81.00, steps = 81
00:22:13 [DEBUG] verify episode 642: reward = 100.00, steps = 100
00:22:28 [DEBUG] verify episode 643: reward = 143.00, steps = 143
00:22:36 [DEBUG] verify episode 644: reward = 79.00, steps = 79
00:22:52 [DEBUG] verify episode 645: reward = 153.00, steps = 153
00:23:08 [DEBUG] verify episode 646: reward = 166.00, steps = 166
00:23:17 [DEBUG] verify episode 647: reward = 92.00, steps = 92
00:23:30 [DEBUG] verify episode 648: reward = 129.00, steps = 129
00:23:41 [DEBUG] verify episode 649: reward = 112.00, steps = 112
00:23:50 [DEBUG] verify episode 650: reward = 90.00, steps = 90
00:23:58 [DEBUG] verify episode 651: reward = 75.00, steps = 75
00:24:12 [DEBUG] verify episode 652: reward = 143.00, steps = 143
00:24:22 [DEBUG] verify 

00:48:15 [DEBUG] verify episode 763: reward = 200.00, steps = 200
00:48:29 [DEBUG] verify episode 764: reward = 144.00, steps = 144
00:48:42 [DEBUG] verify episode 765: reward = 154.00, steps = 154
00:48:59 [DEBUG] verify episode 766: reward = 195.00, steps = 195
00:49:12 [DEBUG] verify episode 767: reward = 148.00, steps = 148
00:49:24 [DEBUG] verify episode 768: reward = 149.00, steps = 149
00:49:35 [DEBUG] verify episode 769: reward = 128.00, steps = 128
00:49:46 [DEBUG] verify episode 770: reward = 126.00, steps = 126
00:49:59 [DEBUG] verify episode 771: reward = 150.00, steps = 150
00:50:10 [DEBUG] verify episode 772: reward = 127.00, steps = 127
00:50:26 [DEBUG] verify episode 773: reward = 200.00, steps = 200
00:50:40 [DEBUG] verify episode 774: reward = 162.00, steps = 162
00:50:57 [DEBUG] verify episode 775: reward = 200.00, steps = 200
00:51:00 [DEBUG] verify episode 776: reward = 23.00, steps = 23
00:51:12 [DEBUG] verify episode 777: reward = 146.00, steps = 146
00:51:28 [DE

01:13:45 [DEBUG] verify episode 888: reward = 200.00, steps = 200
01:13:58 [DEBUG] verify episode 889: reward = 152.00, steps = 152
01:14:10 [DEBUG] verify episode 890: reward = 140.00, steps = 140
01:14:22 [DEBUG] verify episode 891: reward = 131.00, steps = 131
01:14:34 [DEBUG] verify episode 892: reward = 141.00, steps = 141
01:14:45 [DEBUG] verify episode 893: reward = 126.00, steps = 126
01:14:53 [DEBUG] verify episode 894: reward = 97.00, steps = 97
01:15:07 [DEBUG] verify episode 895: reward = 152.00, steps = 152
01:15:19 [DEBUG] verify episode 896: reward = 148.00, steps = 148
01:15:28 [DEBUG] verify episode 897: reward = 109.00, steps = 109
01:15:45 [DEBUG] verify episode 898: reward = 200.00, steps = 200
01:15:53 [DEBUG] verify episode 899: reward = 101.00, steps = 101
01:16:05 [DEBUG] verify episode 900: reward = 140.00, steps = 140
01:16:15 [DEBUG] verify episode 901: reward = 134.00, steps = 134
01:16:29 [DEBUG] verify episode 902: reward = 166.00, steps = 166
01:16:45 [DE

01:38:22 [DEBUG] verify episode 1013: reward = 83.00, steps = 83
01:38:33 [DEBUG] verify episode 1014: reward = 127.00, steps = 127
01:38:48 [DEBUG] verify episode 1015: reward = 163.00, steps = 163
01:39:03 [DEBUG] verify episode 1016: reward = 166.00, steps = 166
01:39:14 [DEBUG] verify episode 1017: reward = 130.00, steps = 130
01:39:22 [DEBUG] verify episode 1018: reward = 95.00, steps = 95
01:39:34 [DEBUG] verify episode 1019: reward = 134.00, steps = 134
01:39:46 [DEBUG] verify episode 1020: reward = 139.00, steps = 139
01:39:54 [DEBUG] verify episode 1021: reward = 96.00, steps = 96
01:40:08 [DEBUG] verify episode 1022: reward = 155.00, steps = 155
01:40:17 [DEBUG] verify episode 1023: reward = 107.00, steps = 107
01:40:26 [DEBUG] verify episode 1024: reward = 107.00, steps = 107
01:40:36 [DEBUG] verify episode 1025: reward = 105.00, steps = 105
01:40:48 [DEBUG] verify episode 1026: reward = 147.00, steps = 147
01:40:59 [DEBUG] verify episode 1027: reward = 125.00, steps = 125
0

02:01:04 [DEBUG] verify episode 1136: reward = 122.00, steps = 122
02:01:12 [DEBUG] verify episode 1137: reward = 106.00, steps = 106
02:01:21 [DEBUG] verify episode 1138: reward = 101.00, steps = 101
02:01:31 [DEBUG] verify episode 1139: reward = 110.00, steps = 110
02:01:45 [DEBUG] verify episode 1140: reward = 168.00, steps = 168
02:02:02 [DEBUG] verify episode 1141: reward = 200.00, steps = 200
02:02:12 [DEBUG] verify episode 1142: reward = 123.00, steps = 123
02:02:24 [DEBUG] verify episode 1143: reward = 138.00, steps = 138
02:02:34 [DEBUG] verify episode 1144: reward = 116.00, steps = 116
02:02:48 [DEBUG] verify episode 1145: reward = 162.00, steps = 162
02:03:00 [DEBUG] verify episode 1146: reward = 137.00, steps = 137
02:03:15 [DEBUG] verify episode 1147: reward = 173.00, steps = 173
02:03:25 [DEBUG] verify episode 1148: reward = 116.00, steps = 116
02:03:34 [DEBUG] verify episode 1149: reward = 106.00, steps = 106
02:03:43 [DEBUG] verify episode 1150: reward = 110.00, steps =

02:23:20 [DEBUG] verify episode 1259: reward = 146.00, steps = 146
02:23:32 [DEBUG] verify episode 1260: reward = 139.00, steps = 139
02:23:46 [DEBUG] verify episode 1261: reward = 167.00, steps = 167
02:23:56 [DEBUG] verify episode 1262: reward = 123.00, steps = 123
02:24:07 [DEBUG] verify episode 1263: reward = 127.00, steps = 127
02:24:24 [DEBUG] verify episode 1264: reward = 200.00, steps = 200
02:24:41 [DEBUG] verify episode 1265: reward = 197.00, steps = 197
02:24:51 [DEBUG] verify episode 1266: reward = 111.00, steps = 111
02:25:01 [DEBUG] verify episode 1267: reward = 122.00, steps = 122
02:25:11 [DEBUG] verify episode 1268: reward = 109.00, steps = 109
02:25:27 [DEBUG] verify episode 1269: reward = 183.00, steps = 183
02:25:36 [DEBUG] verify episode 1270: reward = 111.00, steps = 111
02:25:48 [DEBUG] verify episode 1271: reward = 135.00, steps = 135
02:25:57 [DEBUG] verify episode 1272: reward = 102.00, steps = 102
02:26:05 [DEBUG] verify episode 1273: reward = 91.00, steps = 

02:48:02 [DEBUG] verify episode 1382: reward = 115.00, steps = 115
02:48:17 [DEBUG] verify episode 1383: reward = 166.00, steps = 166
02:48:27 [DEBUG] verify episode 1384: reward = 123.00, steps = 123
02:48:45 [DEBUG] verify episode 1385: reward = 200.00, steps = 200
02:48:58 [DEBUG] verify episode 1386: reward = 154.00, steps = 154
02:49:09 [DEBUG] verify episode 1387: reward = 132.00, steps = 132
02:49:26 [DEBUG] verify episode 1388: reward = 200.00, steps = 200
02:49:40 [DEBUG] verify episode 1389: reward = 169.00, steps = 169
02:49:57 [DEBUG] verify episode 1390: reward = 200.00, steps = 200
02:50:07 [DEBUG] verify episode 1391: reward = 114.00, steps = 114
02:50:18 [DEBUG] verify episode 1392: reward = 132.00, steps = 132
02:50:31 [DEBUG] verify episode 1393: reward = 148.00, steps = 148
02:50:42 [DEBUG] verify episode 1394: reward = 132.00, steps = 132
02:50:58 [DEBUG] verify episode 1395: reward = 189.00, steps = 189
02:51:11 [DEBUG] verify episode 1396: reward = 152.00, steps =

03:14:52 [DEBUG] verify episode 1505: reward = 161.00, steps = 161
03:15:03 [DEBUG] verify episode 1506: reward = 137.00, steps = 137
03:15:15 [DEBUG] verify episode 1507: reward = 139.00, steps = 139
03:15:25 [DEBUG] verify episode 1508: reward = 122.00, steps = 122
03:15:41 [DEBUG] verify episode 1509: reward = 200.00, steps = 200
03:15:57 [DEBUG] verify episode 1510: reward = 200.00, steps = 200
03:16:07 [DEBUG] verify episode 1511: reward = 127.00, steps = 127
03:16:17 [DEBUG] verify episode 1512: reward = 125.00, steps = 125
03:16:29 [DEBUG] verify episode 1513: reward = 152.00, steps = 152
03:16:39 [DEBUG] verify episode 1514: reward = 118.00, steps = 118
03:16:55 [DEBUG] verify episode 1515: reward = 197.00, steps = 197
03:17:08 [DEBUG] verify episode 1516: reward = 169.00, steps = 169
03:17:23 [DEBUG] verify episode 1517: reward = 183.00, steps = 183
03:17:38 [DEBUG] verify episode 1518: reward = 189.00, steps = 189
03:17:47 [DEBUG] verify episode 1519: reward = 110.00, steps =

03:40:53 [DEBUG] verify episode 1628: reward = 200.00, steps = 200
03:41:03 [DEBUG] verify episode 1629: reward = 113.00, steps = 113
03:41:18 [DEBUG] verify episode 1630: reward = 195.00, steps = 195
03:41:27 [DEBUG] verify episode 1631: reward = 106.00, steps = 106
03:41:35 [DEBUG] verify episode 1632: reward = 105.00, steps = 105
03:41:51 [DEBUG] verify episode 1633: reward = 189.00, steps = 189
03:42:04 [DEBUG] verify episode 1634: reward = 166.00, steps = 166
03:42:18 [DEBUG] verify episode 1635: reward = 179.00, steps = 179
03:42:31 [DEBUG] verify episode 1636: reward = 154.00, steps = 154
03:42:43 [DEBUG] verify episode 1637: reward = 151.00, steps = 151
03:42:59 [DEBUG] verify episode 1638: reward = 200.00, steps = 200
03:43:15 [DEBUG] verify episode 1639: reward = 200.00, steps = 200
03:43:25 [DEBUG] verify episode 1640: reward = 118.00, steps = 118
03:43:41 [DEBUG] verify episode 1641: reward = 200.00, steps = 200
03:43:56 [DEBUG] verify episode 1642: reward = 191.00, steps =

04:06:35 [DEBUG] verify episode 1751: reward = 155.00, steps = 155
04:06:44 [DEBUG] verify episode 1752: reward = 110.00, steps = 110
04:06:59 [DEBUG] verify episode 1753: reward = 189.00, steps = 189
04:07:11 [DEBUG] verify episode 1754: reward = 156.00, steps = 156
04:07:22 [DEBUG] verify episode 1755: reward = 135.00, steps = 135
04:07:38 [DEBUG] verify episode 1756: reward = 188.00, steps = 188
04:07:51 [DEBUG] verify episode 1757: reward = 162.00, steps = 162
04:08:03 [DEBUG] verify episode 1758: reward = 157.00, steps = 157
04:08:12 [DEBUG] verify episode 1759: reward = 112.00, steps = 112
04:08:25 [DEBUG] verify episode 1760: reward = 158.00, steps = 158
04:08:36 [DEBUG] verify episode 1761: reward = 131.00, steps = 131
04:08:47 [DEBUG] verify episode 1762: reward = 136.00, steps = 136
04:09:02 [DEBUG] verify episode 1763: reward = 182.00, steps = 182
04:09:14 [DEBUG] verify episode 1764: reward = 158.00, steps = 158
04:09:31 [DEBUG] verify episode 1765: reward = 200.00, steps =

04:34:12 [DEBUG] verify episode 1874: reward = 165.00, steps = 165
04:34:28 [DEBUG] verify episode 1875: reward = 200.00, steps = 200
04:34:43 [DEBUG] verify episode 1876: reward = 200.00, steps = 200
04:34:59 [DEBUG] verify episode 1877: reward = 200.00, steps = 200
04:35:15 [DEBUG] verify episode 1878: reward = 196.00, steps = 196
04:35:30 [DEBUG] verify episode 1879: reward = 195.00, steps = 195
04:35:47 [DEBUG] verify episode 1880: reward = 200.00, steps = 200
04:36:01 [DEBUG] verify episode 1881: reward = 177.00, steps = 177
04:36:12 [DEBUG] verify episode 1882: reward = 144.00, steps = 144
04:36:24 [DEBUG] verify episode 1883: reward = 146.00, steps = 146
04:36:38 [DEBUG] verify episode 1884: reward = 179.00, steps = 179
04:36:52 [DEBUG] verify episode 1885: reward = 176.00, steps = 176
04:37:08 [DEBUG] verify episode 1886: reward = 198.00, steps = 198
04:37:24 [DEBUG] verify episode 1887: reward = 200.00, steps = 200
04:37:38 [DEBUG] verify episode 1888: reward = 171.00, steps =

05:03:48 [DEBUG] test episode 4: reward = 172.00, steps = 172
05:04:03 [DEBUG] test episode 5: reward = 189.00, steps = 189
05:04:20 [DEBUG] test episode 6: reward = 200.00, steps = 200
05:04:37 [DEBUG] test episode 7: reward = 200.00, steps = 200
05:04:53 [DEBUG] test episode 8: reward = 200.00, steps = 200
05:05:10 [DEBUG] test episode 9: reward = 200.00, steps = 200
05:05:26 [DEBUG] test episode 10: reward = 200.00, steps = 200
05:05:43 [DEBUG] test episode 11: reward = 200.00, steps = 200
05:05:59 [DEBUG] test episode 12: reward = 200.00, steps = 200
05:06:16 [DEBUG] test episode 13: reward = 200.00, steps = 200
05:06:31 [DEBUG] test episode 14: reward = 186.00, steps = 186
05:06:47 [DEBUG] test episode 15: reward = 200.00, steps = 200
05:07:04 [DEBUG] test episode 16: reward = 200.00, steps = 200
05:07:21 [DEBUG] test episode 17: reward = 200.00, steps = 200
05:07:36 [DEBUG] test episode 18: reward = 186.00, steps = 186
05:07:51 [DEBUG] test episode 19: reward = 174.00, steps = 17

In [None]:
# Close the environment once all episodes have been played.
env.close()