In [1]:
import os
import argparse
import datetime

import torch
from torch import nn
import numpy as np
import gymnasium as gym

from tianshou.data import Collector, Batch, to_torch
from tianshou.data.types import RolloutBatchProtocol
from tianshou.data.buffer.vecbuf import VectorReplayBuffer, ReplayBuffer
from tianshou.env import SubprocVectorEnv
from tianshou.policy import SACPolicy, BasePolicy
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic
from examples.offline.utils import load_buffer_d4rl
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
from tianshou.trainer import OffpolicyTrainer

/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/glfw/__init__.py:916: GLFWError: (65544) b'X11: The DISPLAY environment variable is missing'
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 303, in register_gymnasium_envs
    _register_dm_control_envs()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 63, in _register_dm_control_envs
    from shimmy.dm_control_compatibility import DmControlCompatibilityV0
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/dm_control_compatibility.py", line 13, in <module>
    import dm_env
ModuleNotFoundError: No module named 'dm_env'
[0m
  logger.warn(f"plugin: {plugin.value} raised {traceback.format_exc()}")
No module named 'mjrl'
No module named 'flow'
No modu

In [2]:
# Preferred accelerator for this notebook is GPU index 3. Fall back to the CPU
# when that index does not exist so the notebook still runs on machines with
# fewer (or no) GPUs instead of failing at the first `.to(device)`.
device = "cuda:3" if torch.cuda.is_available() and torch.cuda.device_count() > 3 else "cpu"

In [3]:
def get_sac_args():
    """Assemble the fixed SAC hyperparameters used throughout this notebook.

    Returns:
        argparse.Namespace: a freshly built namespace on every call. Its
        ``device`` field is read from the notebook-level ``device`` variable
        at call time.
    """
    hyperparams = {
        "task": "Hopper-v2",
        "buffer_size": 1000000,
        "hidden_sizes": [256, 256, 256],
        "actor_lr": 3e-4,
        "critic_lr": 3e-4,
        "gamma": 0.99,
        "tau": 0.005,
        "alpha": 0.1,
        "start_timesteps": 1,
        "epoch": 200,
        "step_per_epoch": 5000,
        "step_per_collect": 1,
        "update_per_step": 1,
        "batch_size": 256,
        "training_num": 1,
        "test_num": 10,
        "device": device,
        "norm_layer": True,
    }
    return argparse.Namespace(**hyperparams)

In [4]:
# Notebook-level hyperparameters and a single (non-vectorized) environment for
# the configured task. Later cells read both `args` and `env` as globals.
args = get_sac_args()
env = gym.make(args.task)

  logger.deprecation(
  logger.deprecation(


In [5]:
def load_policy(path):
    """Rebuild the SAC actor/critic networks and load saved weights into them.

    The network architecture comes from ``get_sac_args()``; the observation and
    action spaces come from a temporary instance of the configured task.

    Args:
        path: Path of a checkpoint readable by ``torch.load`` whose contents
            are a ``state_dict`` compatible with the policy built here.

    Returns:
        SACPolicy: the policy with the checkpoint loaded, mapped to
        ``args.device``.
    """
    args = get_sac_args()

    # The environment is only needed for its spaces; close it once they are
    # read so a simulator instance is not leaked on every call.
    env = gym.make(args.task)
    try:
        action_space = env.action_space
        args.state_shape = env.observation_space.shape or env.observation_space.n
        args.action_shape = action_space.shape or action_space.n
        args.max_action = action_space.high[0]
    finally:
        env.close()

    def make_critic_net():
        # Both critics share the same architecture: a state+action input
        # (concat=True) with optional LayerNorm between hidden layers.
        return Net(
            args.state_shape,
            args.action_shape,
            hidden_sizes=args.hidden_sizes,
            concat=True,
            device=args.device,
            norm_layer=nn.LayerNorm if args.norm_layer else None,
        )

    # model
    # NOTE(review): the actor trunk is built without `norm_layer`, unlike the
    # critics. Presumably this matches the saved checkpoints — confirm before
    # changing, since load_state_dict requires matching parameter names.
    net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = ActorProb(
        net_a,
        args.action_shape,
        device=args.device,
        unbounded=True,
        conditioned_sigma=True,
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    # Construction order (both nets, then critic1, then critic2) is kept as in
    # the original so random initialisation is consumed in the same sequence.
    net_c1 = make_critic_net()
    net_c2 = make_critic_net()
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    policy = SACPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        tau=args.tau,
        gamma=args.gamma,
        alpha=args.alpha,
        action_space=action_space,
    )
    policy.load_state_dict(torch.load(path, map_location=args.device))
    print("Loaded agent from: ", path)
    return policy

In [6]:
# Load the checkpoint stored under the `cql` log directory (presumably a
# CQL-trained agent, judging by the path). `load_policy` wraps it in a SACPolicy.
cql_policy = load_policy("/data/user/R901105/dev/log/Hopper-v2/cql/0/231220-111219/policy.pth")

Loaded agent from:  /data/user/R901105/dev/log/Hopper-v2/cql/0/231220-111219/policy.pth


In [7]:
# Load the checkpoint stored under the `sac` log directory (presumably an
# online-SAC-trained agent, judging by the path).
sac_policy = load_policy("/data/user/R901105/dev/log/Hopper-v2/sac/0/231219-163624/policy.pth")

Loaded agent from:  /data/user/R901105/dev/log/Hopper-v2/sac/0/231219-163624/policy.pth


  logger.deprecation(
  logger.deprecation(


In [8]:
# Ten subprocess-backed copies of the task environment, shared by the
# evaluation collectors in the following cells.
envs = SubprocVectorEnv([lambda: gym.make(args.task) for _ in range(10)])

In [11]:
# Evaluate `cql_policy`: switch it to eval mode and roll out 40 episodes on the
# vectorized `envs`. The trailing expression displays the collected statistics.
cql_policy.eval()
cql_collector = Collector(cql_policy, envs)
cql_result = cql_collector.collect(n_episode=40)
cql_result

{'n/ep': 40,
 'n/st': 29361,
 'rews': array([1489.02481167, 1842.21185885, 1851.54305439, 1895.97065876,
        2116.01603433, 2210.91089589, 2403.31832445, 2503.1987029 ,
        3191.59420128, 3298.24723624, 1874.35273658, 1882.85695515,
        1927.46393929, 3121.14735053, 2855.48096741, 2810.40096311,
        2802.10252388, 2675.67930741, 1928.14911438, 2158.49601966,
        2433.0786285 , 3336.26060991, 2102.4805384 , 2180.70791761,
        1875.98584251, 3289.97193547, 2985.8541246 , 3122.38828219,
        3333.63136507, 2438.59254587, 1625.10616891, 3222.07709866,
        3020.51567111, 2684.40105251, 2162.56228329, 2420.0371619 ,
        1604.1628094 , 1564.49197052, 3314.09554543, 2200.40869673]),
 'lens': array([ 457,  559,  562,  575,  638,  669,  717,  760,  946, 1000,  566,
         573,  584,  926,  856,  836,  842,  799,  583,  653,  727,  996,
         629,  652,  567, 1000,  894,  936, 1000,  734,  496,  958,  893,
         798,  651,  724,  484,  475,  980,  666]),

In [9]:
# Evaluate `sac_policy` the same way: eval mode, 40 episodes on the shared
# vectorized `envs`. The trailing expression displays the collected statistics.
sac_policy.eval()
sac_collector = Collector(sac_policy, envs)
sac_result = sac_collector.collect(n_episode=40)
sac_result

{'n/ep': 40,
 'n/st': 38784,
 'rews': array([1374.48870402, 3563.14717783, 3557.86708206, 3608.04713137,
        3548.05139858, 3553.03855107, 3536.79429071, 3562.16889125,
        3618.95147274, 3517.54973909, 3552.13703624, 3566.05168652,
        3574.5577577 , 3612.51821892, 3572.29116713, 3547.21679116,
        3545.03262353, 3689.45713996, 3574.47625481, 3566.61674521,
        3599.40311662, 3639.44434494, 3526.78525297, 3546.06227241,
        3518.65236275, 3586.98774992, 3611.27413258, 3619.59866963,
        3563.55801092, 3556.8661537 , 1370.73038917, 3551.69977827,
        3572.18721047, 3557.7686918 , 3541.53283945, 3593.61391383,
        3567.80575847, 3569.23964549, 3636.30429332, 3574.10492874]),
 'lens': array([ 393, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  391, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000]),

In [81]:
from tianshou.data import ReplayBuffer
from tianshou.data.types import RolloutBatchProtocol


class MixedPolicy(BasePolicy):
    """Policy that hands control to an expert wherever the expert looks better.

    For every observation both wrapped policies propose an action. Each
    proposal is scored by the element-wise minimum of that policy's own two
    critics, and the expert's action is used wherever the base policy's score
    is strictly lower than the expert's. Training calls (``train``,
    ``process_fn``, ``learn``) are delegated to the base policy only.

    Args:
        base_policy: Policy that is trained and acts by default. Must expose
            ``critic1`` and ``critic2``.
        expert_policy: Policy that takes over when its score is higher. Must
            expose ``critic1`` and ``critic2``.
        action_space: Action space forwarded to ``BasePolicy``.
        buffer: Stored on the instance as ``self.buffer``; not used by any
            method of this class.
        device: Device that incoming batches are moved to. When omitted it is
            taken from the base policy's first parameter (CPU if the base
            policy has no parameters).
    """

    def __init__(self, base_policy: BasePolicy, expert_policy: BasePolicy, action_space, buffer, device=None):
        super().__init__(action_space=action_space, action_scaling=True)
        self.base_policy = base_policy
        self.expert_policy = expert_policy
        if device is None:
            # Infer the device from the wrapped policy instead of depending on
            # a notebook-level global.
            first_param = next(base_policy.parameters(), None)
            device = first_param.device if first_param is not None else torch.device("cpu")
        self.device = device
        self.buffer = buffer

    def forward(self, batch: RolloutBatchProtocol, state=None, **kwargs):
        """Select, per sample, the base or the expert action.

        Both wrapped policies are switched to eval mode on every call.

        Returns:
            Batch with ``act`` (the selected actions) and ``policy.cede_ctrl``
            (boolean tensor, True where the expert's action was chosen).
        """
        batch = to_torch(batch, dtype=torch.float32, device=self.device)
        self.base_policy.eval()
        self.expert_policy.eval()
        with torch.no_grad():
            expert_result = self.expert_policy(batch)
            base_result = self.base_policy(batch)
            # Score each proposal with the min of that policy's own critics.
            expert_qvalues = torch.minimum(
                self.expert_policy.critic1(batch.obs, expert_result.act),
                self.expert_policy.critic2(batch.obs, expert_result.act),
            )
            base_qvalues = torch.minimum(
                self.base_policy.critic1(batch.obs, base_result.act),
                self.base_policy.critic2(batch.obs, base_result.act),
            )
        cede_ctrl = base_qvalues < expert_qvalues
        # `cede_ctrl` is broadcast against the action tensors by torch.where.
        actions = torch.where(cede_ctrl, expert_result.act, base_result.act)
        return Batch(act=actions, policy=Batch(cede_ctrl=cede_ctrl))

    def train(self, mode: bool = True) -> "MixedPolicy":
        """Set the training flag on this wrapper and on the base policy.

        The expert policy is deliberately left untouched here.
        """
        # Keep the wrapper's own flag in sync; overriding train() without this
        # leaves `self.training` stuck at its initial value.
        self.training = mode
        self.base_policy.train(mode)
        return self

    def process_fn(self, batch: RolloutBatchProtocol, buffer: ReplayBuffer, indices: np.ndarray) -> RolloutBatchProtocol:
        """Delegate batch pre-processing to the base policy."""
        return self.base_policy.process_fn(batch, buffer, indices)

    def learn(self, batch, **kwargs):
        """Update the base policy on the whole batch and return its info.

        Samples where control was ceded to the expert are not filtered out.
        """
        return self.base_policy.learn(batch, **kwargs)

In [12]:
# Replay buffer handed to MixedPolicy below. The positional arguments are
# presumably (total_size, buffer_num) — confirm against the tianshou version in use.
test_buffer = VectorReplayBuffer(5000, 5)

In [82]:
# By the constructor's positional order, `cql_policy` is the base policy and
# `sac_policy` is the expert that may take over control.
mixed_policy = MixedPolicy(cql_policy, sac_policy, env.action_space, test_buffer)

In [12]:
# Roll out 40 episodes with the mixed policy on the shared vectorized `envs`.
# The trailing expression displays the collected statistics.
mixed_collector = Collector(mixed_policy, envs)
mixed_result = mixed_collector.collect(n_episode=40)
mixed_result

{'n/ep': 40,
 'n/st': 39442,
 'rews': array([2877.85420345, 3579.38976417, 3566.2887685 , 3612.26257751,
        3648.73333248, 3655.45149677, 3559.89113674, 3552.6419363 ,
        3588.0665408 , 3578.10143437, 3576.98877615, 3561.46943533,
        3558.70658515, 3598.58161738, 3558.16991895, 3525.51417879,
        3579.98865515, 3558.74737203, 3518.58784384, 3641.33416581,
        3563.76800552, 3231.92112228, 3558.71492095, 3535.73037386,
        3558.96929922, 3583.61728139, 3556.20513785, 3542.30987626,
        3584.5683532 , 3573.77252436, 2903.35971768, 3574.50541953,
        3551.28549583, 3540.74278544, 3513.75433021, 3546.94601085,
        3569.82593565, 3520.37927402, 3546.47417846, 3562.03266267]),
 'lens': array([ 798, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  869,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  775, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000]),

In [13]:
class DeterministicPolicy(BasePolicy):
    """Wrapper that always puts the wrapped policy in eval mode before acting.

    Presumably intended to make policies that act deterministically in eval
    mode do so during collection as well; the actual effect depends on the
    wrapped policy. ``train``, ``learn`` and ``process_fn`` are delegated to
    the wrapped policy.

    Args:
        policy: The policy to wrap.
        action_space: Action space forwarded to ``BasePolicy``.
    """

    def __init__(self, policy, action_space):
        super().__init__(action_space=action_space, action_scaling=True)
        self.policy = policy

    def train(self, mode: bool = True) -> "DeterministicPolicy":
        """Set the training flag on this wrapper and on the wrapped policy."""
        # Keep the wrapper's own flag in sync; overriding train() without this
        # leaves `self.training` stuck at its initial value.
        self.training = mode
        self.policy.train(mode)
        return self

    def forward(self, batch, state=None, **kwargs):
        """Run the wrapped policy in eval mode and return its result.

        The wrapped policy stays in eval mode after this call.
        """
        self.policy.eval()
        # Pass the recurrent state and extra arguments through instead of
        # silently dropping them.
        return self.policy(batch, state=state, **kwargs)

    def learn(self, batch, **kwargs):
        """Delegate the update step to the wrapped policy and return its info."""
        return self.policy.learn(batch, **kwargs)

    def process_fn(self, batch: RolloutBatchProtocol, buffer: ReplayBuffer, indices: np.ndarray) -> RolloutBatchProtocol:
        """Delegate batch pre-processing to the wrapped policy."""
        return self.policy.process_fn(batch, buffer, indices)

In [14]:
# Wrap the loaded `cql_policy` so that it is forced into eval mode on every forward call.
det_cql = DeterministicPolicy(cql_policy, env.action_space)

In [9]:
# Five subprocess-backed copies of the task environment, used by the test collector.
test_envs = SubprocVectorEnv([lambda: gym.make(args.task) for _ in range(5)])

In [10]:
# Load the D4RL `hopper-medium-v2` dataset via the tianshou example helper.
# Presumably this returns a replay buffer: it is passed to a Collector as its buffer below.
offline_data = load_buffer_d4rl("hopper-medium-v2")

  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile:   0%|          | 0/21 [00:00<?, ?it/s]

load datafile: 100%|██████████| 21/21 [00:01<00:00, 17.56it/s]


In [12]:
# Select the policy that the collectors, logger name and trainer below operate on.
policy = cql_policy

In [13]:
# Fresh test buffer (this rebinds `test_buffer`, which an earlier cell also defines).
test_buffer = VectorReplayBuffer(5000, 5)
# The training collector steps the single `env` and is given `offline_data` as
# its buffer, so newly collected transitions presumably land alongside the
# offline dataset — confirm this is the intended fine-tuning setup.
train_collector = Collector(policy, env, offline_data)
test_collector = Collector(policy, test_envs, test_buffer)



In [61]:
# Sanity check before training: collect a single episode with the test collector.
test_collector.collect(n_episode=1)

{'n/ep': 1,
 'n/st': 1000,
 'rews': array([3536.64228072]),
 'lens': array([1000]),
 'idxs': array([0]),
 'rew': 3536.6422807150416,
 'len': 1000.0,
 'rew_std': 0.0,
 'len_std': 0.0}

In [14]:
# Logging: one TensorBoard run directory per task / policy class / start time.
now = f"{datetime.datetime.now():%y%m%d-%H%M%S}"
log_name = os.path.join(args.task, type(policy).__name__, now)
log_path = os.path.join("/data/user/R901105/dev/log", log_name)
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer)
# Show where this run's event files are written.
print(log_path)

/data/user/R901105/dev/log/Hopper-v2/SACPolicy/240122-103110


In [15]:
def get_returns(policy, return_cede_ctrl=False, n_episodes=5):
    """Roll out ``policy`` on the notebook-level ``env`` and collect returns.

    Uses the notebook globals ``env`` (a single, non-vectorized environment)
    and ``device``. The policy is switched to eval mode.

    Args:
        policy: Callable policy whose result exposes ``act`` as a tensor with a
            leading batch axis, and which provides ``map_action``.
        return_cede_ctrl: When True, also read ``result.policy.cede_ctrl`` at
            every step (only policies that emit it, such as ``MixedPolicy``,
            support this).
        n_episodes: Number of episodes to run.

    Returns:
        tuple: ``(returns, cede_ctrl)`` where ``returns`` is an array holding
        one undiscounted return per episode. ``cede_ctrl`` is the mean over
        episodes of the per-episode mean of ``cede_ctrl`` when
        ``return_cede_ctrl`` is True, and ``None`` otherwise.
    """
    policy.eval()
    returns = []
    cede_ctrl = []
    for _ in range(n_episodes):
        obs, info = env.reset()
        done = False
        cum_reward = 0
        cctrl_ep = []
        while not done:
            # Add a batch axis of size 1 so the policy sees a batched observation.
            batch = Batch(obs=torch.from_numpy(np.expand_dims(obs, 0)).to(device), info=info)
            with torch.no_grad():
                result = policy(batch)
            # Drop only the batch axis so a 1-D action space keeps its shape.
            act = result.act.cpu().squeeze(0).numpy()
            act = policy.map_action(act)
            if return_cede_ctrl:
                cctrl_ep.append(result.policy.cede_ctrl.cpu().squeeze().numpy())
            obs, reward, terminated, truncated, info = env.step(act)
            cum_reward += reward
            done = terminated or truncated
        returns.append(cum_reward)
        if return_cede_ctrl:
            cede_ctrl.append(np.array(cctrl_ep).mean())
    if return_cede_ctrl:
        return np.array(returns), np.array(cede_ctrl).mean()
    # Previously the leftover loop variable `_` was returned here by mistake.
    return np.array(returns), None

In [16]:
def test_fn(num_epoch: int, step_idx: int):
    """Trainer test hook: print the mean evaluation return of the global ``policy``.

    The cede-control fraction is printed as well, but only when ``policy`` is a
    ``MixedPolicy``; other policies do not emit ``cede_ctrl`` in their forward
    result, so requesting it from them would fail.

    Args:
        num_epoch: Unused; part of the hook signature.
        step_idx: Unused; part of the hook signature.
    """
    track_cede = isinstance(policy, MixedPolicy)
    returns, cede_ctrl = get_returns(policy, track_cede)
    if track_cede:
        print(returns.mean(), cede_ctrl)
    else:
        print(returns.mean())

In [17]:
# Train the selected `policy` with tianshou's off-policy trainer: up to 200
# epochs of 1000 steps, collecting one environment step and doing one gradient
# update per step, and testing on one episode per epoch.
# Note that step_per_epoch (1000) and batch_size (512) differ from the values in
# get_sac_args() (5000 and 256); the values written here are the ones used.
result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    # test_fn=test_fn,
    max_epoch=200,
    step_per_epoch=1000,
    step_per_collect=1,
    episode_per_test=1,
    batch_size=512,
    logger=logger,
    update_per_step=1,
    test_in_train=False,
).run()

Epoch #1:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch #1: 1001it [00:16, 60.77it/s, env_step=1000, gradient_step=1000, len=98, loss/actor=-268.244, loss/critic1=3.523, loss/critic2=3.553, n/ep=0, n/st=1, rew=229.69]                          


Epoch #1: test_reward: 3272.901825 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #2: 1001it [00:14, 67.96it/s, env_step=2000, gradient_step=2000, len=1000, loss/actor=-267.586, loss/critic1=3.446, loss/critic2=3.416, n/ep=0, n/st=1, rew=3283.77]                          


Epoch #2: test_reward: 3240.190149 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #3: 1001it [00:14, 68.14it/s, env_step=3000, gradient_step=3000, len=1000, loss/actor=-268.251, loss/critic1=3.231, loss/critic2=3.309, n/ep=0, n/st=1, rew=3191.68]                          


Epoch #3: test_reward: 3207.268600 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #4: 1001it [00:14, 66.93it/s, env_step=4000, gradient_step=4000, len=1000, loss/actor=-268.185, loss/critic1=3.043, loss/critic2=3.014, n/ep=0, n/st=1, rew=3177.90]                          


Epoch #4: test_reward: 3217.511513 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #5: 1001it [00:14, 67.21it/s, env_step=5000, gradient_step=5000, len=1000, loss/actor=-268.051, loss/critic1=3.196, loss/critic2=3.327, n/ep=0, n/st=1, rew=3178.53]                          


Epoch #5: test_reward: 102.969942 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #6: 1001it [00:14, 67.17it/s, env_step=6000, gradient_step=6000, len=1000, loss/actor=-269.086, loss/critic1=2.605, loss/critic2=2.644, n/ep=0, n/st=1, rew=3061.52]                          


Epoch #6: test_reward: 74.055295 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #7: 1001it [00:15, 65.51it/s, env_step=7000, gradient_step=7000, len=61, loss/actor=-269.794, loss/critic1=2.680, loss/critic2=2.714, n/ep=0, n/st=1, rew=112.95]                            


Epoch #7: test_reward: 90.542162 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #8: 1001it [00:14, 68.82it/s, env_step=8000, gradient_step=8000, len=54, loss/actor=-269.974, loss/critic1=2.437, loss/critic2=2.517, n/ep=0, n/st=1, rew=93.69]                          


Epoch #8: test_reward: 2651.085975 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #9: 1001it [00:14, 68.95it/s, env_step=9000, gradient_step=9000, len=1000, loss/actor=-271.216, loss/critic1=2.112, loss/critic2=2.307, n/ep=0, n/st=1, rew=3101.13]                          


Epoch #9: test_reward: 2972.885071 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #10: 1001it [00:14, 68.46it/s, env_step=10000, gradient_step=10000, len=1000, loss/actor=-272.153, loss/critic1=3.540, loss/critic2=3.467, n/ep=0, n/st=1, rew=3157.27]                          


Epoch #10: test_reward: 3215.472366 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #11: 1001it [00:14, 68.52it/s, env_step=11000, gradient_step=11000, len=1000, loss/actor=-272.609, loss/critic1=3.640, loss/critic2=3.564, n/ep=0, n/st=1, rew=2976.86]                          


Epoch #11: test_reward: 3215.716396 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #12: 1001it [00:14, 68.23it/s, env_step=12000, gradient_step=12000, len=1000, loss/actor=-273.492, loss/critic1=4.989, loss/critic2=4.794, n/ep=0, n/st=1, rew=3207.94]                          


Epoch #12: test_reward: 3264.880940 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #13: 1001it [00:14, 68.85it/s, env_step=13000, gradient_step=13000, len=361, loss/actor=-274.114, loss/critic1=3.336, loss/critic2=3.387, n/ep=0, n/st=1, rew=1256.48]                          


Epoch #13: test_reward: 3100.022716 ± 0.000000, best_reward: 3272.901825 ± 0.000000 in #1


Epoch #14: 1001it [00:14, 68.84it/s, env_step=14000, gradient_step=14000, len=1000, loss/actor=-274.056, loss/critic1=4.874, loss/critic2=4.897, n/ep=0, n/st=1, rew=3248.87]                           


Epoch #14: test_reward: 3289.400435 ± 0.000000, best_reward: 3289.400435 ± 0.000000 in #14


Epoch #15: 1001it [00:14, 68.72it/s, env_step=15000, gradient_step=15000, len=788, loss/actor=-274.495, loss/critic1=4.837, loss/critic2=4.777, n/ep=0, n/st=1, rew=2616.96]                          


Epoch #15: test_reward: 1880.506727 ± 0.000000, best_reward: 3289.400435 ± 0.000000 in #14


Epoch #16: 1001it [00:14, 68.68it/s, env_step=16000, gradient_step=16000, len=1000, loss/actor=-274.422, loss/critic1=3.046, loss/critic2=3.052, n/ep=0, n/st=1, rew=3214.29]                          


Epoch #16: test_reward: 3355.426911 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #17: 1001it [00:14, 68.97it/s, env_step=17000, gradient_step=17000, len=1000, loss/actor=-275.541, loss/critic1=5.155, loss/critic2=4.970, n/ep=0, n/st=1, rew=3235.87]                          


Epoch #17: test_reward: 3329.949877 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #18: 1001it [00:14, 68.93it/s, env_step=18000, gradient_step=18000, len=1000, loss/actor=-276.209, loss/critic1=2.769, loss/critic2=2.750, n/ep=0, n/st=1, rew=3256.02]                           


Epoch #18: test_reward: 16.269508 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #19: 1001it [00:14, 69.10it/s, env_step=19000, gradient_step=19000, len=13, loss/actor=-276.807, loss/critic1=1.116, loss/critic2=1.032, n/ep=0, n/st=1, rew=12.38]                             


Epoch #19: test_reward: 12.000884 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #20: 1001it [00:14, 69.21it/s, env_step=20000, gradient_step=20000, len=643, loss/actor=-276.227, loss/critic1=9.210, loss/critic2=9.140, n/ep=0, n/st=1, rew=2119.95]                           


Epoch #20: test_reward: 3278.537856 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #21: 1001it [00:14, 68.32it/s, env_step=21000, gradient_step=21000, len=129, loss/actor=-276.652, loss/critic1=3.778, loss/critic2=3.759, n/ep=0, n/st=1, rew=309.11]                          


Epoch #21: test_reward: 1491.410363 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #22: 1001it [00:14, 69.26it/s, env_step=22000, gradient_step=22000, len=501, loss/actor=-276.538, loss/critic1=4.974, loss/critic2=5.063, n/ep=0, n/st=1, rew=1679.71]                          


Epoch #22: test_reward: 3316.218706 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #23: 1001it [00:14, 69.04it/s, env_step=23000, gradient_step=23000, len=394, loss/actor=-276.411, loss/critic1=5.169, loss/critic2=5.257, n/ep=0, n/st=1, rew=1306.35]                          


Epoch #23: test_reward: 3116.622615 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #24: 1001it [00:15, 66.69it/s, env_step=24000, gradient_step=24000, len=391, loss/actor=-276.832, loss/critic1=4.881, loss/critic2=4.661, n/ep=0, n/st=1, rew=1257.45]                           


Epoch #24: test_reward: 1199.516715 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #25: 1001it [00:14, 68.90it/s, env_step=25000, gradient_step=25000, len=917, loss/actor=-277.272, loss/critic1=4.703, loss/critic2=4.805, n/ep=0, n/st=1, rew=3013.73]                          


Epoch #25: test_reward: 3175.035064 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #26: 1001it [00:14, 68.80it/s, env_step=26000, gradient_step=26000, len=978, loss/actor=-277.423, loss/critic1=8.631, loss/critic2=8.511, n/ep=0, n/st=1, rew=3204.20]                          


Epoch #26: test_reward: 1863.783378 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #27: 1001it [00:14, 68.63it/s, env_step=27000, gradient_step=27000, len=714, loss/actor=-277.282, loss/critic1=1.012, loss/critic2=1.028, n/ep=0, n/st=1, rew=2343.68]                           


Epoch #27: test_reward: 1535.934910 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #28: 1001it [00:14, 69.13it/s, env_step=28000, gradient_step=28000, len=836, loss/actor=-277.527, loss/critic1=7.556, loss/critic2=7.366, n/ep=0, n/st=1, rew=2724.84]                          


Epoch #28: test_reward: 2287.694482 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #29: 1001it [00:14, 68.57it/s, env_step=29000, gradient_step=29000, len=774, loss/actor=-278.067, loss/critic1=1.249, loss/critic2=1.366, n/ep=0, n/st=1, rew=2499.37]                           


Epoch #29: test_reward: 472.431683 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #30: 1001it [00:14, 68.72it/s, env_step=30000, gradient_step=30000, len=426, loss/actor=-278.735, loss/critic1=2.187, loss/critic2=2.261, n/ep=0, n/st=1, rew=1426.95]                          


Epoch #30: test_reward: 1306.094558 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #31: 1001it [00:14, 69.12it/s, env_step=31000, gradient_step=31000, len=369, loss/actor=-279.037, loss/critic1=3.304, loss/critic2=3.264, n/ep=0, n/st=1, rew=1285.48]                          


Epoch #31: test_reward: 474.532779 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #32: 1001it [00:14, 69.15it/s, env_step=32000, gradient_step=32000, len=1000, loss/actor=-278.434, loss/critic1=5.025, loss/critic2=5.021, n/ep=0, n/st=1, rew=3198.79]                          


Epoch #32: test_reward: 3209.289269 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #33: 1001it [00:14, 68.88it/s, env_step=33000, gradient_step=33000, len=404, loss/actor=-279.187, loss/critic1=1.291, loss/critic2=1.160, n/ep=0, n/st=1, rew=1354.68]                          


Epoch #33: test_reward: 1333.464306 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #34: 1001it [00:14, 67.89it/s, env_step=34000, gradient_step=34000, len=471, loss/actor=-279.543, loss/critic1=3.750, loss/critic2=3.576, n/ep=0, n/st=1, rew=1596.60]                          


Epoch #34: test_reward: 3210.779552 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #35: 1001it [00:15, 64.10it/s, env_step=35000, gradient_step=35000, len=910, loss/actor=-279.471, loss/critic1=2.229, loss/critic2=1.957, n/ep=0, n/st=1, rew=3018.21]                           


Epoch #35: test_reward: 1373.212708 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #36: 1001it [00:15, 63.82it/s, env_step=36000, gradient_step=36000, len=1000, loss/actor=-279.349, loss/critic1=2.611, loss/critic2=2.599, n/ep=0, n/st=1, rew=3181.72]                           


Epoch #36: test_reward: 2320.941690 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #37: 1001it [00:15, 65.15it/s, env_step=37000, gradient_step=37000, len=383, loss/actor=-279.205, loss/critic1=1.140, loss/critic2=1.085, n/ep=0, n/st=1, rew=1241.90]                           


Epoch #37: test_reward: 452.928241 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #38: 1001it [00:15, 65.93it/s, env_step=38000, gradient_step=38000, len=164, loss/actor=-279.631, loss/critic1=3.108, loss/critic2=3.120, n/ep=0, n/st=1, rew=431.29]                           


Epoch #38: test_reward: 3221.173825 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #39: 1001it [00:14, 68.70it/s, env_step=39000, gradient_step=39000, len=797, loss/actor=-279.155, loss/critic1=3.245, loss/critic2=3.143, n/ep=0, n/st=1, rew=2555.38]                           


Epoch #39: test_reward: 3246.724984 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #40: 1001it [00:14, 68.65it/s, env_step=40000, gradient_step=40000, len=180, loss/actor=-278.981, loss/critic1=7.677, loss/critic2=7.714, n/ep=0, n/st=1, rew=486.48]                            


Epoch #40: test_reward: 3237.173221 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #41: 1001it [00:14, 68.90it/s, env_step=41000, gradient_step=41000, len=606, loss/actor=-279.216, loss/critic1=6.649, loss/critic2=6.702, n/ep=0, n/st=1, rew=2030.22]                           


Epoch #41: test_reward: 3281.924066 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #42: 1001it [00:14, 69.00it/s, env_step=42000, gradient_step=42000, len=158, loss/actor=-279.423, loss/critic1=6.667, loss/critic2=6.586, n/ep=0, n/st=1, rew=401.42]                           


Epoch #42: test_reward: 266.972624 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #43: 1001it [00:14, 69.06it/s, env_step=43000, gradient_step=43000, len=259, loss/actor=-279.547, loss/critic1=12.863, loss/critic2=12.034, n/ep=0, n/st=1, rew=731.88]                           


Epoch #43: test_reward: 204.160478 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #44: 1001it [00:14, 69.05it/s, env_step=44000, gradient_step=44000, len=96, loss/actor=-279.854, loss/critic1=8.050, loss/critic2=7.940, n/ep=0, n/st=1, rew=204.02]                             


Epoch #44: test_reward: 3265.883988 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #45: 1001it [00:14, 68.89it/s, env_step=45000, gradient_step=45000, len=224, loss/actor=-280.029, loss/critic1=14.248, loss/critic2=14.194, n/ep=0, n/st=1, rew=647.98]                          


Epoch #45: test_reward: 309.372813 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #46: 1001it [00:14, 69.08it/s, env_step=46000, gradient_step=46000, len=1000, loss/actor=-280.993, loss/critic1=9.926, loss/critic2=10.149, n/ep=0, n/st=1, rew=3189.22]                          


Epoch #46: test_reward: 3287.438624 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #47: 1001it [00:14, 68.69it/s, env_step=47000, gradient_step=47000, len=1000, loss/actor=-280.998, loss/critic1=6.660, loss/critic2=6.716, n/ep=0, n/st=1, rew=3186.78]                           


Epoch #47: test_reward: 2963.005242 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #48: 1001it [00:14, 68.96it/s, env_step=48000, gradient_step=48000, len=292, loss/actor=-281.725, loss/critic1=11.112, loss/critic2=11.164, n/ep=0, n/st=1, rew=975.85]                           


Epoch #48: test_reward: 3166.422860 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #49: 1001it [00:14, 68.83it/s, env_step=49000, gradient_step=49000, len=1000, loss/actor=-282.469, loss/critic1=7.570, loss/critic2=7.380, n/ep=0, n/st=1, rew=3209.30]                           


Epoch #49: test_reward: 3247.798289 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #50: 1001it [00:14, 68.94it/s, env_step=50000, gradient_step=50000, len=1000, loss/actor=-282.794, loss/critic1=3.364, loss/critic2=3.711, n/ep=0, n/st=1, rew=3258.89]                           


Epoch #50: test_reward: 3338.775610 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #51: 1001it [00:14, 68.76it/s, env_step=51000, gradient_step=51000, len=391, loss/actor=-282.588, loss/critic1=4.651, loss/critic2=4.553, n/ep=0, n/st=1, rew=1346.04]                            


Epoch #51: test_reward: 1039.215681 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #52: 1001it [00:14, 68.90it/s, env_step=52000, gradient_step=52000, len=880, loss/actor=-283.132, loss/critic1=1.072, loss/critic2=1.073, n/ep=0, n/st=1, rew=2870.10]                           


Epoch #52: test_reward: 8.381614 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #53: 1001it [00:14, 69.02it/s, env_step=53000, gradient_step=53000, len=10, loss/actor=-282.855, loss/critic1=11.470, loss/critic2=12.580, n/ep=0, n/st=1, rew=7.73]                             


Epoch #53: test_reward: 3240.419431 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #54: 1001it [00:14, 68.88it/s, env_step=54000, gradient_step=54000, len=631, loss/actor=-283.715, loss/critic1=7.167, loss/critic2=7.141, n/ep=0, n/st=1, rew=2091.28]                            


Epoch #54: test_reward: 2864.001814 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #55: 1001it [00:14, 68.92it/s, env_step=55000, gradient_step=55000, len=1000, loss/actor=-283.915, loss/critic1=12.446, loss/critic2=12.421, n/ep=0, n/st=1, rew=3322.00]                          


Epoch #55: test_reward: 3306.835831 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #56: 1001it [00:14, 68.89it/s, env_step=56000, gradient_step=56000, len=542, loss/actor=-284.536, loss/critic1=14.932, loss/critic2=14.957, n/ep=0, n/st=1, rew=1802.05]                          


Epoch #56: test_reward: 1581.556924 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #57: 1001it [00:14, 68.84it/s, env_step=57000, gradient_step=57000, len=1000, loss/actor=-284.677, loss/critic1=7.194, loss/critic2=6.187, n/ep=0, n/st=1, rew=3301.08]                           


Epoch #57: test_reward: 3272.169625 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #58: 1001it [00:14, 67.86it/s, env_step=58000, gradient_step=58000, len=1000, loss/actor=-284.665, loss/critic1=13.914, loss/critic2=14.797, n/ep=0, n/st=1, rew=3325.65]                          


Epoch #58: test_reward: 3305.769787 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #59: 1001it [00:14, 69.04it/s, env_step=59000, gradient_step=59000, len=430, loss/actor=-285.465, loss/critic1=5.042, loss/critic2=4.773, n/ep=0, n/st=1, rew=1498.81]                            


Epoch #59: test_reward: 2073.493867 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #60: 1001it [00:14, 68.57it/s, env_step=60000, gradient_step=60000, len=524, loss/actor=-285.271, loss/critic1=9.607, loss/critic2=9.274, n/ep=0, n/st=1, rew=1831.11]                           


Epoch #60: test_reward: 2945.753413 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #61: 1001it [00:14, 68.84it/s, env_step=61000, gradient_step=61000, len=719, loss/actor=-285.313, loss/critic1=9.788, loss/critic2=9.816, n/ep=0, n/st=1, rew=2493.91]                           


Epoch #61: test_reward: 3296.657503 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #62: 1001it [00:14, 68.78it/s, env_step=62000, gradient_step=62000, len=1000, loss/actor=-285.557, loss/critic1=8.879, loss/critic2=8.603, n/ep=0, n/st=1, rew=3283.57]                           


Epoch #62: test_reward: 3289.846766 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #63: 1001it [00:14, 68.10it/s, env_step=63000, gradient_step=63000, len=1000, loss/actor=-285.335, loss/critic1=9.625, loss/critic2=9.513, n/ep=0, n/st=1, rew=3284.81]                           


Epoch #63: test_reward: 357.122232 ± 0.000000, best_reward: 3355.426911 ± 0.000000 in #16


Epoch #64: 1001it [00:14, 68.88it/s, env_step=64000, gradient_step=64000, len=1000, loss/actor=-285.814, loss/critic1=4.601, loss/critic2=4.641, n/ep=0, n/st=1, rew=3309.03]                           


Epoch #64: test_reward: 3359.536204 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #65: 1001it [00:14, 68.53it/s, env_step=65000, gradient_step=65000, len=967, loss/actor=-285.361, loss/critic1=14.604, loss/critic2=14.745, n/ep=0, n/st=1, rew=3306.09]                          


Epoch #65: test_reward: 3337.280448 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #66: 1001it [00:14, 68.75it/s, env_step=66000, gradient_step=66000, len=1000, loss/actor=-285.523, loss/critic1=8.657, loss/critic2=8.669, n/ep=0, n/st=1, rew=3362.57]                          


Epoch #66: test_reward: 2582.582257 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #67: 1001it [00:14, 68.34it/s, env_step=67000, gradient_step=67000, len=948, loss/actor=-286.387, loss/critic1=7.151, loss/critic2=7.124, n/ep=0, n/st=1, rew=3266.88]                            


Epoch #67: test_reward: 97.886572 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #68: 1001it [00:14, 69.00it/s, env_step=68000, gradient_step=68000, len=761, loss/actor=-286.133, loss/critic1=13.463, loss/critic2=13.514, n/ep=0, n/st=1, rew=2599.11]                          


Epoch #68: test_reward: 3283.081782 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #69: 1001it [00:14, 68.43it/s, env_step=69000, gradient_step=69000, len=727, loss/actor=-286.899, loss/critic1=11.226, loss/critic2=11.270, n/ep=0, n/st=1, rew=2464.41]                          


Epoch #69: test_reward: 2319.400412 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #70: 1001it [00:14, 68.71it/s, env_step=70000, gradient_step=70000, len=764, loss/actor=-286.605, loss/critic1=9.868, loss/critic2=9.874, n/ep=0, n/st=1, rew=2540.67]                           


Epoch #70: test_reward: 1250.954604 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #71: 1001it [00:14, 68.71it/s, env_step=71000, gradient_step=71000, len=364, loss/actor=-286.551, loss/critic1=3.197, loss/critic2=3.169, n/ep=0, n/st=1, rew=1240.69]                           


Epoch #71: test_reward: 3221.594339 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #72: 1001it [00:14, 67.75it/s, env_step=72000, gradient_step=72000, len=330, loss/actor=-286.791, loss/critic1=11.922, loss/critic2=12.038, n/ep=0, n/st=1, rew=1017.52]                          


Epoch #72: test_reward: 1118.941889 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #73: 1001it [00:14, 68.68it/s, env_step=73000, gradient_step=73000, len=361, loss/actor=-286.316, loss/critic1=11.951, loss/critic2=11.543, n/ep=0, n/st=1, rew=1206.99]                          


Epoch #73: test_reward: 1124.149405 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #74: 1001it [00:14, 68.50it/s, env_step=74000, gradient_step=74000, len=761, loss/actor=-287.105, loss/critic1=21.898, loss/critic2=20.588, n/ep=0, n/st=1, rew=2598.98]                          


Epoch #74: test_reward: 1393.794642 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #75: 1001it [00:14, 68.92it/s, env_step=75000, gradient_step=75000, len=211, loss/actor=-287.547, loss/critic1=6.728, loss/critic2=6.876, n/ep=0, n/st=1, rew=576.77]                             


Epoch #75: test_reward: 1667.737970 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #76: 1001it [00:14, 68.47it/s, env_step=76000, gradient_step=76000, len=1000, loss/actor=-287.483, loss/critic1=9.501, loss/critic2=9.674, n/ep=0, n/st=1, rew=3312.67]                           


Epoch #76: test_reward: 1255.515178 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #77: 1001it [00:14, 67.88it/s, env_step=77000, gradient_step=77000, len=860, loss/actor=-287.626, loss/critic1=4.711, loss/critic2=4.772, n/ep=0, n/st=1, rew=2927.83]                            


Epoch #77: test_reward: 3325.463980 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #78: 1001it [00:14, 68.60it/s, env_step=78000, gradient_step=78000, len=589, loss/actor=-287.986, loss/critic1=7.986, loss/critic2=7.975, n/ep=0, n/st=1, rew=2007.12]                           


Epoch #78: test_reward: 2184.299291 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #79: 1001it [00:14, 67.68it/s, env_step=79000, gradient_step=79000, len=424, loss/actor=-287.639, loss/critic1=7.058, loss/critic2=7.030, n/ep=0, n/st=1, rew=1441.15]                           


Epoch #79: test_reward: 2414.171443 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #80: 1001it [00:14, 68.46it/s, env_step=80000, gradient_step=80000, len=631, loss/actor=-288.344, loss/critic1=7.498, loss/critic2=7.766, n/ep=0, n/st=1, rew=2134.01]                           


Epoch #80: test_reward: 3281.293458 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #81: 1001it [00:14, 68.56it/s, env_step=81000, gradient_step=81000, len=328, loss/actor=-287.957, loss/critic1=5.480, loss/critic2=5.443, n/ep=0, n/st=1, rew=1002.10]                            


Epoch #81: test_reward: 3309.712769 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #82: 1001it [00:14, 68.85it/s, env_step=82000, gradient_step=82000, len=1000, loss/actor=-288.561, loss/critic1=10.445, loss/critic2=10.315, n/ep=0, n/st=1, rew=3216.78]                          


Epoch #82: test_reward: 3111.485623 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #83: 1001it [00:14, 68.62it/s, env_step=83000, gradient_step=83000, len=1000, loss/actor=-288.315, loss/critic1=10.688, loss/critic2=10.426, n/ep=0, n/st=1, rew=3260.85]                          


Epoch #83: test_reward: 454.030967 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #84: 1001it [00:14, 68.96it/s, env_step=84000, gradient_step=84000, len=175, loss/actor=-288.714, loss/critic1=13.108, loss/critic2=12.544, n/ep=0, n/st=1, rew=459.60]                          


Epoch #84: test_reward: 3338.698187 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #85: 1001it [00:14, 68.17it/s, env_step=85000, gradient_step=85000, len=500, loss/actor=-288.933, loss/critic1=6.877, loss/critic2=6.393, n/ep=0, n/st=1, rew=1703.17]                           


Epoch #85: test_reward: 266.137785 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #86: 1001it [00:14, 68.29it/s, env_step=86000, gradient_step=86000, len=178, loss/actor=-289.090, loss/critic1=12.258, loss/critic2=12.161, n/ep=0, n/st=1, rew=485.30]                          


Epoch #86: test_reward: 927.593304 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #87: 1001it [00:14, 68.21it/s, env_step=87000, gradient_step=87000, len=397, loss/actor=-289.097, loss/critic1=4.131, loss/critic2=3.977, n/ep=0, n/st=1, rew=1319.49]                           


Epoch #87: test_reward: 466.394154 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #88: 1001it [00:14, 68.24it/s, env_step=88000, gradient_step=88000, len=643, loss/actor=-289.632, loss/critic1=6.037, loss/critic2=6.073, n/ep=0, n/st=1, rew=2210.05]                           


Epoch #88: test_reward: 3323.399927 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #89: 1001it [00:14, 68.49it/s, env_step=89000, gradient_step=89000, len=501, loss/actor=-288.356, loss/critic1=2.278, loss/critic2=2.008, n/ep=0, n/st=1, rew=1707.68]                           


Epoch #89: test_reward: 3358.636455 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #90: 1001it [00:14, 68.77it/s, env_step=90000, gradient_step=90000, len=637, loss/actor=-289.292, loss/critic1=6.725, loss/critic2=7.038, n/ep=0, n/st=1, rew=2180.41]                           


Epoch #90: test_reward: 356.474018 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #91: 1001it [00:14, 68.64it/s, env_step=91000, gradient_step=91000, len=973, loss/actor=-290.116, loss/critic1=12.628, loss/critic2=12.910, n/ep=0, n/st=1, rew=3333.96]                          


Epoch #91: test_reward: 3174.690817 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #92: 1001it [00:14, 68.78it/s, env_step=92000, gradient_step=92000, len=330, loss/actor=-289.885, loss/critic1=9.013, loss/critic2=9.451, n/ep=0, n/st=1, rew=1129.52]                            


Epoch #92: test_reward: 1967.611333 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #93: 1001it [00:14, 68.52it/s, env_step=93000, gradient_step=93000, len=266, loss/actor=-290.339, loss/critic1=9.862, loss/critic2=9.847, n/ep=0, n/st=1, rew=924.40]                            


Epoch #93: test_reward: 758.988311 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #94: 1001it [00:14, 68.42it/s, env_step=94000, gradient_step=94000, len=336, loss/actor=-290.017, loss/critic1=11.970, loss/critic2=11.905, n/ep=0, n/st=1, rew=1161.97]                          


Epoch #94: test_reward: 798.323545 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #95: 1001it [00:14, 67.48it/s, env_step=95000, gradient_step=95000, len=105, loss/actor=-289.450, loss/critic1=8.999, loss/critic2=9.395, n/ep=0, n/st=1, rew=239.17]                            


Epoch #95: test_reward: 3182.323760 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #96: 1001it [00:14, 68.16it/s, env_step=96000, gradient_step=96000, len=266, loss/actor=-290.292, loss/critic1=10.703, loss/critic2=10.716, n/ep=0, n/st=1, rew=774.98]                           


Epoch #96: test_reward: 3283.601195 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #97: 1001it [00:14, 69.10it/s, env_step=97000, gradient_step=97000, len=519, loss/actor=-290.252, loss/critic1=11.061, loss/critic2=10.865, n/ep=0, n/st=1, rew=1605.64]                          


Epoch #97: test_reward: 741.428246 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #98: 1001it [00:14, 68.90it/s, env_step=98000, gradient_step=98000, len=761, loss/actor=-290.538, loss/critic1=14.116, loss/critic2=14.124, n/ep=0, n/st=1, rew=2415.67]                          


Epoch #98: test_reward: 1899.378887 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #99: 1001it [00:14, 68.60it/s, env_step=99000, gradient_step=99000, len=256, loss/actor=-289.921, loss/critic1=17.314, loss/critic2=17.108, n/ep=0, n/st=1, rew=752.32]                          


Epoch #99: test_reward: 1349.877570 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #100: 1001it [00:14, 68.85it/s, env_step=100000, gradient_step=100000, len=732, loss/actor=-290.273, loss/critic1=9.790, loss/critic2=9.726, n/ep=0, n/st=1, rew=2364.08]                          


Epoch #100: test_reward: 1847.230438 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #101: 1001it [00:14, 68.32it/s, env_step=101000, gradient_step=101000, len=1000, loss/actor=-290.139, loss/critic1=10.142, loss/critic2=10.098, n/ep=0, n/st=1, rew=3251.54]                          


Epoch #101: test_reward: 3178.229189 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #102: 1001it [00:14, 68.80it/s, env_step=102000, gradient_step=102000, len=249, loss/actor=-289.936, loss/critic1=14.904, loss/critic2=15.317, n/ep=0, n/st=1, rew=726.65]                           


Epoch #102: test_reward: 466.664263 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #103: 1001it [00:14, 68.79it/s, env_step=103000, gradient_step=103000, len=178, loss/actor=-290.401, loss/critic1=15.335, loss/critic2=15.722, n/ep=0, n/st=1, rew=476.83]                          


Epoch #103: test_reward: 475.830122 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #104: 1001it [00:14, 68.90it/s, env_step=104000, gradient_step=104000, len=1000, loss/actor=-290.581, loss/critic1=4.547, loss/critic2=3.906, n/ep=0, n/st=1, rew=3186.57]                           


Epoch #104: test_reward: 3074.175653 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #105: 1001it [00:14, 67.91it/s, env_step=105000, gradient_step=105000, len=481, loss/actor=-290.709, loss/critic1=9.467, loss/critic2=9.020, n/ep=0, n/st=1, rew=1450.58]                            


Epoch #105: test_reward: 3223.504668 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #106: 1001it [00:14, 68.82it/s, env_step=106000, gradient_step=106000, len=1000, loss/actor=-291.112, loss/critic1=23.262, loss/critic2=20.239, n/ep=0, n/st=1, rew=3150.87]                          


Epoch #106: test_reward: 2301.859237 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #107: 1001it [00:14, 68.85it/s, env_step=107000, gradient_step=107000, len=665, loss/actor=-291.685, loss/critic1=9.349, loss/critic2=8.820, n/ep=0, n/st=1, rew=2111.47]                            


Epoch #107: test_reward: 3177.171707 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #108: 1001it [00:14, 68.61it/s, env_step=108000, gradient_step=108000, len=1000, loss/actor=-291.565, loss/critic1=11.970, loss/critic2=12.292, n/ep=0, n/st=1, rew=3157.73]                          


Epoch #108: test_reward: 46.606267 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #109: 1001it [00:14, 68.03it/s, env_step=109000, gradient_step=109000, len=1000, loss/actor=-292.103, loss/critic1=7.784, loss/critic2=10.147, n/ep=0, n/st=1, rew=3270.17]                          


Epoch #109: test_reward: 1251.273078 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #110: 1001it [00:14, 68.45it/s, env_step=110000, gradient_step=110000, len=550, loss/actor=-291.689, loss/critic1=8.785, loss/critic2=8.584, n/ep=0, n/st=1, rew=1713.81]                            


Epoch #110: test_reward: 3221.347448 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #111: 1001it [00:14, 69.00it/s, env_step=111000, gradient_step=111000, len=1000, loss/actor=-292.172, loss/critic1=7.429, loss/critic2=6.158, n/ep=0, n/st=1, rew=3243.14]                           


Epoch #111: test_reward: 3299.995662 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #112: 1001it [00:14, 68.23it/s, env_step=112000, gradient_step=112000, len=386, loss/actor=-292.090, loss/critic1=11.087, loss/critic2=10.849, n/ep=0, n/st=1, rew=1166.57]                          


Epoch #112: test_reward: 3282.287183 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #113: 1001it [00:14, 68.67it/s, env_step=113000, gradient_step=113000, len=490, loss/actor=-291.945, loss/critic1=18.396, loss/critic2=17.896, n/ep=0, n/st=1, rew=1693.75]                          


Epoch #113: test_reward: 3212.594101 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #114: 1001it [00:14, 68.39it/s, env_step=114000, gradient_step=114000, len=295, loss/actor=-292.639, loss/critic1=14.687, loss/critic2=14.401, n/ep=0, n/st=1, rew=1052.76]                          


Epoch #114: test_reward: 944.537845 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #115: 1001it [00:14, 68.51it/s, env_step=115000, gradient_step=115000, len=1000, loss/actor=-292.643, loss/critic1=10.402, loss/critic2=9.891, n/ep=0, n/st=1, rew=3313.86]                          


Epoch #115: test_reward: 3317.727153 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #116: 1001it [00:14, 68.82it/s, env_step=116000, gradient_step=116000, len=1000, loss/actor=-292.410, loss/critic1=20.885, loss/critic2=18.762, n/ep=0, n/st=1, rew=3258.32]                          


Epoch #116: test_reward: 1348.401805 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #117: 1001it [00:14, 68.67it/s, env_step=117000, gradient_step=117000, len=1000, loss/actor=-292.815, loss/critic1=11.369, loss/critic2=10.815, n/ep=0, n/st=1, rew=3237.45]                          


Epoch #117: test_reward: 3335.212867 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #118: 1001it [00:14, 68.73it/s, env_step=118000, gradient_step=118000, len=316, loss/actor=-293.258, loss/critic1=18.916, loss/critic2=16.961, n/ep=0, n/st=1, rew=957.83]                           


Epoch #118: test_reward: 3157.070537 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #119: 1001it [00:14, 68.66it/s, env_step=119000, gradient_step=119000, len=1000, loss/actor=-293.117, loss/critic1=14.853, loss/critic2=14.893, n/ep=0, n/st=1, rew=3316.94]                          


Epoch #119: test_reward: 3337.836273 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #120: 1001it [00:14, 69.01it/s, env_step=120000, gradient_step=120000, len=491, loss/actor=-292.609, loss/critic1=5.629, loss/critic2=5.481, n/ep=0, n/st=1, rew=1659.62]                            


Epoch #120: test_reward: 1498.580162 ± 0.000000, best_reward: 3359.536204 ± 0.000000 in #64


Epoch #121: 1001it [00:14, 67.98it/s, env_step=121000, gradient_step=121000, len=559, loss/actor=-292.349, loss/critic1=18.838, loss/critic2=19.644, n/ep=0, n/st=1, rew=1924.39]                          


Epoch #121: test_reward: 3414.024467 ± 0.000000, best_reward: 3414.024467 ± 0.000000 in #121


Epoch #122: 1001it [00:15, 66.25it/s, env_step=122000, gradient_step=122000, len=1000, loss/actor=-293.083, loss/critic1=15.322, loss/critic2=14.979, n/ep=0, n/st=1, rew=3375.93]                          


Epoch #122: test_reward: 3349.445842 ± 0.000000, best_reward: 3414.024467 ± 0.000000 in #121


Epoch #123: 1001it [00:14, 69.01it/s, env_step=123000, gradient_step=123000, len=1000, loss/actor=-292.813, loss/critic1=10.828, loss/critic2=10.632, n/ep=0, n/st=1, rew=3364.71]                          


Epoch #123: test_reward: 3323.365823 ± 0.000000, best_reward: 3414.024467 ± 0.000000 in #121


Epoch #124: 1001it [00:14, 68.87it/s, env_step=124000, gradient_step=124000, len=1000, loss/actor=-292.904, loss/critic1=7.499, loss/critic2=7.580, n/ep=0, n/st=1, rew=3344.44]                           


Epoch #124: test_reward: 3285.367263 ± 0.000000, best_reward: 3414.024467 ± 0.000000 in #121


Epoch #125: 1001it [00:14, 68.32it/s, env_step=125000, gradient_step=125000, len=434, loss/actor=-292.808, loss/critic1=13.536, loss/critic2=12.924, n/ep=0, n/st=1, rew=1478.20]                          


Epoch #125: test_reward: 3427.857606 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #126: 1001it [00:14, 68.44it/s, env_step=126000, gradient_step=126000, len=1000, loss/actor=-292.558, loss/critic1=17.802, loss/critic2=17.414, n/ep=0, n/st=1, rew=3322.06]                          


Epoch #126: test_reward: 3393.664483 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #127: 1001it [00:14, 67.23it/s, env_step=127000, gradient_step=127000, len=1000, loss/actor=-291.676, loss/critic1=15.048, loss/critic2=14.894, n/ep=0, n/st=1, rew=3342.24]                          


Epoch #127: test_reward: 3324.658623 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #128: 1001it [00:14, 68.42it/s, env_step=128000, gradient_step=128000, len=1000, loss/actor=-291.638, loss/critic1=17.051, loss/critic2=18.133, n/ep=0, n/st=1, rew=3293.54]                          


Epoch #128: test_reward: 64.555193 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #129: 1001it [00:14, 68.63it/s, env_step=129000, gradient_step=129000, len=302, loss/actor=-292.137, loss/critic1=11.519, loss/critic2=12.003, n/ep=0, n/st=1, rew=1052.08]                          


Epoch #129: test_reward: 3327.728038 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #130: 1001it [00:14, 68.77it/s, env_step=130000, gradient_step=130000, len=1000, loss/actor=-291.807, loss/critic1=14.439, loss/critic2=12.056, n/ep=0, n/st=1, rew=3428.13]                          


Epoch #130: test_reward: 3312.299023 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #131: 1001it [00:14, 68.87it/s, env_step=131000, gradient_step=131000, len=1000, loss/actor=-291.586, loss/critic1=16.055, loss/critic2=17.518, n/ep=0, n/st=1, rew=3344.27]                          


Epoch #131: test_reward: 3331.069081 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #132: 1001it [00:14, 68.29it/s, env_step=132000, gradient_step=132000, len=55, loss/actor=-291.969, loss/critic1=6.941, loss/critic2=7.361, n/ep=0, n/st=1, rew=97.85]                               


Epoch #132: test_reward: 3354.205432 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #133: 1001it [00:14, 68.99it/s, env_step=133000, gradient_step=133000, len=1000, loss/actor=-292.364, loss/critic1=13.161, loss/critic2=12.929, n/ep=0, n/st=1, rew=3320.67]                          


Epoch #133: test_reward: 3280.311395 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #134: 1001it [00:14, 68.39it/s, env_step=134000, gradient_step=134000, len=1000, loss/actor=-293.005, loss/critic1=13.760, loss/critic2=12.898, n/ep=0, n/st=1, rew=3299.07]                          


Epoch #134: test_reward: 47.455925 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #135: 1001it [00:14, 68.17it/s, env_step=135000, gradient_step=135000, len=39, loss/actor=-292.956, loss/critic1=9.478, loss/critic2=9.128, n/ep=0, n/st=1, rew=57.69]                               


Epoch #135: test_reward: 620.448613 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #136: 1001it [00:14, 68.28it/s, env_step=136000, gradient_step=136000, len=831, loss/actor=-292.154, loss/critic1=16.437, loss/critic2=15.412, n/ep=0, n/st=1, rew=2861.55]                          


Epoch #136: test_reward: 3337.332747 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #137: 1001it [00:14, 68.62it/s, env_step=137000, gradient_step=137000, len=447, loss/actor=-292.696, loss/critic1=14.337, loss/critic2=13.847, n/ep=0, n/st=1, rew=1504.25]                          


Epoch #137: test_reward: 3397.359400 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #138: 1001it [00:14, 68.76it/s, env_step=138000, gradient_step=138000, len=525, loss/actor=-292.782, loss/critic1=14.515, loss/critic2=14.592, n/ep=0, n/st=1, rew=1690.58]                          


Epoch #138: test_reward: 3289.765647 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #139: 1001it [00:14, 68.42it/s, env_step=139000, gradient_step=139000, len=872, loss/actor=-292.749, loss/critic1=21.105, loss/critic2=21.482, n/ep=0, n/st=1, rew=3091.22]                          


Epoch #139: test_reward: 3391.992805 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #140: 1001it [00:14, 68.98it/s, env_step=140000, gradient_step=140000, len=818, loss/actor=-292.823, loss/critic1=9.295, loss/critic2=8.525, n/ep=0, n/st=1, rew=2833.66]                           


Epoch #140: test_reward: 3377.266382 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #141: 1001it [00:14, 68.45it/s, env_step=141000, gradient_step=141000, len=506, loss/actor=-292.332, loss/critic1=9.972, loss/critic2=9.868, n/ep=0, n/st=1, rew=1762.17]                           


Epoch #141: test_reward: 3344.402435 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #142: 1001it [00:14, 68.90it/s, env_step=142000, gradient_step=142000, len=1000, loss/actor=-291.881, loss/critic1=16.743, loss/critic2=16.615, n/ep=0, n/st=1, rew=3335.20]                          


Epoch #142: test_reward: 3344.326279 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #143: 1001it [00:14, 68.26it/s, env_step=143000, gradient_step=143000, len=931, loss/actor=-292.119, loss/critic1=11.956, loss/critic2=11.835, n/ep=0, n/st=1, rew=3195.24]                          


Epoch #143: test_reward: 3253.795093 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #144: 1001it [00:14, 68.50it/s, env_step=144000, gradient_step=144000, len=1000, loss/actor=-292.204, loss/critic1=13.204, loss/critic2=13.706, n/ep=0, n/st=1, rew=3244.04]                          


Epoch #144: test_reward: 3344.463189 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #145: 1001it [00:14, 67.30it/s, env_step=145000, gradient_step=145000, len=554, loss/actor=-292.494, loss/critic1=17.040, loss/critic2=15.810, n/ep=0, n/st=1, rew=1720.58]                          


Epoch #145: test_reward: 3266.388546 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #146: 1001it [00:14, 68.75it/s, env_step=146000, gradient_step=146000, len=1000, loss/actor=-291.945, loss/critic1=6.290, loss/critic2=5.891, n/ep=0, n/st=1, rew=3314.07]                           


Epoch #146: test_reward: 3316.959231 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #147: 1001it [00:14, 69.01it/s, env_step=147000, gradient_step=147000, len=1000, loss/actor=-292.163, loss/critic1=20.467, loss/critic2=19.616, n/ep=0, n/st=1, rew=3277.45]                          


Epoch #147: test_reward: 3310.641535 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #148: 1001it [00:14, 68.60it/s, env_step=148000, gradient_step=148000, len=1000, loss/actor=-291.979, loss/critic1=29.225, loss/critic2=30.120, n/ep=0, n/st=1, rew=3282.51]                          


Epoch #148: test_reward: 3369.633689 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #149: 1001it [00:14, 68.38it/s, env_step=149000, gradient_step=149000, len=1000, loss/actor=-291.225, loss/critic1=16.387, loss/critic2=15.723, n/ep=0, n/st=1, rew=3329.90]                          


Epoch #149: test_reward: 3305.460015 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #150: 1001it [00:14, 68.05it/s, env_step=150000, gradient_step=150000, len=1000, loss/actor=-291.153, loss/critic1=14.793, loss/critic2=13.591, n/ep=0, n/st=1, rew=3288.98]                          


Epoch #150: test_reward: 1398.973771 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #151: 1001it [00:14, 68.48it/s, env_step=151000, gradient_step=151000, len=283, loss/actor=-291.306, loss/critic1=9.209, loss/critic2=7.646, n/ep=0, n/st=1, rew=970.91]                             


Epoch #151: test_reward: 3363.061927 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #152: 1001it [00:14, 68.19it/s, env_step=152000, gradient_step=152000, len=868, loss/actor=-290.857, loss/critic1=9.164, loss/critic2=6.787, n/ep=0, n/st=1, rew=2860.63]                           


Epoch #152: test_reward: 3345.102880 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #153: 1001it [00:14, 69.07it/s, env_step=153000, gradient_step=153000, len=1000, loss/actor=-290.769, loss/critic1=24.713, loss/critic2=23.943, n/ep=0, n/st=1, rew=3319.79]                          


Epoch #153: test_reward: 3257.526801 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #154: 1001it [00:14, 68.48it/s, env_step=154000, gradient_step=154000, len=1000, loss/actor=-291.082, loss/critic1=7.270, loss/critic2=8.574, n/ep=0, n/st=1, rew=3233.01]                           


Epoch #154: test_reward: 2670.737431 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #155: 1001it [00:14, 68.96it/s, env_step=155000, gradient_step=155000, len=908, loss/actor=-290.951, loss/critic1=13.545, loss/critic2=13.565, n/ep=0, n/st=1, rew=3028.40]                          


Epoch #155: test_reward: 3313.303995 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #156: 1001it [00:14, 68.80it/s, env_step=156000, gradient_step=156000, len=1000, loss/actor=-290.338, loss/critic1=15.641, loss/critic2=16.593, n/ep=0, n/st=1, rew=3318.60]                          


Epoch #156: test_reward: 2341.503812 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #157: 1001it [00:14, 68.70it/s, env_step=157000, gradient_step=157000, len=479, loss/actor=-289.945, loss/critic1=19.474, loss/critic2=19.120, n/ep=0, n/st=1, rew=1644.37]                          


Epoch #157: test_reward: 3207.419659 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #158: 1001it [00:14, 68.10it/s, env_step=158000, gradient_step=158000, len=329, loss/actor=-290.475, loss/critic1=10.888, loss/critic2=10.689, n/ep=0, n/st=1, rew=1150.73]                          


Epoch #158: test_reward: 3398.218130 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #159: 1001it [00:14, 67.93it/s, env_step=159000, gradient_step=159000, len=1000, loss/actor=-290.824, loss/critic1=13.484, loss/critic2=13.187, n/ep=0, n/st=1, rew=3372.17]                          


Epoch #159: test_reward: 3362.624642 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #160: 1001it [00:14, 68.92it/s, env_step=160000, gradient_step=160000, len=473, loss/actor=-291.172, loss/critic1=8.561, loss/critic2=9.349, n/ep=0, n/st=1, rew=1648.43]                            


Epoch #160: test_reward: 1591.622982 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #161: 1001it [00:14, 68.85it/s, env_step=161000, gradient_step=161000, len=820, loss/actor=-290.735, loss/critic1=15.819, loss/critic2=15.798, n/ep=0, n/st=1, rew=2902.69]                          


Epoch #161: test_reward: 3316.422246 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #162: 1001it [00:14, 68.46it/s, env_step=162000, gradient_step=162000, len=1000, loss/actor=-290.856, loss/critic1=20.170, loss/critic2=20.569, n/ep=0, n/st=1, rew=3345.75]                          


Epoch #162: test_reward: 3156.253208 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #163: 1001it [00:14, 68.59it/s, env_step=163000, gradient_step=163000, len=1000, loss/actor=-290.693, loss/critic1=5.302, loss/critic2=6.256, n/ep=0, n/st=1, rew=3257.78]                           


Epoch #163: test_reward: 3350.640278 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #164: 1001it [00:14, 68.95it/s, env_step=164000, gradient_step=164000, len=1000, loss/actor=-290.630, loss/critic1=12.665, loss/critic2=10.749, n/ep=0, n/st=1, rew=3237.74]                          


Epoch #164: test_reward: 3317.126128 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #165: 1001it [00:14, 68.99it/s, env_step=165000, gradient_step=165000, len=1000, loss/actor=-290.981, loss/critic1=14.398, loss/critic2=15.558, n/ep=0, n/st=1, rew=3339.18]                          


Epoch #165: test_reward: 3269.493067 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #166: 1001it [00:14, 69.24it/s, env_step=166000, gradient_step=166000, len=697, loss/actor=-290.801, loss/critic1=9.002, loss/critic2=8.211, n/ep=0, n/st=1, rew=2276.17]                            


Epoch #166: test_reward: 3295.419634 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #167: 1001it [00:14, 68.88it/s, env_step=167000, gradient_step=167000, len=1000, loss/actor=-290.717, loss/critic1=9.639, loss/critic2=11.256, n/ep=0, n/st=1, rew=3332.96]                          


Epoch #167: test_reward: 3345.889441 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #168: 1001it [00:14, 69.21it/s, env_step=168000, gradient_step=168000, len=1000, loss/actor=-290.384, loss/critic1=15.310, loss/critic2=13.164, n/ep=0, n/st=1, rew=3336.73]                          


Epoch #168: test_reward: 3302.148203 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #169: 1001it [00:14, 68.75it/s, env_step=169000, gradient_step=169000, len=789, loss/actor=-290.508, loss/critic1=13.440, loss/critic2=13.168, n/ep=0, n/st=1, rew=2593.01]                          


Epoch #169: test_reward: 3341.262108 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #170: 1001it [00:14, 69.20it/s, env_step=170000, gradient_step=170000, len=752, loss/actor=-290.294, loss/critic1=15.636, loss/critic2=13.900, n/ep=0, n/st=1, rew=2589.12]                          


Epoch #170: test_reward: 3352.847404 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #171: 1001it [00:14, 68.50it/s, env_step=171000, gradient_step=171000, len=1000, loss/actor=-290.711, loss/critic1=11.757, loss/critic2=10.746, n/ep=0, n/st=1, rew=3271.69]                          


Epoch #171: test_reward: 501.507026 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #172: 1001it [00:14, 69.05it/s, env_step=172000, gradient_step=172000, len=1000, loss/actor=-290.846, loss/critic1=16.260, loss/critic2=16.263, n/ep=0, n/st=1, rew=3289.84]                          


Epoch #172: test_reward: 3200.394510 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #173: 1001it [00:14, 68.53it/s, env_step=173000, gradient_step=173000, len=188, loss/actor=-291.094, loss/critic1=13.159, loss/critic2=12.713, n/ep=0, n/st=1, rew=510.55]                           


Epoch #173: test_reward: 3304.094476 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #174: 1001it [00:14, 69.19it/s, env_step=174000, gradient_step=174000, len=1000, loss/actor=-290.680, loss/critic1=17.013, loss/critic2=16.366, n/ep=0, n/st=1, rew=3321.53]                          


Epoch #174: test_reward: 3220.472590 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #175: 1001it [00:14, 68.95it/s, env_step=175000, gradient_step=175000, len=1000, loss/actor=-291.432, loss/critic1=17.274, loss/critic2=18.192, n/ep=0, n/st=1, rew=3279.05]                          


Epoch #175: test_reward: 3374.337750 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #176: 1001it [00:14, 68.81it/s, env_step=176000, gradient_step=176000, len=1000, loss/actor=-290.061, loss/critic1=20.691, loss/critic2=19.616, n/ep=0, n/st=1, rew=3348.28]                          


Epoch #176: test_reward: 3355.879778 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #177: 1001it [00:14, 68.94it/s, env_step=177000, gradient_step=177000, len=700, loss/actor=-290.050, loss/critic1=13.882, loss/critic2=15.035, n/ep=0, n/st=1, rew=2419.73]                          


Epoch #177: test_reward: 3372.655653 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #178: 1001it [00:14, 69.18it/s, env_step=178000, gradient_step=178000, len=1000, loss/actor=-289.932, loss/critic1=23.516, loss/critic2=23.229, n/ep=0, n/st=1, rew=3323.27]                          


Epoch #178: test_reward: 1321.313619 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #179: 1001it [00:14, 69.23it/s, env_step=179000, gradient_step=179000, len=946, loss/actor=-289.578, loss/critic1=13.857, loss/critic2=13.896, n/ep=0, n/st=1, rew=3107.41]                          


Epoch #179: test_reward: 3380.971592 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #180: 1001it [00:14, 67.78it/s, env_step=180000, gradient_step=180000, len=1000, loss/actor=-289.641, loss/critic1=22.624, loss/critic2=22.504, n/ep=0, n/st=1, rew=3345.07]                          


Epoch #180: test_reward: 3352.952050 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #181: 1001it [00:14, 68.46it/s, env_step=181000, gradient_step=181000, len=991, loss/actor=-289.946, loss/critic1=23.723, loss/critic2=21.951, n/ep=0, n/st=1, rew=3296.33]                          


Epoch #181: test_reward: 3343.352012 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #182: 1001it [00:14, 68.98it/s, env_step=182000, gradient_step=182000, len=1000, loss/actor=-289.532, loss/critic1=14.242, loss/critic2=16.803, n/ep=0, n/st=1, rew=3329.97]                          


Epoch #182: test_reward: 3407.993358 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #183: 1001it [00:14, 69.57it/s, env_step=183000, gradient_step=183000, len=1000, loss/actor=-290.197, loss/critic1=16.784, loss/critic2=14.830, n/ep=0, n/st=1, rew=3306.41]                          


Epoch #183: test_reward: 3132.594382 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #184: 1001it [00:14, 69.25it/s, env_step=184000, gradient_step=184000, len=1000, loss/actor=-289.623, loss/critic1=17.053, loss/critic2=17.509, n/ep=0, n/st=1, rew=3297.93]                          


Epoch #184: test_reward: 3388.841533 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #185: 1001it [00:14, 69.32it/s, env_step=185000, gradient_step=185000, len=1000, loss/actor=-290.089, loss/critic1=10.662, loss/critic2=9.537, n/ep=0, n/st=1, rew=3370.22]                          


Epoch #185: test_reward: 7.887975 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #186: 1001it [00:15, 64.78it/s, env_step=186000, gradient_step=186000, len=280, loss/actor=-289.973, loss/critic1=14.602, loss/critic2=15.291, n/ep=0, n/st=1, rew=974.59]                          


Epoch #186: test_reward: 3381.115416 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #187: 1001it [00:15, 62.62it/s, env_step=187000, gradient_step=187000, len=333, loss/actor=-289.738, loss/critic1=22.402, loss/critic2=22.569, n/ep=0, n/st=1, rew=1155.92]                          


Epoch #187: test_reward: 3370.604258 ± 0.000000, best_reward: 3427.857606 ± 0.000000 in #125


Epoch #188: 1001it [00:14, 69.10it/s, env_step=188000, gradient_step=188000, len=1000, loss/actor=-289.738, loss/critic1=22.671, loss/critic2=23.093, n/ep=0, n/st=1, rew=3333.34]                          


Epoch #188: test_reward: 3434.357724 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #189: 1001it [00:14, 69.20it/s, env_step=189000, gradient_step=189000, len=1000, loss/actor=-290.232, loss/critic1=22.457, loss/critic2=22.868, n/ep=0, n/st=1, rew=3288.09]                          


Epoch #189: test_reward: 3339.291478 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #190: 1001it [00:14, 68.33it/s, env_step=190000, gradient_step=190000, len=1000, loss/actor=-289.456, loss/critic1=16.739, loss/critic2=15.981, n/ep=0, n/st=1, rew=3282.45]                          


Epoch #190: test_reward: 3373.456556 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #191: 1001it [00:14, 68.61it/s, env_step=191000, gradient_step=191000, len=1000, loss/actor=-289.694, loss/critic1=15.799, loss/critic2=13.520, n/ep=0, n/st=1, rew=3387.26]                          


Epoch #191: test_reward: 3297.390113 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #192: 1001it [00:14, 69.22it/s, env_step=192000, gradient_step=192000, len=1000, loss/actor=-289.873, loss/critic1=20.959, loss/critic2=20.358, n/ep=0, n/st=1, rew=3322.75]                          


Epoch #192: test_reward: 3334.935184 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #193: 1001it [00:14, 68.62it/s, env_step=193000, gradient_step=193000, len=1000, loss/actor=-289.748, loss/critic1=20.433, loss/critic2=20.676, n/ep=0, n/st=1, rew=3321.01]                          


Epoch #193: test_reward: 3386.409722 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #194: 1001it [00:14, 69.09it/s, env_step=194000, gradient_step=194000, len=869, loss/actor=-290.162, loss/critic1=14.103, loss/critic2=13.114, n/ep=0, n/st=1, rew=2961.83]                          


Epoch #194: test_reward: 3274.227947 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #195: 1001it [00:14, 68.54it/s, env_step=195000, gradient_step=195000, len=1000, loss/actor=-290.489, loss/critic1=12.415, loss/critic2=12.962, n/ep=0, n/st=1, rew=3284.81]                          


Epoch #195: test_reward: 3317.956038 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #196: 1001it [00:14, 68.94it/s, env_step=196000, gradient_step=196000, len=1000, loss/actor=-290.276, loss/critic1=29.402, loss/critic2=26.613, n/ep=0, n/st=1, rew=3305.44]                          


Epoch #196: test_reward: 3400.815168 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #197: 1001it [00:14, 68.51it/s, env_step=197000, gradient_step=197000, len=1000, loss/actor=-290.482, loss/critic1=17.869, loss/critic2=18.750, n/ep=0, n/st=1, rew=3425.30]                          


Epoch #197: test_reward: 3374.946529 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #198: 1001it [00:14, 69.09it/s, env_step=198000, gradient_step=198000, len=510, loss/actor=-290.629, loss/critic1=10.223, loss/critic2=9.950, n/ep=0, n/st=1, rew=1772.99]                           


Epoch #198: test_reward: 3326.646583 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #199: 1001it [00:14, 68.35it/s, env_step=199000, gradient_step=199000, len=254, loss/actor=-290.538, loss/critic1=16.891, loss/critic2=15.512, n/ep=0, n/st=1, rew=891.34]                          


Epoch #199: test_reward: 3390.424575 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188


Epoch #200: 1001it [00:14, 68.58it/s, env_step=200000, gradient_step=200000, len=1000, loss/actor=-290.436, loss/critic1=16.887, loss/critic2=18.142, n/ep=0, n/st=1, rew=3396.41]                          


Epoch #200: test_reward: 3372.969074 ± 0.000000, best_reward: 3434.357724 ± 0.000000 in #188
