In [1]:
import os
import argparse
import datetime

import torch
from torch import nn
import numpy as np
import gymnasium as gym

from tianshou.data import Collector, Batch, to_torch
from tianshou.data.types import RolloutBatchProtocol
from tianshou.data.buffer.vecbuf import VectorReplayBuffer, ReplayBuffer
from tianshou.env import SubprocVectorEnv
from tianshou.policy import SACPolicy, BasePolicy
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic
from examples.offline.utils import load_buffer_d4rl
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
from tianshou.trainer import OffpolicyTrainer

/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/glfw/__init__.py:916: GLFWError: (65544) b'X11: The DISPLAY environment variable is missing'
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 262, in register_gymnasium_envs
    _register_dm_control_envs()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 26, in _register_dm_control_envs
    from shimmy.dm_control_compatibility import DmControlCompatibilityV0
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/dm_control_compatibility.py", line 12, in <module>
    import dm_env
ModuleNotFoundError: No module named 'dm_env'
[0m
  logger.warn(f"plugin: {plugin.value} raised {traceback.format_exc()}")
No module named 'mjrl'
No module named 'flow'
No modu

In [2]:
# Prefer the GPU index this notebook was developed on, but fall back to the CPU
# when fewer than four CUDA devices are visible so the notebook still runs elsewhere.
device = "cuda:3" if torch.cuda.device_count() > 3 else "cpu"

In [3]:
def get_sac_args():
    """Return the fixed SAC hyperparameters used throughout this notebook.

    A fresh ``argparse.Namespace`` is built on every call, so callers may add
    attributes to the result without affecting other callers. The ``device``
    entry is read from the notebook-level ``device`` variable at call time.
    """
    hyperparams = {
        # environment / replay
        "task": "Hopper-v2",
        "buffer_size": 1000000,
        # network and optimiser settings
        "hidden_sizes": [256, 256, 256],
        "actor_lr": 1e-4,
        "critic_lr": 3e-4,
        # SAC-specific coefficients
        "gamma": 0.99,
        "tau": 0.005,
        "alpha_lr": 1e-4,
        "alpha": 0.36,
        # training schedule
        "start_timesteps": 1,
        "epoch": 200,
        "step_per_epoch": 5000,
        "step_per_collect": 1,
        "update_per_step": 1,
        "batch_size": 256,
        "training_num": 1,
        "test_num": 10,
        # runtime
        "device": device,
        "norm_layer": True,
    }
    return argparse.Namespace(**hyperparams)

In [4]:
# Notebook-level hyperparameters plus one plain (non-vectorised) environment for
# `args.task`. Later cells read `args.task` and `env.action_space`, and
# `get_returns` steps this `env` directly.
args = get_sac_args()
env = gym.make(args.task)

  logger.deprecation(
  logger.deprecation(


In [5]:
def load_policy(path):
    """Rebuild the SAC actor/critic networks and load saved weights into them.

    The architecture is derived from ``get_sac_args()`` and from the
    observation/action spaces of ``args.task``, so it has to match the
    architecture the checkpoint at ``path`` was saved with.

    Args:
        path: Location of a policy state dict readable by ``torch.load``.

    Returns:
        A ``SACPolicy`` whose networks live on ``args.device`` and carry the
        weights stored at ``path``.
    """
    args = get_sac_args()

    # The environment is only needed for its spaces; close it right away so
    # repeated calls do not leave simulator instances open.
    env = gym.make(args.task)
    try:
        observation_space = env.observation_space
        action_space = env.action_space
    finally:
        env.close()

    args.state_shape = observation_space.shape or observation_space.n
    args.action_shape = action_space.shape or action_space.n
    args.max_action = action_space.high[0]

    # Actor: the backbone is built without a norm layer, unlike the critics below.
    net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = ActorProb(
        net_a,
        args.action_shape,
        device=args.device,
        unbounded=True,
        conditioned_sigma=True,
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    def _critic_backbone():
        # Both critics share one architecture: observation and action are
        # concatenated and fed through an MLP with optional LayerNorm.
        return Net(
            args.state_shape,
            args.action_shape,
            hidden_sizes=args.hidden_sizes,
            concat=True,
            device=args.device,
            norm_layer=nn.LayerNorm if args.norm_layer else None,
        )

    net_c1 = _critic_backbone()
    net_c2 = _critic_backbone()
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    # Learnable entropy temperature, passed to SACPolicy as
    # (target_entropy, log_alpha, optimizer). Uses args.device like every other
    # tensor here instead of the notebook-level `device` global.
    target_entropy = -np.prod(action_space.shape)
    log_alpha = torch.tensor([np.log(args.alpha)], requires_grad=True, device=args.device)
    alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
    alpha = (target_entropy, log_alpha, alpha_optim)

    policy = SACPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        tau=args.tau,
        gamma=args.gamma,
        alpha=alpha,
        action_space=action_space,
    )
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    policy.load_state_dict(torch.load(path, map_location=args.device))
    print("Loaded agent from: ", path)
    return policy

In [19]:
# Weights from the CQL run, loaded into the SACPolicy architecture that load_policy builds.
# NOTE(review): absolute, user-specific path — the notebook is not portable as written.
cql_policy = load_policy("/data/user/R901105/dev/log/Hopper-v2/cql/0/231220-111219/policy.pth")

Loaded agent from:  /data/user/R901105/dev/log/Hopper-v2/cql/0/231220-111219/policy.pth


  logger.deprecation(
  logger.deprecation(


In [7]:
# Weights from the SAC run, loaded into the same architecture as `cql_policy`.
# NOTE(review): absolute, user-specific path — the notebook is not portable as written.
sac_policy = load_policy("/data/user/R901105/dev/log/Hopper-v2/sac/0/231219-163624/policy.pth")

Loaded agent from:  /data/user/R901105/dev/log/Hopper-v2/sac/0/231219-163624/policy.pth


  logger.deprecation(
  logger.deprecation(


In [8]:
# Ten subprocess-backed copies of `args.task`; reused by the evaluation collectors in later cells.
envs = SubprocVectorEnv([lambda: gym.make(args.task) for _ in range(10)])

In [9]:
# Evaluate the CQL-trained policy: switch it to eval mode, collect 40 episodes
# across `envs`, and display the statistics returned by the collector.
cql_policy.eval()
cql_collector = Collector(cql_policy, envs)
cql_result = cql_collector.collect(n_episode=40)
cql_result

{'n/ep': 40,
 'n/st': 26003,
 'rews': array([1486.25269722, 1823.26179735, 1815.41471127, 1823.64823606,
        2087.71584701, 2396.16515738, 2461.0877074 , 2686.45698862,
        2770.80036959, 3384.23060393, 1842.48922955, 1866.94245461,
        1898.13672459, 1807.75406455, 1417.38894988, 1665.01841421,
        2181.79117767, 1902.67316167, 3142.5463871 , 1484.83423753,
        2180.77193707, 2174.34573781, 1804.64465422, 2182.29790974,
        1899.60368569, 2745.06289326, 2078.45634837, 2320.67582828,
        1856.80881647, 2773.04795628, 2183.64806295, 2268.10985287,
        1907.08551018, 1816.56290292, 1843.81815116, 2998.18710922,
        1994.10289773, 2978.34000772, 2359.09940191, 2161.83989175]),
 'lens': array([455, 546, 551, 551, 633, 717, 745, 798, 829, 998, 551, 562, 575,
        546, 436, 505, 655, 575, 938, 454, 657, 655, 548, 656, 575, 822,
        619, 696, 556, 833, 659, 683, 574, 551, 557, 891, 600, 890, 711,
        650]),
 'idxs': array([1, 7, 4, 6, 2, 3, 0, 8,

In [10]:
# Evaluate the SAC-trained policy the same way as the CQL policy: eval mode,
# 40 episodes across `envs`, then display the collector statistics.
sac_policy.eval()
sac_collector = Collector(sac_policy, envs)
sac_result = sac_collector.collect(n_episode=40)
sac_result

{'n/ep': 40,
 'n/st': 38521,
 'rews': array([1650.7395058 , 2593.72564536, 3602.29768638, 3553.53303002,
        3580.08063385, 3579.30580244, 3520.1980104 , 3553.8322047 ,
        3550.06542435, 3569.61099467, 3607.30669895, 3578.27491201,
        3528.71862059, 3586.13532283, 3559.06398147, 3580.10912595,
        3571.73881482, 3570.89898834, 3521.56364915, 3546.21762465,
        3533.51493836, 2312.71887543, 3524.73381178, 3234.52668798,
        3129.11860956, 3535.58325009, 3591.21120569, 3570.74677282,
        3568.84516274, 3556.83502317, 3581.03732314, 3564.99177602,
        3575.07730797, 3533.45569568, 3550.81724913, 3535.46933535,
        3562.03007878, 3568.32463834, 3497.97434734, 3642.65175359]),
 'lens': array([ 478,  702, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  641,
         962,  861,  877, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000]),

In [81]:
class MixedPolicy(BasePolicy):
    """Policy that chooses, per observation, between a base and an expert policy.

    For every observation both wrapped policies propose an action. Each
    proposal is scored by the minimum of the two critics of the policy that
    produced it, and the expert's action is used wherever the base policy's
    score is lower than the expert's. Learning and batch pre-processing are
    delegated to the base policy only.

    Both wrapped policies must expose ``critic1`` and ``critic2`` callables
    taking ``(obs, act)``.
    """

    def __init__(self, base_policy: BasePolicy, expert_policy: BasePolicy, action_space, buffer):
        super().__init__(action_space=action_space, action_scaling=True)
        self.base_policy = base_policy
        self.expert_policy = expert_policy
        # Taken from the notebook-level `device` setting, not from the wrapped policies.
        self.device = device
        # Stored for reference only; no method of this class reads it.
        self.buffer = buffer

    def forward(self, batch: RolloutBatchProtocol, state=None, **kwargs):
        """Select an action for each observation in ``batch``.

        Side effect: both wrapped policies are switched to eval mode and are
        not switched back afterwards.

        Returns:
            A ``Batch`` with ``act`` (the selected actions) and
            ``policy.cede_ctrl`` (boolean tensor, True where the expert's
            action was chosen).
        """
        batch = to_torch(batch, dtype=torch.float32, device=self.device)
        self.base_policy.eval()
        self.expert_policy.eval()
        with torch.no_grad():
            expert_result = self.expert_policy(batch)
            base_result = self.base_policy(batch)
            # Each proposal is scored by its own policy's critics (pessimistic min of the pair).
            expert_qvalues = torch.minimum(
                self.expert_policy.critic1(batch.obs, expert_result.act),
                self.expert_policy.critic2(batch.obs, expert_result.act),
            )
            base_qvalues = torch.minimum(
                self.base_policy.critic1(batch.obs, base_result.act),
                self.base_policy.critic2(batch.obs, base_result.act),
            )
        # Hand control to the expert wherever the base policy values its own action less.
        cede_ctrl = base_qvalues < expert_qvalues
        actions = torch.where(cede_ctrl, expert_result.act, base_result.act)
        return Batch(act=actions, policy=Batch(cede_ctrl=cede_ctrl))

    def train(self, mode: bool = True) -> "MixedPolicy":
        """Set the training mode of the base policy only; the expert is left untouched."""
        self.base_policy.train(mode)
        return self

    def process_fn(self, batch: RolloutBatchProtocol, buffer: ReplayBuffer, indices: np.ndarray) -> RolloutBatchProtocol:
        """Delegate batch pre-processing to the base policy."""
        return self.base_policy.process_fn(batch, buffer, indices)

    def learn(self, batch, **kwargs):
        """Update the base policy on the full ``batch`` and return its info.

        Every transition is used, including those where control was ceded to
        the expert.
        """
        return self.base_policy.learn(batch)

In [11]:
# Vectorised replay buffer (total_size=5000, buffer_num=5); handed to MixedPolicy in the next cell.
test_buffer = VectorReplayBuffer(5000, 5)

In [82]:
# The CQL-trained policy acts as the base; the SAC-trained policy is the expert it may cede control to.
mixed_policy = MixedPolicy(cql_policy, sac_policy, env.action_space, test_buffer)

In [12]:
# Evaluate the mixed policy on the same vectorised envs and episode budget as the
# CQL and SAC evaluations. No explicit .eval() call here: MixedPolicy.forward
# switches both wrapped policies to eval mode itself.
mixed_collector = Collector(mixed_policy, envs)
mixed_result = mixed_collector.collect(n_episode=40)
mixed_result

{'n/ep': 40,
 'n/st': 39442,
 'rews': array([2877.85420345, 3579.38976417, 3566.2887685 , 3612.26257751,
        3648.73333248, 3655.45149677, 3559.89113674, 3552.6419363 ,
        3588.0665408 , 3578.10143437, 3576.98877615, 3561.46943533,
        3558.70658515, 3598.58161738, 3558.16991895, 3525.51417879,
        3579.98865515, 3558.74737203, 3518.58784384, 3641.33416581,
        3563.76800552, 3231.92112228, 3558.71492095, 3535.73037386,
        3558.96929922, 3583.61728139, 3556.20513785, 3542.30987626,
        3584.5683532 , 3573.77252436, 2903.35971768, 3574.50541953,
        3551.28549583, 3540.74278544, 3513.75433021, 3546.94601085,
        3569.82593565, 3520.37927402, 3546.47417846, 3562.03266267]),
 'lens': array([ 798, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  869,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  775, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000]),

In [59]:
class DeterministicPolicy(BasePolicy):
    """Thin wrapper that always puts the wrapped policy in eval mode before acting.

    Learning, batch pre-processing and train-mode switching are forwarded to
    the wrapped policy unchanged. Whether eval mode actually yields
    deterministic actions depends on the wrapped policy.
    """

    def __init__(self, policy, action_space):
        super().__init__(action_space=action_space, action_scaling=True)
        self.policy = policy

    def forward(self, batch, state=None, **kwargs):
        """Run the wrapped policy on ``batch`` after forcing it into eval mode.

        ``state`` and ``kwargs`` are accepted for interface compatibility but
        are not passed on.
        """
        wrapped = self.policy
        wrapped.eval()
        return wrapped(batch)

    def process_fn(self, batch: RolloutBatchProtocol, buffer: ReplayBuffer, indices: np.ndarray) -> RolloutBatchProtocol:
        """Forward batch pre-processing to the wrapped policy."""
        return self.policy.process_fn(batch, buffer, indices)

    def learn(self, batch, **kwargs):
        """Forward the update step to the wrapped policy and return its info."""
        return self.policy.learn(batch)

    def train(self, mode: bool = True) -> "DeterministicPolicy":
        """Set the training mode on the wrapped policy and return this wrapper."""
        self.policy.train(mode)
        return self

In [60]:
# Wrap the CQL-trained policy so that it is forced into eval mode on every forward call.
det_cql = DeterministicPolicy(cql_policy, env.action_space)

In [None]:
# Evaluate the eval-mode wrapper itself. Previously this cell built the collector
# from the raw `cql_policy`, then collected from the older `cql_collector` and
# overwrote `cql_result`, so `det_cql` was never actually evaluated.
det_cql_collector = Collector(det_cql, envs)
det_cql_result = det_cql_collector.collect(n_episode=40)
det_cql_result

In [13]:
# Five subprocess-backed copies of `args.task`, used by the trainer's test collector.
test_envs = SubprocVectorEnv([lambda: gym.make(args.task) for _ in range(5)])

  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(


In [18]:
# Load the D4RL "hopper-medium-v2" dataset through the tianshou examples helper.
# The result is used as the replay buffer for the collector and trainer below.
offline_data = load_buffer_d4rl("hopper-medium-v2")

  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile:   5%|▍         | 1/21 [00:00<00:02,  9.40it/s]

load datafile: 100%|██████████| 21/21 [00:01<00:00, 13.28it/s]


In [20]:
# Policy that the collector, logging and trainer cells below operate on.
policy = cql_policy

In [15]:
# Re-creates `test_buffer` (same arguments as the earlier cell), replacing the earlier object.
test_buffer = VectorReplayBuffer(5000, 5)
# The training collector steps the single `env` and is given `offline_data` as its buffer.
train_collector = Collector(policy, env, offline_data)
# The test collector runs on the five `test_envs` with its own buffer.
test_collector = Collector(policy, test_envs, test_buffer)



In [21]:
# TensorBoard logging: one timestamped run directory per execution of this cell.
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
# The directory is named after the policy *class*, so a policy built by load_policy
# is logged under "SACPolicy" regardless of which checkpoint it was loaded from.
log_name = os.path.join(args.task, policy.__class__.__name__, now)
# NOTE(review): absolute, user-specific log root — the notebook is not portable as written.
log_path = os.path.join("/data/user/R901105/dev/log", log_name)
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer)
print(log_path)

/data/user/R901105/dev/log/Hopper-v2/SACPolicy/240423-173106


In [15]:
def get_returns(policy, return_cede_ctrl=False, n_episodes=5):
    """Roll out ``policy`` in the notebook-level ``env`` and collect episode returns.

    Args:
        policy: Callable policy returning a result with a tensor ``act`` of
            shape ``(1, action_dim)``; it must also provide ``eval()`` and
            ``map_action()``.
        return_cede_ctrl: When True, additionally read
            ``result.policy.cede_ctrl`` at every step (as produced by
            ``MixedPolicy``) and report how often control was ceded.
        n_episodes: Number of episodes to run. Defaults to 5, the value that
            used to be hard-coded.

    Returns:
        A tuple ``(returns, cede_ctrl)``. ``returns`` is an array holding the
        undiscounted return of each episode. ``cede_ctrl`` is the mean over
        episodes of the per-episode mean of the ``cede_ctrl`` flag, or
        ``None`` when ``return_cede_ctrl`` is False.
    """
    policy.eval()
    returns = []
    cede_ctrl = []
    for _ in range(n_episodes):
        obs, info = env.reset()
        done = False
        cum_reward = 0
        cctrl_ep = []
        while not done:
            # Add a leading batch axis of size 1 for the policy.
            batch = Batch(obs=torch.from_numpy(np.expand_dims(obs, 0)).to(device), info=info)
            with torch.no_grad():
                result = policy(batch)
            # Drop only the batch axis so that 1-D action spaces keep shape (1,).
            act = result.act.cpu().squeeze(0).numpy()
            act = policy.map_action(act)
            if return_cede_ctrl:
                cctrl_ep.append(result.policy.cede_ctrl.cpu().squeeze().numpy())
            obs, reward, terminated, truncated, info = env.step(act)
            cum_reward += reward
            done = terminated or truncated
        returns.append(cum_reward)
        if return_cede_ctrl:
            cede_ctrl.append(np.array(cctrl_ep).mean())
    if return_cede_ctrl:
        return np.array(returns), np.array(cede_ctrl).mean()
    # Keep the two-element shape callers unpack, but return None instead of the
    # leftover loop counter that was returned here before.
    return np.array(returns), None

In [16]:
def test_fn(num_epoch: int, step_idx: int):
    """Trainer test hook: print the mean episode return and the mean cede-control rate.

    Evaluates the notebook-level ``policy`` through ``get_returns`` with
    cede-control tracking enabled. ``num_epoch`` and ``step_idx`` are accepted
    for the hook signature but are not used.
    """
    episode_returns, ceded_fraction = get_returns(policy, True)
    mean_return = episode_returns.mean()
    print(mean_return, ceded_fraction.mean())

In [22]:
# Fine-tune `policy` with the off-policy trainer, sampling updates from `offline_data`
# (the same buffer the training collector was given).
# The schedule values below are hard-coded here rather than read from `args`
# (which lists epoch=200, step_per_epoch=5000, batch_size=256).
result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    buffer=offline_data,
    # Custom test hook is disabled; it calls get_returns with cede-control
    # tracking, which reads `result.policy.cede_ctrl` from the policy output.
    # test_fn=test_fn,
    max_epoch=200,
    step_per_epoch=1000,
    step_per_collect=1,
    # A single test episode per epoch, hence the "± 0.000000" in the logged test_reward.
    episode_per_test=1,
    batch_size=256,
    logger=logger,
    update_per_step=1,
    test_in_train=False,
).run()

Epoch #1: 1001it [00:15, 64.63it/s, alpha=0.323, env_step=1000, gradient_step=1000, len=600, loss/actor=-265.235, loss/alpha=-1.450, loss/critic1=10.468, loss/critic2=10.492, n/ep=0, n/st=1, rew=2024.22]                          


Epoch #1: test_reward: 3150.884889 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #2: 1001it [00:16, 60.38it/s, alpha=0.290, env_step=2000, gradient_step=2000, len=446, loss/actor=-263.216, loss/alpha=-1.732, loss/critic1=9.273, loss/critic2=9.195, n/ep=0, n/st=1, rew=1515.25]                           


Epoch #2: test_reward: 3051.355611 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #3: 1001it [00:16, 60.82it/s, alpha=0.261, env_step=3000, gradient_step=3000, len=385, loss/actor=-261.482, loss/alpha=-1.961, loss/critic1=6.918, loss/critic2=7.410, n/ep=0, n/st=1, rew=1276.27]                           


Epoch #3: test_reward: 1518.441726 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #4: 1001it [00:17, 58.05it/s, alpha=0.235, env_step=4000, gradient_step=4000, len=775, loss/actor=-261.592, loss/alpha=-2.190, loss/critic1=3.678, loss/critic2=3.964, n/ep=0, n/st=1, rew=2486.17]                            


Epoch #4: test_reward: 1499.868879 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #5: 1001it [00:16, 59.37it/s, alpha=0.213, env_step=5000, gradient_step=5000, len=387, loss/actor=-260.548, loss/alpha=-2.274, loss/critic1=5.278, loss/critic2=5.245, n/ep=0, n/st=1, rew=1285.96]                           


Epoch #5: test_reward: 2326.359527 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #6: 1001it [00:17, 58.79it/s, alpha=0.193, env_step=6000, gradient_step=6000, len=681, loss/actor=-260.769, loss/alpha=-2.299, loss/critic1=21.613, loss/critic2=21.667, n/ep=0, n/st=1, rew=2302.47]                          


Epoch #6: test_reward: 2495.793671 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #7: 1001it [00:16, 62.05it/s, alpha=0.175, env_step=7000, gradient_step=7000, len=446, loss/actor=-260.838, loss/alpha=-2.431, loss/critic1=5.823, loss/critic2=5.704, n/ep=0, n/st=1, rew=1532.01]                          


Epoch #7: test_reward: 2546.746616 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #8: 1001it [00:16, 61.03it/s, alpha=0.159, env_step=8000, gradient_step=8000, len=618, loss/actor=-260.367, loss/alpha=-2.329, loss/critic1=3.565, loss/critic2=3.313, n/ep=0, n/st=1, rew=2021.87]                           


Epoch #8: test_reward: 1734.818212 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #9: 1001it [00:16, 62.16it/s, alpha=0.145, env_step=9000, gradient_step=9000, len=530, loss/actor=-260.913, loss/alpha=-2.214, loss/critic1=5.988, loss/critic2=5.375, n/ep=0, n/st=1, rew=1816.44]                          


Epoch #9: test_reward: 2299.305655 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #10: 1001it [00:16, 61.41it/s, alpha=0.132, env_step=10000, gradient_step=10000, len=997, loss/actor=-262.608, loss/alpha=-2.144, loss/critic1=5.283, loss/critic2=5.294, n/ep=0, n/st=1, rew=3259.78]                          


Epoch #10: test_reward: 1161.159407 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #11: 1001it [00:16, 61.40it/s, alpha=0.120, env_step=11000, gradient_step=11000, len=768, loss/actor=-263.538, loss/alpha=-2.344, loss/critic1=8.242, loss/critic2=8.235, n/ep=0, n/st=1, rew=2524.81]                          


Epoch #11: test_reward: 1806.836718 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #12: 1001it [00:15, 63.55it/s, alpha=0.109, env_step=12000, gradient_step=12000, len=1000, loss/actor=-263.720, loss/alpha=-2.589, loss/critic1=5.388, loss/critic2=5.822, n/ep=0, n/st=1, rew=3220.69]                           


Epoch #12: test_reward: 1446.258235 ± 0.000000, best_reward: 3150.884889 ± 0.000000 in #1


Epoch #13: 1001it [00:16, 60.22it/s, alpha=0.099, env_step=13000, gradient_step=13000, len=1000, loss/actor=-265.382, loss/alpha=-2.351, loss/critic1=5.158, loss/critic2=5.312, n/ep=0, n/st=1, rew=3149.43]                           


Epoch #13: test_reward: 3276.466343 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #14: 1001it [00:15, 63.11it/s, alpha=0.090, env_step=14000, gradient_step=14000, len=410, loss/actor=-264.827, loss/alpha=-2.530, loss/critic1=1.879, loss/critic2=1.895, n/ep=0, n/st=1, rew=1358.36]                          


Epoch #14: test_reward: 3140.961959 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #15: 1001it [00:15, 62.66it/s, alpha=0.082, env_step=15000, gradient_step=15000, len=361, loss/actor=-266.699, loss/alpha=-2.645, loss/critic1=9.413, loss/critic2=9.524, n/ep=0, n/st=1, rew=1182.79]                          


Epoch #15: test_reward: 3100.963179 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #16: 1001it [00:16, 61.56it/s, alpha=0.075, env_step=16000, gradient_step=16000, len=1000, loss/actor=-267.484, loss/alpha=-1.935, loss/critic1=1.520, loss/critic2=1.470, n/ep=0, n/st=1, rew=3135.93]                          


Epoch #16: test_reward: 1539.631426 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #17: 1001it [00:16, 59.73it/s, alpha=0.069, env_step=17000, gradient_step=17000, len=1000, loss/actor=-269.032, loss/alpha=-2.282, loss/critic1=5.119, loss/critic2=5.255, n/ep=0, n/st=1, rew=3147.25]                           


Epoch #17: test_reward: 2739.514495 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #18: 1001it [00:16, 59.78it/s, alpha=0.062, env_step=18000, gradient_step=18000, len=1000, loss/actor=-268.904, loss/alpha=-2.583, loss/critic1=11.680, loss/critic2=11.497, n/ep=0, n/st=1, rew=3182.67]                          


Epoch #18: test_reward: 1220.447837 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #19: 1001it [00:16, 60.23it/s, alpha=0.057, env_step=19000, gradient_step=19000, len=930, loss/actor=-269.363, loss/alpha=-1.911, loss/critic1=12.477, loss/critic2=12.331, n/ep=0, n/st=1, rew=3030.99]                          


Epoch #19: test_reward: 1277.539697 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #20: 1001it [00:15, 63.06it/s, alpha=0.052, env_step=20000, gradient_step=20000, len=588, loss/actor=-270.639, loss/alpha=-1.752, loss/critic1=3.419, loss/critic2=3.401, n/ep=0, n/st=1, rew=1941.56]                           


Epoch #20: test_reward: 1833.551187 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #21: 1001it [00:15, 64.38it/s, alpha=0.049, env_step=21000, gradient_step=21000, len=831, loss/actor=-270.939, loss/alpha=-1.299, loss/critic1=1.417, loss/critic2=1.309, n/ep=0, n/st=1, rew=2706.63]                           


Epoch #21: test_reward: 2602.023109 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #22: 1001it [00:15, 66.55it/s, alpha=0.046, env_step=22000, gradient_step=22000, len=919, loss/actor=-272.129, loss/alpha=-1.268, loss/critic1=4.182, loss/critic2=4.167, n/ep=0, n/st=1, rew=2965.25]                          


Epoch #22: test_reward: 2230.951899 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #23: 1001it [00:15, 66.32it/s, alpha=0.043, env_step=23000, gradient_step=23000, len=864, loss/actor=-273.992, loss/alpha=-0.781, loss/critic1=2.044, loss/critic2=2.173, n/ep=0, n/st=1, rew=2821.43]                           


Epoch #23: test_reward: 1220.884828 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #24: 1001it [00:15, 63.65it/s, alpha=0.041, env_step=24000, gradient_step=24000, len=461, loss/actor=-274.127, loss/alpha=-1.015, loss/critic1=8.767, loss/critic2=8.626, n/ep=0, n/st=1, rew=1554.69]                           


Epoch #24: test_reward: 1607.255830 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #25: 1001it [00:15, 63.18it/s, alpha=0.039, env_step=25000, gradient_step=25000, len=463, loss/actor=-275.253, loss/alpha=-0.309, loss/critic1=7.653, loss/critic2=7.392, n/ep=0, n/st=1, rew=1541.84]                            


Epoch #25: test_reward: 2076.696815 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #26: 1001it [00:15, 65.50it/s, alpha=0.040, env_step=26000, gradient_step=26000, len=550, loss/actor=-276.063, loss/alpha=-0.181, loss/critic1=1.629, loss/critic2=1.674, n/ep=0, n/st=1, rew=1798.26]                          


Epoch #26: test_reward: 3138.571473 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #27: 1001it [00:15, 65.43it/s, alpha=0.040, env_step=27000, gradient_step=27000, len=347, loss/actor=-278.404, loss/alpha=-0.227, loss/critic1=5.389, loss/critic2=5.553, n/ep=0, n/st=1, rew=1125.99]                           


Epoch #27: test_reward: 1900.467066 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #28: 1001it [00:16, 61.36it/s, alpha=0.039, env_step=28000, gradient_step=28000, len=459, loss/actor=-277.933, loss/alpha=0.589, loss/critic1=9.420, loss/critic2=9.604, n/ep=0, n/st=1, rew=1477.49]                             


Epoch #28: test_reward: 2538.168791 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #29: 1001it [00:15, 63.04it/s, alpha=0.040, env_step=29000, gradient_step=29000, len=776, loss/actor=-279.163, loss/alpha=0.595, loss/critic1=8.454, loss/critic2=8.649, n/ep=0, n/st=1, rew=2541.16]                            


Epoch #29: test_reward: 2287.907156 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #30: 1001it [00:15, 64.54it/s, alpha=0.040, env_step=30000, gradient_step=30000, len=381, loss/actor=-280.233, loss/alpha=-0.310, loss/critic1=4.483, loss/critic2=3.924, n/ep=0, n/st=1, rew=1238.03]                           


Epoch #30: test_reward: 3142.463294 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #31: 1001it [00:15, 65.34it/s, alpha=0.039, env_step=31000, gradient_step=31000, len=432, loss/actor=-281.350, loss/alpha=0.027, loss/critic1=1.762, loss/critic2=1.914, n/ep=0, n/st=1, rew=1453.10]                            


Epoch #31: test_reward: 1744.012036 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #32: 1001it [00:15, 64.66it/s, alpha=0.039, env_step=32000, gradient_step=32000, len=993, loss/actor=-283.001, loss/alpha=-0.431, loss/critic1=1.418, loss/critic2=1.449, n/ep=0, n/st=1, rew=3345.54]                           


Epoch #32: test_reward: 2586.593740 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #33: 1001it [00:15, 64.17it/s, alpha=0.037, env_step=33000, gradient_step=33000, len=376, loss/actor=-284.346, loss/alpha=0.059, loss/critic1=4.632, loss/critic2=4.319, n/ep=0, n/st=1, rew=1236.57]                            


Epoch #33: test_reward: 3170.288325 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #34: 1001it [00:15, 64.70it/s, alpha=0.036, env_step=34000, gradient_step=34000, len=595, loss/actor=-285.292, loss/alpha=-0.328, loss/critic1=5.017, loss/critic2=4.999, n/ep=0, n/st=1, rew=1979.82]                          


Epoch #34: test_reward: 1751.466476 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #35: 1001it [00:15, 66.32it/s, alpha=0.035, env_step=35000, gradient_step=35000, len=548, loss/actor=-286.268, loss/alpha=0.137, loss/critic1=3.776, loss/critic2=3.723, n/ep=0, n/st=1, rew=1811.49]                            


Epoch #35: test_reward: 1718.780809 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #36: 1001it [00:15, 66.34it/s, alpha=0.034, env_step=36000, gradient_step=36000, len=909, loss/actor=-287.211, loss/alpha=-0.734, loss/critic1=2.293, loss/critic2=2.090, n/ep=0, n/st=1, rew=2933.61]                           


Epoch #36: test_reward: 3174.878485 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #37: 1001it [00:14, 66.81it/s, alpha=0.033, env_step=37000, gradient_step=37000, len=789, loss/actor=-287.222, loss/alpha=-0.174, loss/critic1=7.086, loss/critic2=7.481, n/ep=0, n/st=1, rew=2632.12]                           


Epoch #37: test_reward: 2871.473399 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #38: 1001it [00:14, 66.99it/s, alpha=0.035, env_step=38000, gradient_step=38000, len=1000, loss/actor=-288.902, loss/alpha=0.227, loss/critic1=11.797, loss/critic2=12.088, n/ep=0, n/st=1, rew=3184.52]                          


Epoch #38: test_reward: 2772.557676 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #39: 1001it [00:14, 66.74it/s, alpha=0.035, env_step=39000, gradient_step=39000, len=1000, loss/actor=-289.987, loss/alpha=-0.130, loss/critic1=2.567, loss/critic2=2.332, n/ep=0, n/st=1, rew=3266.34]                          


Epoch #39: test_reward: 1740.056198 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #40: 1001it [00:15, 66.72it/s, alpha=0.035, env_step=40000, gradient_step=40000, len=1000, loss/actor=-290.705, loss/alpha=-0.285, loss/critic1=8.695, loss/critic2=8.970, n/ep=0, n/st=1, rew=3210.67]                           


Epoch #40: test_reward: 1157.252634 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #41: 1001it [00:15, 66.14it/s, alpha=0.035, env_step=41000, gradient_step=41000, len=455, loss/actor=-290.301, loss/alpha=0.194, loss/critic1=8.675, loss/critic2=8.738, n/ep=0, n/st=1, rew=1480.87]                             


Epoch #41: test_reward: 3163.215912 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #42: 1001it [00:16, 62.11it/s, alpha=0.035, env_step=42000, gradient_step=42000, len=761, loss/actor=-292.333, loss/alpha=-0.522, loss/critic1=7.311, loss/critic2=7.572, n/ep=0, n/st=1, rew=2546.56]                           


Epoch #42: test_reward: 1485.533674 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #43: 1001it [00:16, 62.52it/s, alpha=0.034, env_step=43000, gradient_step=43000, len=1000, loss/actor=-293.330, loss/alpha=-0.081, loss/critic1=10.400, loss/critic2=10.616, n/ep=0, n/st=1, rew=3149.38]                          


Epoch #43: test_reward: 1184.983791 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #44: 1001it [00:15, 65.46it/s, alpha=0.034, env_step=44000, gradient_step=44000, len=858, loss/actor=-294.451, loss/alpha=-0.810, loss/critic1=16.842, loss/critic2=16.938, n/ep=0, n/st=1, rew=2841.65]                          


Epoch #44: test_reward: 843.006448 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #45: 1001it [00:15, 64.59it/s, alpha=0.033, env_step=45000, gradient_step=45000, len=538, loss/actor=-295.028, loss/alpha=0.227, loss/critic1=6.220, loss/critic2=6.106, n/ep=0, n/st=1, rew=1765.82]                           


Epoch #45: test_reward: 1629.954542 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #46: 1001it [00:16, 61.16it/s, alpha=0.032, env_step=46000, gradient_step=46000, len=778, loss/actor=-295.680, loss/alpha=0.245, loss/critic1=10.817, loss/critic2=10.645, n/ep=0, n/st=1, rew=2567.66]                          


Epoch #46: test_reward: 1742.104824 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #47: 1001it [00:15, 63.99it/s, alpha=0.033, env_step=47000, gradient_step=47000, len=442, loss/actor=-295.959, loss/alpha=0.313, loss/critic1=1.677, loss/critic2=1.853, n/ep=0, n/st=1, rew=1422.36]                            


Epoch #47: test_reward: 2825.639337 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #48: 1001it [00:15, 63.45it/s, alpha=0.035, env_step=48000, gradient_step=48000, len=616, loss/actor=-297.297, loss/alpha=0.359, loss/critic1=8.929, loss/critic2=9.056, n/ep=0, n/st=1, rew=2052.56]                           


Epoch #48: test_reward: 1410.756811 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #49: 1001it [00:15, 65.19it/s, alpha=0.037, env_step=49000, gradient_step=49000, len=621, loss/actor=-298.506, loss/alpha=0.618, loss/critic1=7.272, loss/critic2=7.432, n/ep=0, n/st=1, rew=2079.96]                           


Epoch #49: test_reward: 1217.864838 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #50: 1001it [00:15, 65.43it/s, alpha=0.037, env_step=50000, gradient_step=50000, len=458, loss/actor=-299.394, loss/alpha=-0.546, loss/critic1=7.359, loss/critic2=7.405, n/ep=0, n/st=1, rew=1470.23]                           


Epoch #50: test_reward: 1256.330944 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #51: 1001it [00:15, 65.30it/s, alpha=0.036, env_step=51000, gradient_step=51000, len=608, loss/actor=-301.301, loss/alpha=-0.172, loss/critic1=5.940, loss/critic2=5.773, n/ep=0, n/st=1, rew=2045.17]                           


Epoch #51: test_reward: 1739.912807 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #52: 1001it [00:15, 64.94it/s, alpha=0.036, env_step=52000, gradient_step=52000, len=383, loss/actor=-302.499, loss/alpha=-0.137, loss/critic1=5.886, loss/critic2=6.323, n/ep=0, n/st=1, rew=1247.30]                           


Epoch #52: test_reward: 3255.556129 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #53: 1001it [00:15, 63.81it/s, alpha=0.035, env_step=53000, gradient_step=53000, len=379, loss/actor=-303.822, loss/alpha=-0.721, loss/critic1=12.893, loss/critic2=12.607, n/ep=0, n/st=1, rew=1243.48]                          


Epoch #53: test_reward: 2444.425899 ± 0.000000, best_reward: 3276.466343 ± 0.000000 in #13


Epoch #54:  51%|#####1    | 514/1000 [00:08<00:08, 58.48it/s, alpha=0.034, env_step=53513, gradient_step=53513, len=1000, loss/actor=-303.479, loss/alpha=-0.610, loss/critic1=2.223, loss/critic2=2.781, n/ep=0, n/st=1, rew=3216.97] 


KeyboardInterrupt: 