In [1]:
import os
import argparse
import datetime

import torch
from torch import nn
import numpy as np
import gymnasium as gym

from tianshou.data import Collector, Batch, to_torch
from tianshou.data.types import RolloutBatchProtocol
from tianshou.data.buffer.vecbuf import VectorReplayBuffer, ReplayBuffer
from tianshou.env import SubprocVectorEnv
from tianshou.policy import SACPolicy, BasePolicy
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic
from examples.offline.utils import load_buffer_d4rl
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
from tianshou.trainer import OffpolicyTrainer

/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/glfw/__init__.py:916: GLFWError: (65544) b'X11: The DISPLAY environment variable is missing'
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 262, in register_gymnasium_envs
    _register_dm_control_envs()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 26, in _register_dm_control_envs
    from shimmy.dm_control_compatibility import DmControlCompatibilityV0
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/dm_control_compatibility.py", line 12, in <module>
    import dm_env
ModuleNotFoundError: No module named 'dm_env'
[0m
  logger.warn(f"plugin: {plugin.value} raised {traceback.format_exc()}")
No module named 'mjrl'
No module named 'flow'
No modu

In [2]:
# Torch device string read as a global by get_sac_args, load_policy, MixedPolicy and get_returns.
# Hard-coded to CUDA device index 3; adjust to match the host before running.
device = "cuda:3"

In [3]:
# Debug aid: display the mapping of registered environment ids to their EnvSpec.
gym.envs.registry

{'CartPole-v0': EnvSpec(id='CartPole-v0', entry_point='gymnasium.envs.classic_control.cartpole:CartPoleEnv', reward_threshold=195.0, nondeterministic=False, max_episode_steps=200, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={}, namespace=None, name='CartPole', version=0, additional_wrappers=(), vector_entry_point='gymnasium.envs.classic_control.cartpole:CartPoleVectorEnv'),
 'CartPole-v1': EnvSpec(id='CartPole-v1', entry_point='gymnasium.envs.classic_control.cartpole:CartPoleEnv', reward_threshold=475.0, nondeterministic=False, max_episode_steps=500, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={}, namespace=None, name='CartPole', version=1, additional_wrappers=(), vector_entry_point='gymnasium.envs.classic_control.cartpole:CartPoleVectorEnv'),
 'MountainCar-v0': EnvSpec(id='MountainCar-v0', entry_point='gymnasium.envs.classic_control.mountain_car:MountainCarEnv', reward_t

In [3]:
def get_sac_args():
    """Assemble the SAC hyper-parameters shared by the cells of this notebook.

    Returns:
        argparse.Namespace: a fresh namespace on every call. Its ``device``
        field is read from the notebook-level ``device`` variable at call time.
    """
    hyperparameters = {
        "task": "Hopper-v2",
        "buffer_size": 1000000,
        "hidden_sizes": [256, 256, 256],
        "actor_lr": 1e-4,
        "critic_lr": 3e-4,
        "gamma": 0.99,
        "tau": 0.005,
        "alpha_lr": 1e-4,
        "alpha": 0.36,
        "start_timesteps": 1,
        "epoch": 200,
        "step_per_epoch": 5000,
        "step_per_collect": 1,
        "update_per_step": 1,
        "batch_size": 256,
        "training_num": 1,
        "test_num": 10,
        "device": device,
        "norm_layer": True,
    }
    return argparse.Namespace(**hyperparameters)

In [4]:
# `args` is read as a global by later cells (reverse_action and the env factories).
args = get_sac_args()
# Single, non-vectorised environment; note that `env` is rebound by a later cell.
env = gym.make(args.task)

  logger.deprecation(
  logger.deprecation(


In [5]:
def load_policy(path):
    """Rebuild the SAC actor/critic architecture and load saved weights into it.

    The network layout is taken from ``get_sac_args()``, so the checkpoint must
    have been produced with the same layout (hidden sizes, LayerNorm in the
    critics only).

    Args:
        path: location of a state dict that ``torch.load`` can read.

    Returns:
        SACPolicy: the policy with the checkpoint loaded, on ``args.device``.
    """
    args = get_sac_args()

    # The environment is only needed for its spaces, so release it immediately
    # instead of leaking one simulator per loaded policy.
    env = gym.make(args.task)
    try:
        observation_space = env.observation_space
        action_space = env.action_space
    finally:
        env.close()

    args.state_shape = observation_space.shape or observation_space.n
    args.action_shape = action_space.shape or action_space.n
    args.max_action = action_space.high[0]

    critic_norm = nn.LayerNorm if args.norm_layer else None

    def build_critic_net():
        # Both critics share one layout: state and action concatenated at the input.
        return Net(
            args.state_shape,
            args.action_shape,
            hidden_sizes=args.hidden_sizes,
            concat=True,
            device=args.device,
            norm_layer=critic_norm,
        )

    # model
    net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = ActorProb(
        net_a,
        args.action_shape,
        device=args.device,
        unbounded=True,
        conditioned_sigma=True,
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c1 = build_critic_net()
    net_c2 = build_critic_net()
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    # Entropy temperature passed as a (target entropy, log-alpha, optimizer) triple.
    target_entropy = -np.prod(action_space.shape)
    # Use args.device like every other tensor here, not the module-level global.
    log_alpha = torch.tensor([np.log(args.alpha)], requires_grad=True, device=args.device)
    alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
    alpha = (target_entropy, log_alpha, alpha_optim)

    policy = SACPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        tau=args.tau,
        gamma=args.gamma,
        alpha=alpha,
        action_space=action_space,
    )
    policy.load_state_dict(torch.load(path, map_location=args.device))
    print("Loaded agent from: ", path)
    return policy

In [58]:
# Checkpoint from the "cql" log directory, loaded into the SAC architecture built by load_policy.
# NOTE(review): absolute, machine-specific path.
cql_policy = load_policy("/data/user/R901105/dev/log/Hopper-v2/cql/0/231220-111219/policy.pth")

Loaded agent from:  /data/user/R901105/dev/log/Hopper-v2/cql/0/231220-111219/policy.pth


In [6]:
# Checkpoint from the "sac" log directory; used as the expert policy in later cells.
# NOTE(review): absolute, machine-specific path.
sac_policy = load_policy("/data/user/R901105/dev/log/Hopper-v2/sac/0/231219-163624/policy.pth")

Loaded agent from:  /data/user/R901105/dev/log/Hopper-v2/sac/0/231219-163624/policy.pth


In [82]:
# NOTE(review): imported here rather than in the import cell at the top of the notebook.
from tianshou.env.gym_wrappers import ContinuousToDiscrete
# Grid points per action dimension; passed to ContinuousToDiscrete and read as a global by reverse_action.
num_actions = 10

# Rebinds `env` (previously the single env from gym.make) to 20 subprocess workers,
# each wrapping the task in ContinuousToDiscrete. Later cells that read `env` see this object.
env = SubprocVectorEnv([lambda: ContinuousToDiscrete(gym.make(args.task), num_actions) for _ in range(20)])

  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(
  logger.deprecation(


In [83]:
# Discretisation grids keyed by (task id, grid points per dimension), so the
# environment is instantiated once per configuration instead of once per call.
_ACTION_MESH_CACHE = {}


def _action_mesh(task, bins):
    """Return the (action_dim, bins) array of evenly spaced grid values for ``task``."""
    key = (task, bins)
    if key not in _ACTION_MESH_CACHE:
        probe_env = gym.make(task)
        try:
            low, high = probe_env.action_space.low, probe_env.action_space.high
        finally:
            # Only the bounds are needed; do not keep the simulator alive.
            probe_env.close()
        _ACTION_MESH_CACHE[key] = np.array(
            [np.linspace(lo, hi, bins) for lo, hi in zip(low, high)]
        )
    return _ACTION_MESH_CACHE[key]


def reverse_action(act: np.ndarray) -> np.ndarray:
    """Map continuous actions to the index of the nearest grid value per dimension.

    The grid has ``num_actions`` evenly spaced values between the action-space
    bounds of ``args.task`` (both read from notebook globals at call time).
    Presumably this mirrors the grid used by ``ContinuousToDiscrete`` -- verify
    against the installed tianshou version.

    Args:
        act: array or torch tensor of shape ``(action_dim,)`` or
            ``(batch, action_dim)``.

    Returns:
        Integer array with the same shape as ``act`` holding grid indices.
        Ties resolve to the lowest index.

    Raises:
        AssertionError: if ``act`` has more than two dimensions.
    """
    mesh = _action_mesh(args.task, num_actions)
    # If act is a PyTorch tensor, convert it to a numpy array
    if torch.is_tensor(act):
        act = act.detach().cpu().numpy()
    assert len(act.shape) <= 2, f"Unknown action format with shape {act.shape}."
    # Broadcast (..., action_dim, 1) against (action_dim, bins) and take the
    # closest grid value along the last axis.
    return np.abs(act[..., None] - mesh).argmin(axis=-1)

In [84]:
class SPIPolicy(BasePolicy):
    """Epsilon-random behaviour policy over discretised actions of a continuous policy.

    Each step the wrapped policy's continuous action is converted to grid
    indices with ``reverse_action``. With probability ``random_p`` per batch
    entry the whole action is replaced by uniformly random indices.

    Args:
        sac_policy: wrapped policy whose ``forward`` result has an ``act`` field.
        num_actions: number of discrete choices per action dimension.
        random_p: probability of replacing an entry's action with a random one.
    """

    def __init__(self, sac_policy, num_actions, random_p=0.25):
        super().__init__()
        self.sac_policy = sac_policy
        self.num_actions = num_actions
        self.random_p = random_p

    def forward(self, batch, state=None, **kwargs):
        """Return ``Batch(act=...)`` holding integer grid indices shaped like the wrapped action."""
        sac_act = self.sac_policy(batch, state, **kwargs).act
        sac_act = reverse_action(sac_act)
        # Draw an independent index for every action dimension. Assigning one
        # scalar per row would broadcast the same index over all dimensions and
        # confine random actions to the diagonal of the grid.
        random_act = np.random.choice(self.num_actions, size=sac_act.shape)
        # One exploration decision per entry along the first axis.
        explore = np.random.rand(len(sac_act)) <= self.random_p
        explore = explore.reshape((-1,) + (1,) * (sac_act.ndim - 1))
        act = np.where(explore, random_act, sac_act).astype(sac_act.dtype, copy=False)
        return Batch(act=act)

    def learn(self, batch, **kwargs):
        """No-op: this policy is only used to collect data."""
        return {}

    def train(self, mode: bool = True) -> "SPIPolicy":
        """Switch the wrapper and the wrapped policy between train and eval mode."""
        # Keep the wrapper's own flag in sync so `self.training` is truthful.
        self.training = mode
        self.sac_policy.train(mode)
        return self

In [90]:
# Behaviour policy for data collection: the SAC policy with a 5% chance of a random action per step.
spi_policy = SPIPolicy(sac_policy, num_actions, random_p=0.05)

In [93]:
# 20 matches the number of workers in the vector env built above (presumably one sub-buffer per worker).
# NOTE(review): the recorded collection below reports 454727 steps, more than the 100000 given here --
# confirm whether older transitions being overwritten is intended.
spi_buffer = VectorReplayBuffer(100000, 20)

In [94]:
# Roll out the epsilon-random policy in the discretised vector env and store transitions in spi_buffer.
spi_policy.eval()
spi_collector = Collector(spi_policy, env, spi_buffer)
spi_result = spi_collector.collect(n_episode=1000)
# Display the collection statistics.
spi_result

{'n/ep': 1000,
 'n/st': 454727,
 'rews': array([ 466.29402436,  534.00083536,  787.97912797,  751.56875933,
         829.85546261,  835.00265311,  967.63967629,  992.2478825 ,
        1007.60279812, 1300.96996581, 1334.51360064, 1320.56080531,
        1598.8445459 , 1636.69301338,  764.02490777, 1827.89017545,
        2018.69810302, 2013.78579056, 1097.94501355,  766.42761231,
        1017.996668  , 1398.07446859,  975.86828863, 1778.72981464,
        1978.67753744,  686.91904831, 1548.51003024,  746.29481742,
        1347.98738655, 3145.07010841, 3175.88056512, 2536.24148643,
        1730.43598641, 2165.30080499, 1956.68293232, 3524.01153751,
         730.58407188, 2644.70760943, 1736.08733795, 1977.63401709,
        1023.80944575, 1936.1691252 ,  843.53111992, 1676.21517354,
         690.46404973, 3549.79481835,  940.82862075, 1071.6732077 ,
         739.77473389,  834.6719128 , 2734.53733099, 3011.41920937,
        1585.02261023, 2827.40720643, 2388.54765838, 1934.41877052,
        

In [96]:
# Persist the collected transitions; the path is relative to the kernel's working directory.
spi_buffer.save_hdf5("spi_buffer.hdf5")

In [49]:
# 10 subprocess workers running the unwrapped (continuous-action) task; shared by the evaluation collectors below.
envs = SubprocVectorEnv([lambda: gym.make(args.task) for _ in range(10)])

In [50]:
# Evaluate the CQL checkpoint for 40 episodes; no buffer is passed to the collector.
cql_policy.eval()
cql_collector = Collector(cql_policy, envs)
cql_result = cql_collector.collect(n_episode=40)
# Display the evaluation statistics.
cql_result

{'n/ep': 40,
 'n/st': 28438,
 'rews': array([1875.527446  , 1985.02329145, 2076.8943044 , 2169.54324235,
        2220.29535078, 2354.63864142, 2632.09455398, 2773.12230279,
        2846.51336189, 3324.07538659, 1513.33676209, 1412.20228822,
        2114.93811745, 2183.43096061, 2097.08056415, 2691.77921814,
        1813.60546139, 2448.4467662 , 2040.81795527, 2051.75918271,
        3283.1869963 , 3282.86971813, 1459.23786367, 1888.89257348,
        2685.39992789, 1867.88289683, 2976.89779533, 3031.58666299,
        2150.55391171, 2744.34696107, 2319.66226352, 3271.27851469,
        1871.13751454, 2172.16562337, 2238.02661416, 2573.74728189,
        1605.1772629 , 2721.9337948 , 3075.98032085, 2501.72036928]),
 'lens': array([ 566,  605,  630,  653,  669,  709,  790,  832,  851, 1000,  460,
         439,  627,  648,  638,  809,  552,  738,  613,  618, 1000, 1000,
         446,  574,  807,  563,  890,  904,  645,  818,  690, 1000,  563,
         656,  673,  776,  491,  818,  919,  758]),

In [51]:
# Evaluate the SAC checkpoint for 40 episodes on the same vector env as the CQL evaluation.
sac_policy.eval()
sac_collector = Collector(sac_policy, envs)
sac_result = sac_collector.collect(n_episode=40)
# Display the evaluation statistics.
sac_result

{'n/ep': 40,
 'n/st': 39312,
 'rews': array([3490.20986974, 3684.70606198, 3591.47819272, 3597.37233087,
        3538.29855872, 3591.69464872, 3520.78922286, 3586.09690681,
        3588.96709079, 3575.0735309 , 2592.20752796, 3528.43318319,
        3657.0264336 , 3555.0225114 , 3555.33762397, 3567.77300397,
        3541.87593344, 3712.8363668 , 3585.36534082, 3566.03740618,
        3596.09974697, 3376.00707725, 3588.74366652, 3560.44204471,
        3554.94183537, 3548.74198787, 3578.99220746, 3556.44991665,
        3535.1903398 , 3556.28216648, 3566.60430196, 2529.72685258,
        3524.95554888, 3556.10858374, 3562.88478265, 3559.90796793,
        3591.3873296 , 3551.17067614, 3513.85777727, 3563.2662359 ]),
 'lens': array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  701,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  898,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  713, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000]),

In [81]:
from tianshou.data import ReplayBuffer
from tianshou.data.types import RolloutBatchProtocol


class MixedPolicy(BasePolicy):
    """Act with whichever of two actor-critic policies scores its own proposal higher.

    For every observation both policies propose an action. Each proposal is
    scored by the proposing policy's own ``critic1``/``critic2`` (element-wise
    minimum). The expert's action is used where the base policy's score is
    strictly lower; otherwise the base action is kept. ``learn`` and
    ``process_fn`` are delegated to the base policy only.

    Args:
        base_policy: policy that is trained and acts by default.
        expert_policy: policy that takes over when its score is higher.
        action_space: forwarded to ``BasePolicy`` together with ``action_scaling=True``.
        buffer: stored on the instance; not used by the methods defined here.
    """

    def __init__(self, base_policy: BasePolicy, expert_policy: BasePolicy, action_space, buffer):
        super().__init__(action_space=action_space, action_scaling=True)
        self.base_policy = base_policy
        self.expert_policy = expert_policy
        # NOTE: read from the notebook-level `device` global, not from an argument.
        self.device = device
        self.buffer = buffer

    def forward(self, batch: RolloutBatchProtocol, state=None, **kwargs):
        """Choose actions for ``batch`` and record where control was ceded.

        Returns:
            Batch with ``act`` (the selected actions) and ``policy.cede_ctrl``
            (boolean tensor, True where the expert's action was used).
        """
        batch = to_torch(batch, dtype=torch.float32, device=self.device)
        # Proposals are made in eval mode, but the previous modes must be
        # restored afterwards: leaving the base policy in eval mode would make
        # the following `learn` call run in eval mode as well.
        base_was_training = self.base_policy.training
        expert_was_training = self.expert_policy.training
        self.base_policy.eval()
        self.expert_policy.eval()
        try:
            with torch.no_grad():
                expert_result = self.expert_policy(batch)
                base_result = self.base_policy(batch)
                expert_qvalues = torch.minimum(
                    self.expert_policy.critic1(batch.obs, expert_result.act),
                    self.expert_policy.critic2(batch.obs, expert_result.act),
                )
                base_qvalues = torch.minimum(
                    self.base_policy.critic1(batch.obs, base_result.act),
                    self.base_policy.critic2(batch.obs, base_result.act),
                )
        finally:
            self.base_policy.train(base_was_training)
            self.expert_policy.train(expert_was_training)
        cede_ctrl = base_qvalues < expert_qvalues
        actions = torch.where(cede_ctrl, expert_result.act, base_result.act)
        return Batch(act=actions, policy=Batch(cede_ctrl=cede_ctrl))

    def train(self, mode: bool = True) -> "MixedPolicy":
        """Set the mode of the wrapper and the base policy; the expert's mode is left untouched."""
        # Keep the wrapper's own flag in sync so `self.training` is truthful.
        self.training = mode
        self.base_policy.train(mode)
        return self

    def process_fn(self, batch: RolloutBatchProtocol, buffer: ReplayBuffer, indices: np.ndarray) -> RolloutBatchProtocol:
        """Delegate batch pre-processing to the base policy."""
        return self.base_policy.process_fn(batch, buffer, indices)

    def learn(self, batch, **kwargs):
        """Update the base policy on the full batch and return its info.

        All transitions are used, including those where the expert acted.
        Filtering on ``batch.policy.cede_ctrl`` would restrict training to the
        base policy's own steps.
        """
        return self.base_policy.learn(batch)

In [11]:
# Small vector buffer handed to MixedPolicy and, after being re-created in a later cell, to the test collector.
test_buffer = VectorReplayBuffer(5000, 5)

In [82]:
# Take the action space from the loaded policy (load_policy passes the task's continuous
# space to SACPolicy) rather than from the global `env`, which an earlier cell rebinds to the
# discretised vector env and which would not be valid together with action_scaling=True.
mixed_policy = MixedPolicy(cql_policy, sac_policy, cql_policy.action_space, test_buffer)

In [12]:
# Evaluate the mixed (CQL base + SAC expert) policy for 40 episodes on the shared continuous vector env.
mixed_collector = Collector(mixed_policy, envs)
mixed_result = mixed_collector.collect(n_episode=40)
# Display the evaluation statistics.
mixed_result

{'n/ep': 40,
 'n/st': 39442,
 'rews': array([2877.85420345, 3579.38976417, 3566.2887685 , 3612.26257751,
        3648.73333248, 3655.45149677, 3559.89113674, 3552.6419363 ,
        3588.0665408 , 3578.10143437, 3576.98877615, 3561.46943533,
        3558.70658515, 3598.58161738, 3558.16991895, 3525.51417879,
        3579.98865515, 3558.74737203, 3518.58784384, 3641.33416581,
        3563.76800552, 3231.92112228, 3558.71492095, 3535.73037386,
        3558.96929922, 3583.61728139, 3556.20513785, 3542.30987626,
        3584.5683532 , 3573.77252436, 2903.35971768, 3574.50541953,
        3551.28549583, 3540.74278544, 3513.75433021, 3546.94601085,
        3569.82593565, 3520.37927402, 3546.47417846, 3562.03266267]),
 'lens': array([ 798, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  869,
        1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,  775, 1000, 1000,
        1000, 1000, 1000, 1000, 1000, 1000, 1000]),

In [59]:
class DeterministicPolicy(BasePolicy):
    """Wrapper that always selects actions with the wrapped policy in eval mode.

    Whether eval mode yields deterministic actions depends on the wrapped
    policy. ``learn`` and ``process_fn`` are delegated unchanged.

    Args:
        policy: the policy to wrap.
        action_space: forwarded to ``BasePolicy`` together with ``action_scaling=True``.
    """

    def __init__(self, policy, action_space):
        super().__init__(action_space=action_space, action_scaling=True)
        self.policy = policy

    def train(self, mode: bool = True) -> "DeterministicPolicy":
        """Switch the wrapper and the wrapped policy between train and eval mode."""
        # Keep the wrapper's own flag in sync so `self.training` is truthful.
        self.training = mode
        self.policy.train(mode)
        return self

    def forward(self, batch, state=None, **kwargs):
        """Run the wrapped policy on ``batch`` in eval mode and return its result."""
        # Restore the previous mode afterwards; otherwise the wrapped policy
        # would stay in eval mode for the following `learn` call.
        was_training = self.policy.training
        self.policy.eval()
        try:
            return self.policy(batch)
        finally:
            self.policy.train(was_training)

    def learn(self, batch, **kwargs):
        """Delegate the update step to the wrapped policy and return its info."""
        return self.policy.learn(batch)

    def process_fn(self, batch: RolloutBatchProtocol, buffer: ReplayBuffer, indices: np.ndarray) -> RolloutBatchProtocol:
        """Delegate batch pre-processing to the wrapped policy."""
        return self.policy.process_fn(batch, buffer, indices)

In [60]:
# Take the action space from the loaded policy (load_policy passes the task's continuous
# space to SACPolicy) rather than from the global `env`, which an earlier cell rebinds to the
# discretised vector env.
det_cql = DeterministicPolicy(cql_policy, cql_policy.action_space)

In [None]:
# Evaluate the eval-mode wrapper itself. Previously the collector was built from the raw
# cql_policy and the results were then taken from the older `cql_collector`, so `det_cql`
# was never exercised.
det_cql_collector = Collector(det_cql, envs)
det_cql_result = det_cql_collector.collect(n_episode=40)
det_cql_result

In [52]:
# 5 subprocess workers running the unwrapped task; used by the trainer's test collector.
test_envs = SubprocVectorEnv([lambda: gym.make(args.task) for _ in range(5)])

In [13]:
# Load the D4RL "hopper-medium-v2" dataset via the tianshou examples helper; used below as the training buffer.
offline_data = load_buffer_d4rl("hopper-medium-v2")

  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile: 100%|██████████| 21/21 [00:01<00:00, 16.48it/s]


In [53]:
# Alias, not a copy: training `policy` below updates `cql_policy` in place.
policy = cql_policy

In [54]:
test_buffer = VectorReplayBuffer(5000, 5)
# Train in a dedicated single continuous-action env. The global `env` is rebound by an
# earlier cell to a 20-worker vector env wrapped in ContinuousToDiscrete, which matches
# neither the continuous policy nor the offline buffer when the notebook runs top to bottom.
train_env = gym.make(args.task)
# New transitions are written into the offline dataset buffer.
train_collector = Collector(policy, train_env, offline_data)
test_collector = Collector(policy, test_envs, test_buffer)



In [55]:
# log
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
log_name = os.path.join(args.task, policy.__class__.__name__, now)
log_path = os.path.join("/data/user/R901105/dev/log", log_name)
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer)
print(log_path)

/data/user/R901105/dev/log/Hopper-v2/SACPolicy/240410-124843


In [15]:
def get_returns(policy, return_cede_ctrl=False, eval_env=None, n_episodes=5):
    """Roll out ``policy`` for a few episodes in a single (non-vectorised) env.

    Args:
        policy: policy to evaluate; it is switched to eval mode.
        return_cede_ctrl: if True, also average ``result.policy.cede_ctrl``
            over every step (requires a policy that emits it, e.g. MixedPolicy).
        eval_env: environment with the 5-tuple ``step`` API. Defaults to the
            notebook-level ``env``.
        n_episodes: number of episodes to run.

    Returns:
        tuple: ``(returns, cede_ctrl)`` where ``returns`` is an array of
        per-episode reward sums. ``cede_ctrl`` is the mean over episodes of the
        per-episode mean of ``cede_ctrl``, or ``None`` when
        ``return_cede_ctrl`` is False.
    """
    if eval_env is None:
        eval_env = env
    policy.eval()
    returns = []
    cede_ctrl = []
    for _ in range(n_episodes):
        obs, info = eval_env.reset()
        done = False
        cum_reward = 0
        cctrl_ep = []
        while not done:
            # Add a leading batch axis of size 1 before calling the policy.
            batch = Batch(obs=torch.from_numpy(np.expand_dims(obs, 0)).to(device), info=info)
            with torch.no_grad():
                result = policy(batch)
            act = result.act.cpu().squeeze().numpy()
            act = policy.map_action(act)
            if return_cede_ctrl:
                cctrl_ep.append(result.policy.cede_ctrl.cpu().squeeze().numpy())
            obs, reward, terminated, truncated, info = eval_env.step(act)
            cum_reward += reward
            done = terminated or truncated
        returns.append(cum_reward)
        if return_cede_ctrl:
            cede_ctrl.append(np.array(cctrl_ep).mean())
    if return_cede_ctrl:
        return np.array(returns), np.array(cede_ctrl).mean()
    # Previously the loop variable `_` (the last episode index) leaked out here.
    return np.array(returns), None

In [16]:
def test_fn(num_epoch: int, step_idx: int):
    """Trainer test hook: print the mean episode return and mean cede-control rate.

    Both arguments are accepted to match the hook signature and are not used.
    The evaluated policy is the notebook-level ``policy``.
    """
    episode_returns, ceded_fraction = get_returns(policy, return_cede_ctrl=True)
    print(episode_returns.mean(), ceded_fraction.mean())

In [56]:
# Online fine-tuning of `policy`: one env step and one gradient update per iteration,
# 1000 iterations per epoch. With episode_per_test=1 every test score comes from a single
# episode, which is why the recorded output reports a spread of 0.
# The recorded run was stopped by hand during epoch 49 (KeyboardInterrupt), so `result`
# was never assigned in that session.
result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    # test_fn=test_fn,
    max_epoch=200,
    step_per_epoch=1000,
    step_per_collect=1,
    episode_per_test=1,
    batch_size=256,
    logger=logger,
    update_per_step=1,
    test_in_train=False,
).run()

Epoch #1: 1001it [00:15, 65.10it/s, alpha=0.322, env_step=1000, gradient_step=1000, len=886, loss/actor=-265.550, loss/alpha=-1.427, loss/critic1=10.221, loss/critic2=10.537, n/ep=0, n/st=1, rew=2811.60]                          


Epoch #1: test_reward: 3260.866312 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #2: 1001it [00:15, 65.87it/s, alpha=0.289, env_step=2000, gradient_step=2000, len=611, loss/actor=-262.574, loss/alpha=-1.800, loss/critic1=5.124, loss/critic2=5.014, n/ep=0, n/st=1, rew=2081.29]                           


Epoch #2: test_reward: 1326.888789 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #3: 1001it [00:15, 65.20it/s, alpha=0.260, env_step=3000, gradient_step=3000, len=480, loss/actor=-260.373, loss/alpha=-2.026, loss/critic1=3.925, loss/critic2=3.763, n/ep=0, n/st=1, rew=1496.70]                            


Epoch #3: test_reward: 1898.176050 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #4: 1001it [00:15, 65.43it/s, alpha=0.235, env_step=4000, gradient_step=4000, len=1000, loss/actor=-260.371, loss/alpha=-2.095, loss/critic1=3.593, loss/critic2=3.786, n/ep=0, n/st=1, rew=3148.02]                          


Epoch #4: test_reward: 3177.207976 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #5: 1001it [00:15, 66.35it/s, alpha=0.213, env_step=5000, gradient_step=5000, len=1000, loss/actor=-260.375, loss/alpha=-2.271, loss/critic1=10.044, loss/critic2=9.954, n/ep=0, n/st=1, rew=3121.00]                          


Epoch #5: test_reward: 53.812679 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #6: 1001it [00:15, 65.27it/s, alpha=0.193, env_step=6000, gradient_step=6000, len=40, loss/actor=-259.185, loss/alpha=-2.268, loss/critic1=3.691, loss/critic2=3.877, n/ep=0, n/st=1, rew=61.01]                               


Epoch #6: test_reward: 3190.796839 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #7: 1001it [00:15, 65.12it/s, alpha=0.175, env_step=7000, gradient_step=7000, len=40, loss/actor=-259.779, loss/alpha=-2.382, loss/critic1=9.204, loss/critic2=9.187, n/ep=0, n/st=1, rew=60.68]                             


Epoch #7: test_reward: 2818.174612 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #8: 1001it [00:15, 65.13it/s, alpha=0.159, env_step=8000, gradient_step=8000, len=15, loss/actor=-259.874, loss/alpha=-2.250, loss/critic1=6.532, loss/critic2=6.595, n/ep=0, n/st=1, rew=15.43]                              


Epoch #8: test_reward: 3205.285848 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #9: 1001it [00:15, 63.57it/s, alpha=0.145, env_step=9000, gradient_step=9000, len=1000, loss/actor=-260.603, loss/alpha=-2.286, loss/critic1=8.661, loss/critic2=8.682, n/ep=0, n/st=1, rew=3099.25]                           


Epoch #9: test_reward: 93.568444 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #10: 1001it [00:16, 62.49it/s, alpha=0.133, env_step=10000, gradient_step=10000, len=1000, loss/actor=-260.871, loss/alpha=-2.051, loss/critic1=10.982, loss/critic2=11.041, n/ep=0, n/st=1, rew=3106.07]                          


Epoch #10: test_reward: 3200.761240 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #11: 1001it [00:15, 63.87it/s, alpha=0.120, env_step=11000, gradient_step=11000, len=545, loss/actor=-260.761, loss/alpha=-2.331, loss/critic1=8.719, loss/critic2=8.689, n/ep=0, n/st=1, rew=1809.51]                            


Epoch #11: test_reward: 3225.247934 ± 0.000000, best_reward: 3260.866312 ± 0.000000 in #1


Epoch #12: 1001it [00:15, 62.82it/s, alpha=0.109, env_step=12000, gradient_step=12000, len=1000, loss/actor=-260.911, loss/alpha=-2.420, loss/critic1=5.742, loss/critic2=5.691, n/ep=0, n/st=1, rew=3177.16]                           


Epoch #12: test_reward: 3307.211307 ± 0.000000, best_reward: 3307.211307 ± 0.000000 in #12


Epoch #13: 1001it [00:15, 63.60it/s, alpha=0.099, env_step=13000, gradient_step=13000, len=1000, loss/actor=-261.820, loss/alpha=-2.305, loss/critic1=1.885, loss/critic2=1.850, n/ep=0, n/st=1, rew=3242.19]                           


Epoch #13: test_reward: 3260.383498 ± 0.000000, best_reward: 3307.211307 ± 0.000000 in #12


Epoch #14: 1001it [00:15, 62.88it/s, alpha=0.091, env_step=14000, gradient_step=14000, len=48, loss/actor=-262.184, loss/alpha=-1.918, loss/critic1=10.660, loss/critic2=10.623, n/ep=0, n/st=1, rew=80.59]                             


Epoch #14: test_reward: 3329.365953 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #15: 1001it [00:15, 66.00it/s, alpha=0.083, env_step=15000, gradient_step=15000, len=487, loss/actor=-263.310, loss/alpha=-2.172, loss/critic1=4.752, loss/critic2=4.590, n/ep=0, n/st=1, rew=1583.51]                           


Epoch #15: test_reward: 1176.842563 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #16: 1001it [00:15, 65.61it/s, alpha=0.076, env_step=16000, gradient_step=16000, len=728, loss/actor=-263.784, loss/alpha=-2.006, loss/critic1=9.437, loss/critic2=9.470, n/ep=0, n/st=1, rew=2457.94]                           


Epoch #16: test_reward: 3183.859747 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #17: 1001it [00:15, 63.97it/s, alpha=0.069, env_step=17000, gradient_step=17000, len=322, loss/actor=-264.162, loss/alpha=-2.270, loss/critic1=11.497, loss/critic2=11.672, n/ep=0, n/st=1, rew=1026.24]                          


Epoch #17: test_reward: 1424.680497 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #18: 1001it [00:15, 65.54it/s, alpha=0.063, env_step=18000, gradient_step=18000, len=381, loss/actor=-264.941, loss/alpha=-1.701, loss/critic1=8.659, loss/critic2=8.703, n/ep=0, n/st=1, rew=1294.72]                           


Epoch #18: test_reward: 1171.396532 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #19: 1001it [00:15, 63.09it/s, alpha=0.059, env_step=19000, gradient_step=19000, len=393, loss/actor=-265.134, loss/alpha=-1.658, loss/critic1=8.378, loss/critic2=8.404, n/ep=0, n/st=1, rew=1324.98]                           


Epoch #19: test_reward: 3300.828020 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #20: 1001it [00:16, 59.31it/s, alpha=0.054, env_step=20000, gradient_step=20000, len=282, loss/actor=-265.586, loss/alpha=-1.247, loss/critic1=1.485, loss/critic2=1.415, n/ep=0, n/st=1, rew=904.67]                            


Epoch #20: test_reward: 3273.845159 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #21: 1001it [00:16, 61.61it/s, alpha=0.050, env_step=21000, gradient_step=21000, len=893, loss/actor=-266.560, loss/alpha=-1.436, loss/critic1=5.132, loss/critic2=5.044, n/ep=0, n/st=1, rew=2993.57]                           


Epoch #21: test_reward: 3312.367075 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #22: 1001it [00:15, 64.61it/s, alpha=0.047, env_step=22000, gradient_step=22000, len=1000, loss/actor=-267.117, loss/alpha=-0.441, loss/critic1=15.037, loss/critic2=15.056, n/ep=0, n/st=1, rew=3281.39]                          


Epoch #22: test_reward: 3217.425750 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #23: 1001it [00:15, 64.85it/s, alpha=0.045, env_step=23000, gradient_step=23000, len=398, loss/actor=-268.530, loss/alpha=-0.625, loss/critic1=1.309, loss/critic2=1.254, n/ep=0, n/st=1, rew=1298.35]                           


Epoch #23: test_reward: 3200.088950 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #24: 1001it [00:15, 64.13it/s, alpha=0.045, env_step=24000, gradient_step=24000, len=410, loss/actor=-269.780, loss/alpha=-0.206, loss/critic1=7.336, loss/critic2=7.449, n/ep=0, n/st=1, rew=1340.57]                           


Epoch #24: test_reward: 732.123650 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #25: 1001it [00:15, 64.68it/s, alpha=0.042, env_step=25000, gradient_step=25000, len=487, loss/actor=-270.895, loss/alpha=-0.280, loss/critic1=1.390, loss/critic2=1.472, n/ep=0, n/st=1, rew=1639.17]                           


Epoch #25: test_reward: 2438.450644 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #26: 1001it [00:15, 62.87it/s, alpha=0.040, env_step=26000, gradient_step=26000, len=473, loss/actor=-271.565, loss/alpha=0.088, loss/critic1=1.906, loss/critic2=2.006, n/ep=0, n/st=1, rew=1552.92]                            


Epoch #26: test_reward: 1262.026932 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #27: 1001it [00:15, 64.69it/s, alpha=0.039, env_step=27000, gradient_step=27000, len=149, loss/actor=-271.593, loss/alpha=0.456, loss/critic1=7.759, loss/critic2=7.688, n/ep=0, n/st=1, rew=357.85]                             


Epoch #27: test_reward: 3273.691890 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #28: 1001it [00:15, 63.13it/s, alpha=0.039, env_step=28000, gradient_step=28000, len=769, loss/actor=-273.016, loss/alpha=-0.258, loss/critic1=5.192, loss/critic2=5.241, n/ep=0, n/st=1, rew=2522.71]                           


Epoch #28: test_reward: 3240.383346 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #29: 1001it [00:15, 65.11it/s, alpha=0.039, env_step=29000, gradient_step=29000, len=347, loss/actor=-274.026, loss/alpha=0.828, loss/critic1=5.917, loss/critic2=5.727, n/ep=0, n/st=1, rew=1094.30]                            


Epoch #29: test_reward: 1415.249577 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #30: 1001it [00:15, 64.50it/s, alpha=0.039, env_step=30000, gradient_step=30000, len=682, loss/actor=-274.763, loss/alpha=-0.195, loss/critic1=3.668, loss/critic2=3.385, n/ep=0, n/st=1, rew=2226.31]                          


Epoch #30: test_reward: 3144.218099 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #31: 1001it [00:15, 65.39it/s, alpha=0.039, env_step=31000, gradient_step=31000, len=415, loss/actor=-276.024, loss/alpha=-0.080, loss/critic1=12.008, loss/critic2=12.094, n/ep=0, n/st=1, rew=1257.26]                          


Epoch #31: test_reward: 845.686400 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #32: 1001it [00:15, 64.41it/s, alpha=0.037, env_step=32000, gradient_step=32000, len=256, loss/actor=-275.299, loss/alpha=-0.339, loss/critic1=8.891, loss/critic2=9.168, n/ep=0, n/st=1, rew=739.51]                            


Epoch #32: test_reward: 1338.828161 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #33: 1001it [00:15, 65.09it/s, alpha=0.037, env_step=33000, gradient_step=33000, len=234, loss/actor=-276.460, loss/alpha=-0.438, loss/critic1=2.755, loss/critic2=2.798, n/ep=0, n/st=1, rew=658.83]                            


Epoch #33: test_reward: 487.291959 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #34: 1001it [00:15, 64.59it/s, alpha=0.036, env_step=34000, gradient_step=34000, len=315, loss/actor=-276.764, loss/alpha=-0.594, loss/critic1=1.260, loss/critic2=1.232, n/ep=0, n/st=1, rew=1030.06]                           


Epoch #34: test_reward: 1292.134359 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #35: 1001it [00:15, 64.69it/s, alpha=0.035, env_step=35000, gradient_step=35000, len=177, loss/actor=-277.023, loss/alpha=-0.319, loss/critic1=1.433, loss/critic2=1.318, n/ep=0, n/st=1, rew=447.28]                            


Epoch #35: test_reward: 1299.054152 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #36: 1001it [00:15, 64.16it/s, alpha=0.034, env_step=36000, gradient_step=36000, len=408, loss/actor=-278.689, loss/alpha=0.449, loss/critic1=10.685, loss/critic2=10.821, n/ep=0, n/st=1, rew=1387.07]                          


Epoch #36: test_reward: 1043.972100 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #37: 1001it [00:15, 65.97it/s, alpha=0.033, env_step=37000, gradient_step=37000, len=555, loss/actor=-277.856, loss/alpha=-0.600, loss/critic1=4.822, loss/critic2=4.600, n/ep=0, n/st=1, rew=1813.91]                           


Epoch #37: test_reward: 2093.490821 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #38: 1001it [00:15, 65.59it/s, alpha=0.032, env_step=38000, gradient_step=38000, len=645, loss/actor=-279.103, loss/alpha=-0.622, loss/critic1=4.356, loss/critic2=4.405, n/ep=0, n/st=1, rew=2143.51]                           


Epoch #38: test_reward: 442.078345 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #39: 1001it [00:15, 65.19it/s, alpha=0.032, env_step=39000, gradient_step=39000, len=1000, loss/actor=-280.294, loss/alpha=-0.123, loss/critic1=1.419, loss/critic2=1.382, n/ep=0, n/st=1, rew=3147.79]                          


Epoch #39: test_reward: 274.839154 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #40: 1001it [00:15, 64.60it/s, alpha=0.033, env_step=40000, gradient_step=40000, len=71, loss/actor=-280.795, loss/alpha=0.029, loss/critic1=5.008, loss/critic2=5.015, n/ep=0, n/st=1, rew=126.99]                             


Epoch #40: test_reward: 3261.442432 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #41: 1001it [00:15, 64.88it/s, alpha=0.033, env_step=41000, gradient_step=41000, len=454, loss/actor=-281.532, loss/alpha=-0.205, loss/critic1=1.790, loss/critic2=1.591, n/ep=0, n/st=1, rew=1495.74]                           


Epoch #41: test_reward: 440.107579 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #42: 1001it [00:15, 64.60it/s, alpha=0.034, env_step=42000, gradient_step=42000, len=630, loss/actor=-281.022, loss/alpha=-0.223, loss/critic1=6.039, loss/critic2=5.964, n/ep=0, n/st=1, rew=2058.42]                           


Epoch #42: test_reward: 1227.332181 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #43: 1001it [00:15, 64.54it/s, alpha=0.035, env_step=43000, gradient_step=43000, len=110, loss/actor=-282.164, loss/alpha=-0.225, loss/critic1=9.286, loss/critic2=9.223, n/ep=0, n/st=1, rew=240.18]                           


Epoch #43: test_reward: 309.013252 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #44: 1001it [00:15, 64.44it/s, alpha=0.034, env_step=44000, gradient_step=44000, len=1000, loss/actor=-283.109, loss/alpha=0.165, loss/critic1=4.522, loss/critic2=4.326, n/ep=0, n/st=1, rew=3251.19]                          


Epoch #44: test_reward: 273.888413 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #45: 1001it [00:15, 64.44it/s, alpha=0.034, env_step=45000, gradient_step=45000, len=125, loss/actor=-282.670, loss/alpha=-0.701, loss/critic1=5.531, loss/critic2=5.238, n/ep=0, n/st=1, rew=258.46]                             


Epoch #45: test_reward: 1453.622562 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #46: 1001it [00:15, 64.99it/s, alpha=0.033, env_step=46000, gradient_step=46000, len=133, loss/actor=-283.188, loss/alpha=-0.569, loss/critic1=12.549, loss/critic2=12.494, n/ep=0, n/st=1, rew=289.48]                          


Epoch #46: test_reward: 404.302935 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #47: 1001it [00:15, 64.57it/s, alpha=0.033, env_step=47000, gradient_step=47000, len=126, loss/actor=-284.347, loss/alpha=-0.333, loss/critic1=5.361, loss/critic2=5.421, n/ep=0, n/st=1, rew=272.86]                            


Epoch #47: test_reward: 1156.083544 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #48: 1001it [00:15, 64.77it/s, alpha=0.033, env_step=48000, gradient_step=48000, len=170, loss/actor=-283.064, loss/alpha=0.107, loss/critic1=12.433, loss/critic2=12.526, n/ep=0, n/st=1, rew=419.88]                           


Epoch #48: test_reward: 424.777489 ± 0.000000, best_reward: 3329.365953 ± 0.000000 in #14


Epoch #49:  16%|#6        | 160/1000 [00:02<00:12, 64.64it/s, alpha=0.033, env_step=48160, gradient_step=48160, len=170, loss/actor=-284.213, loss/alpha=0.576, loss/critic1=21.312, loss/critic2=21.094, n/ep=0, n/st=1, rew=419.88]


KeyboardInterrupt: 