In [1]:
import re
import os
import argparse
import datetime
from types import SimpleNamespace

import torch
from torch import nn
import torch.optim as optim
import numpy as np
import gymnasium as gym

from tianshou.data import Collector, Batch, to_torch
from tianshou.data.types import RolloutBatchProtocol
from tianshou.data.buffer.vecbuf import VectorReplayBuffer, ReplayBuffer
from tianshou.env import SubprocVectorEnv
from tianshou.policy import RBVEPolicy, BasePolicy
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic
from examples.offline.utils import load_buffer_d4rl
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
from tianshou.trainer import OffpolicyTrainer
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/glfw/__init__.py:916: GLFWError: (65544) b'X11: The DISPLAY environment variable is missing'
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 262, in register_gymnasium_envs
    _register_dm_control_envs()
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/registration.py", line 26, in _register_dm_control_envs
    from shimmy.dm_control_compatibility import DmControlCompatibilityV0
  File "/data/user/R901105/.conda/envs/dev/lib/python3.11/site-packages/shimmy/dm_control_compatibility.py", line 12, in <module>
    import dm_env
ModuleNotFoundError: No module named 'dm_env'
[0m
  logger.warn(f"plugin: {plugin.value} raised {traceback.format_exc()}")
No module named 'mjrl'
No module named 'flow'
No modu

In [2]:
device = "cuda:3"

In [3]:
def parse_value(value):
    # Convert simple types (int, float, bool, None)
    if value.isdigit():
        return int(value)
    elif re.match(r'^\d+\.\d+$', value):
        return float(value)
    elif value == "True":
        return True
    elif value == "False":
        return False
    elif value == "None":
        return None
    elif value.startswith("[") and value.endswith("]"):
        # Convert the list items
        items = re.split(r',(?=[^\]]*(?:\[|$))', value[1:-1])
        return [parse_value(item.strip()) for item in items]
    elif value.startswith("(") and value.endswith(")"):
        # Convert the tuple items
        items = re.split(r',(?=[^\)]*(?:\(|$))', value[1:-1])
        # Special case for single-item tuple
        if len(items) == 2 and items[0].strip() != '':
            return (parse_value(items[0].strip()),)
        return tuple(parse_value(item.strip()) for item in items)
    elif value.startswith("'") and value.endswith("'"):
        return value[1:-1]
    # Else, return the value as-is
    return value

def get_args(event_file):
    ea = EventAccumulator(event_file)
    ea.Reload()  # Load the file
    # Get the text data
    texts = ea.Tags()["tensors"]
    # Extract the actual text content
    text_data = {}
    for tag in texts:
        events = ea.Tensors(tag)
        for event in events:
            # You can extract the wall_time and step if needed
            # wall_time, step, value = event.wall_time, event.step, event.text
            text_data[tag] = event.tensor_proto.string_val
    data = text_data['args/text_summary'][0]
    # Convert bytes to string
    data_str = data.decode('utf-8')
    # Remove the "Namespace(" prefix and the trailing ")"
    data_str = data_str[len("Namespace("):-1]
    # Split into key-value pairs
    key_values = re.split(r',(?=\s\w+=)', data_str)
    # Parse each key-value pair
    args_dict = {}
    for kv in key_values:
        key, value = kv.split('=', 1)
        key = key.strip()
        args_dict[key] = parse_value(value)
    args = SimpleNamespace(**args_dict)
    return args

In [4]:
log_path = "/data/user/R901105/dev/log/Hopper-v2/rbve/1/240502-103556"
files = os.listdir(log_path)
event_file = [f for f in files if f.startswith('event')][0]
full_path = os.path.join(log_path, event_file)
args = get_args(full_path)
args

namespace(task='Hopper-v2',
          seed=1,
          expert_data_task='hopper-medium-v2',
          ac_path='/data/user/R901105/dev/log/Hopper-v2/action_classification/0/240427-225051/model.pt',
          ac_hidden_sizes=[512, 512, 512],
          buffer_size=1000000,
          hidden_sizes=[256, 256, 256],
          actor_lr=0.0001,
          critic_lr=0.0003,
          q_marge=5.0,
          alpha='(-3, tensor([0.]',
          device='cuda:1',
          requires_grad='True), Adam (\nParameter Group 0\n    amsgrad: False\n    betas: (0.9, 0.999)\n    capturable: False\n    differentiable: False\n    eps: 1e-08\n    foreach: None\n    fused: None\n    lr: 0.0001\n    maximize: False\n    weight_decay: 0\n))',
          auto_alpha=True,
          alpha_lr=0.0001,
          cql_alpha_lr=0.0003,
          start_timesteps=10000,
          epoch=1000,
          step_per_epoch=5000,
          batch_size=256,
          norm_layer=True,
          tau=0.005,
          temperature=1.0,
      

In [5]:
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_sizes):
        super(MyModel, self).__init__()
        self.input_layer = nn.Linear(input_size, hidden_sizes[0])
        self.input_norm = nn.LayerNorm(hidden_sizes[0])
        self.hidden_layers = nn.ModuleList([
            nn.Linear(hidden_sizes[i], hidden_sizes[i+1])
            for i in range(len(hidden_sizes) - 1)
        ])
        self.hidden_norms = nn.ModuleList([
            nn.LayerNorm(hidden_sizes[i+1])
            for i in range(len(hidden_sizes) - 1)
        ])
        self.output_layer = nn.Linear(hidden_sizes[-1], 1)  # 1 output for energy score
    
    def forward(self, x):
        x = self.input_layer(x)
        x = self.input_norm(x)
        x = torch.relu(x)
        for hidden_layer, hidden_norm in zip(self.hidden_layers, self.hidden_norms):
            x = hidden_layer(x)
            x = hidden_norm(x)
            x = torch.relu(x)
        output = self.output_layer(x)
        return torch.sigmoid(output)

In [6]:
def load_policy(path, alpha):
    env = gym.make(args.task)

    # action classifier
    id_model = MyModel(args.state_dim + args.action_dim, args.ac_hidden_sizes).to(device)
    id_model.load_state_dict(torch.load(args.ac_path))
    print("Loaded AC model from:", args.ac_path)

    # actor network
    net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=device)
    actor = ActorProb(
        net_a,
        args.action_shape,
        device=device,
        unbounded=True,
        conditioned_sigma=True,
    ).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    # critic network
    net_c1 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=device,
        norm_layer=nn.LayerNorm if args.norm_layer else None
    )
    net_c2 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=device,
        norm_layer=nn.LayerNorm if args.norm_layer else None
    )
    critic1 = Critic(net_c1, device=device).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=device).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    if args.auto_alpha:
        target_entropy = -np.prod(env.action_space.shape)
        log_alpha = torch.tensor([np.log(alpha)], requires_grad=True, device=device, dtype=torch.float32)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        args.alpha = (target_entropy, log_alpha, alpha_optim)

    policy = RBVEPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        id_model=id_model,
        action_space=env.action_space,
        cql_alpha_lr=args.cql_alpha_lr,
        cql_weight=args.cql_weight,
        tau=args.tau,
        gamma=args.gamma,
        alpha=args.alpha,
        temperature=args.temperature,
        with_lagrange=args.with_lagrange,
        lagrange_threshold=args.lagrange_threshold,
        min_action=np.min(env.action_space.low),
        max_action=np.max(env.action_space.high),
        calibrated=args.calibrated,
        q_marge=args.q_marge,
        device=device,
    )

    # load policy
    policy.load_state_dict(torch.load(path, map_location=device))
    print("Loaded agent from: ", path)

    return policy

In [7]:
offline_policy = load_policy("/data/user/R901105/dev/log/Hopper-v2/rbve/0/240430-111233/policy.pth", alpha=0.014)

  logger.deprecation(
  logger.deprecation(


Loaded AC model from: /data/user/R901105/dev/log/Hopper-v2/action_classification/0/240427-225051/model.pt
Loaded agent from:  /data/user/R901105/dev/log/Hopper-v2/rbve/0/240430-111233/policy.pth


In [8]:
offline_data = load_buffer_d4rl(args.expert_data_task)

  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile:   0%|          | 0/21 [00:00<?, ?it/s]

load datafile: 100%|██████████| 21/21 [00:01<00:00, 17.08it/s]


In [9]:
env = gym.make(args.task)

  logger.deprecation(
  logger.deprecation(


In [10]:
class OnlineRBVEPolicy(BasePolicy):
    def __init__(self, policy: RBVEPolicy, action_space):
        super().__init__(action_space=action_space, action_scaling=True)
        self.policy = policy
        self.action_classifier = policy.id_model
        self.ac_loss_fn = nn.BCELoss()
        self.ac_optimiser = optim.SGD(self.action_classifier.parameters(), lr=1e-3)

    def train(self, mode: bool = True) -> "OnlineRBVEPolicy":
        self.policy.train(mode)
        return self
    
    def forward(self, batch: RolloutBatchProtocol, state=None, **kwargs):
        return self.policy(batch, state, **kwargs)
    
    def process_fn(self, batch: RolloutBatchProtocol, buffer: ReplayBuffer, indices: np.ndarray) -> RolloutBatchProtocol:
        return self.policy.process_fn(batch, buffer, indices)

    def get_negatives(self, batch: RolloutBatchProtocol):
        rand_actions = np.random.uniform(low=env.action_space.low, high=env.action_space.high, size=(len(batch), env.action_space.shape[0])).astype(np.float32) 
        return torch.cat([batch.obs, torch.tensor(rand_actions, dtype=torch.float32, device=device)], dim=1)   

    def train_ac(self, batch: RolloutBatchProtocol):
        batch = to_torch(batch, dtype=torch.float32, device=device)
        # positive samples
        X = torch.cat([batch.obs, batch.act], dim=1)
        y = torch.ones(len(batch), 1).to(device)
        # negative samples
        X_neg = self.get_negatives(batch)
        y_neg = torch.zeros(len(batch), 1).to(device)
        # combine and shuffle
        X = torch.cat([X, X_neg], dim=0)
        y = torch.cat([y, y_neg], dim=0)
        idx = torch.randperm(len(X))
        X = X[idx]
        y = y[idx]
        pred = self.action_classifier(X)
        loss = self.ac_loss_fn(pred, y)
        self.ac_optimiser.zero_grad()
        loss.backward()
        self.ac_optimiser.step()
        return loss.item()

    def learn(self, batch, **kwargs):
        info = self.policy.learn(batch)
        info["ac_loss"] = self.train_ac(batch)
        return info

In [11]:
policy = OnlineRBVEPolicy(offline_policy, env.action_space)

In [12]:
test_buffer = ReplayBuffer(5000)
train_collector = Collector(policy, env, offline_data)
test_collector = Collector(policy, env, test_buffer)



In [13]:
test_collector.collect(n_episode=5)

{'n/ep': 5,
 'n/st': 4288,
 'rews': array([3239.2962205 , 2347.45773982, 3240.77453363, 3238.72680753,
        1899.92545403]),
 'lens': array([1000,  706, 1000, 1000,  582]),
 'idxs': array([   0, 1000, 1706, 2706, 3706]),
 'rew': 2793.2361511012828,
 'len': 857.6,
 'rew_std': 564.7024894183991,
 'len_std': 178.75748935359323}

In [14]:
# log
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
log_name = os.path.join(args.task, "onlinerbve", now)
log_path = os.path.join("../../log", log_name)
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer)

In [15]:
from tianshou.trainer import OffpolicyTrainer

result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=200,
    step_per_epoch=500,
    step_per_collect=1,
    episode_per_test=1,
    batch_size=512,
    logger=logger,
    update_per_step=1,
    test_in_train=False,
).run()

Epoch #1: 501it [00:20, 24.34it/s, ac_loss=0.144, alpha=0.015, dataset_in_dist=467.151, env_step=500, gradient_step=500, in_dist=4183.034, len=0, loss/actor=-250.128, loss/alpha=1.340, loss/cql1=0.434, loss/cql2=0.436, loss/critic1=0.959, loss/critic2=0.975, n/ep=0, n/st=1, q_dataset=250.192, rew=0.00]                          


Epoch #1: test_reward: 3335.694768 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #2: 501it [00:20, 24.37it/s, ac_loss=0.144, alpha=0.015, dataset_in_dist=467.155, env_step=1000, gradient_step=1000, in_dist=4153.963, len=0, loss/actor=-250.263, loss/alpha=1.346, loss/cql1=0.448, loss/cql2=0.452, loss/critic1=0.967, loss/critic2=0.951, n/ep=0, n/st=1, q_dataset=250.302, rew=0.00]                         


Epoch #2: test_reward: 1062.016183 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #3: 501it [00:20, 24.17it/s, ac_loss=0.144, alpha=0.016, dataset_in_dist=467.362, env_step=1500, gradient_step=1500, in_dist=4125.892, len=0, loss/actor=-250.679, loss/alpha=1.372, loss/cql1=0.460, loss/cql2=0.464, loss/critic1=0.914, loss/critic2=0.948, n/ep=0, n/st=1, q_dataset=250.718, rew=0.00]                         


Epoch #3: test_reward: 1037.722219 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #4: 501it [00:20, 24.30it/s, ac_loss=0.145, alpha=0.017, dataset_in_dist=467.093, env_step=2000, gradient_step=2000, in_dist=4061.816, len=0, loss/actor=-251.414, loss/alpha=1.372, loss/cql1=0.489, loss/cql2=0.494, loss/critic1=1.010, loss/critic2=1.020, n/ep=0, n/st=1, q_dataset=251.474, rew=0.00]                         


Epoch #4: test_reward: 3280.596439 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #5: 501it [00:20, 24.70it/s, ac_loss=0.147, alpha=0.018, dataset_in_dist=466.787, env_step=2500, gradient_step=2500, in_dist=4062.020, len=0, loss/actor=-250.955, loss/alpha=1.024, loss/cql1=0.504, loss/cql2=0.507, loss/critic1=0.991, loss/critic2=0.990, n/ep=0, n/st=1, q_dataset=251.030, rew=0.00]                         


Epoch #5: test_reward: 3285.708384 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #6: 501it [00:20, 24.42it/s, ac_loss=0.142, alpha=0.018, dataset_in_dist=467.519, env_step=3000, gradient_step=3000, in_dist=4058.037, len=0, loss/actor=-251.561, loss/alpha=0.786, loss/cql1=0.522, loss/cql2=0.523, loss/critic1=0.994, loss/critic2=0.942, n/ep=0, n/st=1, q_dataset=251.606, rew=0.00]                         


Epoch #6: test_reward: 1904.520977 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #7: 501it [00:20, 24.26it/s, ac_loss=0.143, alpha=0.019, dataset_in_dist=467.362, env_step=3500, gradient_step=3500, in_dist=4069.726, len=0, loss/actor=-251.587, loss/alpha=0.779, loss/cql1=0.512, loss/cql2=0.513, loss/critic1=0.911, loss/critic2=0.902, n/ep=0, n/st=1, q_dataset=251.644, rew=0.00]                         


Epoch #7: test_reward: 3298.508178 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #8: 501it [00:20, 24.51it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=466.897, env_step=4000, gradient_step=4000, in_dist=4064.092, len=3979, loss/actor=-252.251, loss/alpha=0.571, loss/cql1=0.526, loss/cql2=0.528, loss/critic1=0.883, loss/critic2=0.928, n/ep=0, n/st=1, q_dataset=252.299, rew=12770.51]                         


Epoch #8: test_reward: 1917.384058 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #9: 501it [00:20, 24.74it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.085, env_step=4500, gradient_step=4500, in_dist=4037.875, len=3979, loss/actor=-252.345, loss/alpha=0.683, loss/cql1=0.528, loss/cql2=0.531, loss/critic1=0.897, loss/critic2=0.923, n/ep=0, n/st=1, q_dataset=252.401, rew=12770.51]                         


Epoch #9: test_reward: 3304.478192 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #10: 501it [00:20, 24.32it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=466.950, env_step=5000, gradient_step=5000, in_dist=4041.900, len=3979, loss/actor=-252.377, loss/alpha=0.359, loss/cql1=0.542, loss/cql2=0.542, loss/critic1=0.879, loss/critic2=0.906, n/ep=0, n/st=1, q_dataset=252.447, rew=12770.51]                         


Epoch #10: test_reward: 3286.419061 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #11: 501it [00:21, 23.62it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.968, env_step=5500, gradient_step=5500, in_dist=4064.118, len=3979, loss/actor=-253.297, loss/alpha=0.144, loss/cql1=0.534, loss/cql2=0.538, loss/critic1=0.792, loss/critic2=0.798, n/ep=0, n/st=1, q_dataset=253.366, rew=12770.51]                         


Epoch #11: test_reward: 3291.761163 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #12: 501it [00:20, 24.42it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.738, env_step=6000, gradient_step=6000, in_dist=4018.950, len=3979, loss/actor=-253.366, loss/alpha=0.015, loss/cql1=0.564, loss/cql2=0.567, loss/critic1=0.927, loss/critic2=0.974, n/ep=0, n/st=1, q_dataset=253.438, rew=12770.51]                          


Epoch #12: test_reward: 3311.073419 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #13: 501it [00:19, 25.08it/s, ac_loss=0.143, alpha=0.021, dataset_in_dist=467.694, env_step=6500, gradient_step=6500, in_dist=4030.881, len=3979, loss/actor=-253.666, loss/alpha=0.081, loss/cql1=0.555, loss/cql2=0.557, loss/critic1=0.844, loss/critic2=0.877, n/ep=0, n/st=1, q_dataset=253.735, rew=12770.51]                          


Epoch #13: test_reward: 3298.602396 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #14: 501it [00:19, 25.16it/s, ac_loss=0.143, alpha=0.021, dataset_in_dist=467.411, env_step=7000, gradient_step=7000, in_dist=4020.737, len=3979, loss/actor=-253.664, loss/alpha=0.035, loss/cql1=0.565, loss/cql2=0.566, loss/critic1=0.854, loss/critic2=0.858, n/ep=0, n/st=1, q_dataset=253.725, rew=12770.51]                          


Epoch #14: test_reward: 3284.571834 ± 0.000000, best_reward: 3335.694768 ± 0.000000 in #1


Epoch #15: 501it [00:19, 25.58it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.198, env_step=7500, gradient_step=7500, in_dist=3982.716, len=3979, loss/actor=-254.237, loss/alpha=0.042, loss/cql1=0.585, loss/cql2=0.587, loss/critic1=1.130, loss/critic2=1.146, n/ep=0, n/st=1, q_dataset=254.288, rew=12770.51]                         


Epoch #15: test_reward: 3341.361087 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #16: 501it [00:19, 25.69it/s, ac_loss=0.145, alpha=0.022, dataset_in_dist=466.908, env_step=8000, gradient_step=8000, in_dist=4012.884, len=3979, loss/actor=-254.105, loss/alpha=0.019, loss/cql1=0.570, loss/cql2=0.572, loss/critic1=0.930, loss/critic2=0.933, n/ep=0, n/st=1, q_dataset=254.160, rew=12770.51]                          


Epoch #16: test_reward: 3300.429738 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #17: 501it [00:19, 25.53it/s, ac_loss=0.146, alpha=0.022, dataset_in_dist=466.992, env_step=8500, gradient_step=8500, in_dist=4016.288, len=3979, loss/actor=-254.507, loss/alpha=-0.033, loss/cql1=0.571, loss/cql2=0.572, loss/critic1=0.919, loss/critic2=0.859, n/ep=0, n/st=1, q_dataset=254.564, rew=12770.51]                         


Epoch #17: test_reward: 3268.985031 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #18: 501it [00:19, 25.49it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.385, env_step=9000, gradient_step=9000, in_dist=4012.351, len=3979, loss/actor=-254.193, loss/alpha=-0.064, loss/cql1=0.574, loss/cql2=0.576, loss/critic1=0.900, loss/critic2=0.896, n/ep=0, n/st=1, q_dataset=254.246, rew=12770.51]                         


Epoch #18: test_reward: 3304.008350 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #19: 501it [00:19, 25.60it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.656, env_step=9500, gradient_step=9500, in_dist=4022.876, len=3979, loss/actor=-255.022, loss/alpha=0.067, loss/cql1=0.561, loss/cql2=0.562, loss/critic1=1.238, loss/critic2=1.275, n/ep=0, n/st=1, q_dataset=255.080, rew=12770.51]                          


Epoch #19: test_reward: 3303.104704 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #20: 501it [00:19, 25.80it/s, ac_loss=0.143, alpha=0.022, dataset_in_dist=467.387, env_step=10000, gradient_step=10000, in_dist=4026.548, len=3979, loss/actor=-255.406, loss/alpha=0.118, loss/cql1=0.560, loss/cql2=0.562, loss/critic1=0.958, loss/critic2=0.938, n/ep=0, n/st=1, q_dataset=255.468, rew=12770.51]                         


Epoch #20: test_reward: 3317.650493 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #21: 501it [00:19, 25.82it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.765, env_step=10500, gradient_step=10500, in_dist=4021.835, len=3979, loss/actor=-255.565, loss/alpha=0.016, loss/cql1=0.568, loss/cql2=0.569, loss/critic1=0.838, loss/critic2=0.867, n/ep=0, n/st=1, q_dataset=255.625, rew=12770.51]                          


Epoch #21: test_reward: 3307.709251 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #22: 501it [00:19, 25.91it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.606, env_step=11000, gradient_step=11000, in_dist=4011.926, len=3979, loss/actor=-255.641, loss/alpha=-0.111, loss/cql1=0.576, loss/cql2=0.577, loss/critic1=1.298, loss/critic2=1.290, n/ep=0, n/st=1, q_dataset=255.690, rew=12770.51]                         


Epoch #22: test_reward: 3322.199913 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #23: 501it [00:19, 25.53it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.346, env_step=11500, gradient_step=11500, in_dist=4018.963, len=3979, loss/actor=-256.509, loss/alpha=0.002, loss/cql1=0.569, loss/cql2=0.572, loss/critic1=0.980, loss/critic2=0.922, n/ep=0, n/st=1, q_dataset=256.560, rew=12770.51]                          


Epoch #23: test_reward: 3290.412030 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #24: 501it [00:19, 25.30it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.666, env_step=12000, gradient_step=12000, in_dist=4007.022, len=3979, loss/actor=-256.261, loss/alpha=-0.136, loss/cql1=0.577, loss/cql2=0.579, loss/critic1=1.598, loss/critic2=1.603, n/ep=0, n/st=1, q_dataset=256.323, rew=12770.51]                         


Epoch #24: test_reward: 3320.520803 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #25: 501it [00:20, 24.85it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.830, env_step=12500, gradient_step=12500, in_dist=4021.221, len=3979, loss/actor=-255.979, loss/alpha=-0.142, loss/cql1=0.570, loss/cql2=0.573, loss/critic1=0.864, loss/critic2=0.834, n/ep=0, n/st=1, q_dataset=256.050, rew=12770.51]                         


Epoch #25: test_reward: 3288.395453 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #26: 501it [00:19, 25.35it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.493, env_step=13000, gradient_step=13000, in_dist=4058.162, len=3979, loss/actor=-256.512, loss/alpha=-0.106, loss/cql1=0.546, loss/cql2=0.549, loss/critic1=0.923, loss/critic2=0.933, n/ep=0, n/st=1, q_dataset=256.560, rew=12770.51]                         


Epoch #26: test_reward: 3312.695529 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #27: 501it [00:20, 24.97it/s, ac_loss=0.143, alpha=0.021, dataset_in_dist=467.558, env_step=13500, gradient_step=13500, in_dist=4040.131, len=3979, loss/actor=-256.673, loss/alpha=0.044, loss/cql1=0.554, loss/cql2=0.557, loss/critic1=0.838, loss/critic2=0.830, n/ep=0, n/st=1, q_dataset=256.721, rew=12770.51]                         


Epoch #27: test_reward: 3295.938806 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #28: 501it [00:20, 24.86it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.159, env_step=14000, gradient_step=14000, in_dist=4024.472, len=3979, loss/actor=-256.442, loss/alpha=-0.102, loss/cql1=0.571, loss/cql2=0.572, loss/critic1=0.902, loss/critic2=0.935, n/ep=0, n/st=1, q_dataset=256.498, rew=12770.51]                         


Epoch #28: test_reward: 3279.218708 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #29: 501it [00:19, 25.46it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.999, env_step=14500, gradient_step=14500, in_dist=4026.212, len=3979, loss/actor=-257.087, loss/alpha=0.017, loss/cql1=0.565, loss/cql2=0.565, loss/critic1=0.895, loss/critic2=0.873, n/ep=0, n/st=1, q_dataset=257.135, rew=12770.51]                          


Epoch #29: test_reward: 3316.442506 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #30: 501it [00:19, 25.42it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.021, env_step=15000, gradient_step=15000, in_dist=4017.660, len=3979, loss/actor=-257.173, loss/alpha=0.040, loss/cql1=0.566, loss/cql2=0.567, loss/critic1=0.893, loss/critic2=0.912, n/ep=0, n/st=1, q_dataset=257.226, rew=12770.51]                          


Epoch #30: test_reward: 3302.750116 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #31: 501it [00:19, 25.48it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.667, env_step=15500, gradient_step=15500, in_dist=4021.036, len=3979, loss/actor=-256.923, loss/alpha=0.025, loss/cql1=0.564, loss/cql2=0.565, loss/critic1=1.032, loss/critic2=1.045, n/ep=0, n/st=1, q_dataset=256.975, rew=12770.51]                          


Epoch #31: test_reward: 3337.015705 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #32: 501it [00:19, 25.39it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=466.850, env_step=16000, gradient_step=16000, in_dist=4033.033, len=12020, loss/actor=-257.451, loss/alpha=-0.050, loss/cql1=0.562, loss/cql2=0.562, loss/critic1=0.777, loss/critic2=0.773, n/ep=0, n/st=1, q_dataset=257.502, rew=38241.65]                         


Epoch #32: test_reward: 3293.599589 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #33: 501it [00:19, 25.51it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.079, env_step=16500, gradient_step=16500, in_dist=4021.173, len=12020, loss/actor=-257.701, loss/alpha=0.135, loss/cql1=0.565, loss/cql2=0.565, loss/critic1=1.221, loss/critic2=1.228, n/ep=0, n/st=1, q_dataset=257.738, rew=38241.65]                          


Epoch #33: test_reward: 3326.915439 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #34: 501it [00:19, 25.43it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.501, env_step=17000, gradient_step=17000, in_dist=4020.618, len=12020, loss/actor=-257.655, loss/alpha=-0.146, loss/cql1=0.573, loss/cql2=0.574, loss/critic1=0.875, loss/critic2=0.865, n/ep=0, n/st=1, q_dataset=257.703, rew=38241.65]                         


Epoch #34: test_reward: 3307.164672 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #35: 501it [00:19, 25.85it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.769, env_step=17500, gradient_step=17500, in_dist=4031.704, len=12020, loss/actor=-257.707, loss/alpha=0.069, loss/cql1=0.560, loss/cql2=0.562, loss/critic1=0.896, loss/critic2=0.893, n/ep=0, n/st=1, q_dataset=257.758, rew=38241.65]                          


Epoch #35: test_reward: 3338.216314 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #36: 501it [00:19, 25.42it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.169, env_step=18000, gradient_step=18000, in_dist=4036.512, len=12020, loss/actor=-258.461, loss/alpha=-0.017, loss/cql1=0.558, loss/cql2=0.558, loss/critic1=0.737, loss/critic2=0.744, n/ep=0, n/st=1, q_dataset=258.507, rew=38241.65]                         


Epoch #36: test_reward: 2489.993793 ± 0.000000, best_reward: 3341.361087 ± 0.000000 in #15


Epoch #37: 501it [00:19, 25.90it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.370, env_step=18500, gradient_step=18500, in_dist=4035.797, len=12020, loss/actor=-258.639, loss/alpha=-0.122, loss/cql1=0.566, loss/cql2=0.567, loss/critic1=0.884, loss/critic2=0.877, n/ep=0, n/st=1, q_dataset=258.674, rew=38241.65]                         


Epoch #37: test_reward: 3343.539912 ± 0.000000, best_reward: 3343.539912 ± 0.000000 in #37


Epoch #38: 501it [00:19, 25.91it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.080, env_step=19000, gradient_step=19000, in_dist=4018.485, len=12020, loss/actor=-258.159, loss/alpha=-0.067, loss/cql1=0.569, loss/cql2=0.569, loss/critic1=0.632, loss/critic2=0.652, n/ep=0, n/st=1, q_dataset=258.210, rew=38241.65]                         


Epoch #38: test_reward: 3318.017157 ± 0.000000, best_reward: 3343.539912 ± 0.000000 in #37


Epoch #39: 501it [00:19, 25.74it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.300, env_step=19500, gradient_step=19500, in_dist=4015.332, len=12020, loss/actor=-258.980, loss/alpha=-0.070, loss/cql1=0.575, loss/cql2=0.576, loss/critic1=0.832, loss/critic2=0.865, n/ep=0, n/st=1, q_dataset=259.014, rew=38241.65]                         


Epoch #39: test_reward: 3321.656535 ± 0.000000, best_reward: 3343.539912 ± 0.000000 in #37


Epoch #40: 501it [00:19, 25.65it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.153, env_step=20000, gradient_step=20000, in_dist=4028.569, len=12020, loss/actor=-258.935, loss/alpha=-0.027, loss/cql1=0.563, loss/cql2=0.564, loss/critic1=0.830, loss/critic2=0.822, n/ep=0, n/st=1, q_dataset=258.965, rew=38241.65]                         


Epoch #40: test_reward: 3305.495018 ± 0.000000, best_reward: 3343.539912 ± 0.000000 in #37


Epoch #41: 501it [00:19, 25.86it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.219, env_step=20500, gradient_step=20500, in_dist=4028.598, len=12020, loss/actor=-258.724, loss/alpha=-0.013, loss/cql1=0.564, loss/cql2=0.566, loss/critic1=1.137, loss/critic2=1.135, n/ep=0, n/st=1, q_dataset=258.757, rew=38241.65]                         


Epoch #41: test_reward: 3317.691821 ± 0.000000, best_reward: 3343.539912 ± 0.000000 in #37


Epoch #42: 501it [00:19, 25.69it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.493, env_step=21000, gradient_step=21000, in_dist=4015.156, len=12020, loss/actor=-259.422, loss/alpha=0.053, loss/cql1=0.567, loss/cql2=0.568, loss/critic1=0.602, loss/critic2=0.631, n/ep=0, n/st=1, q_dataset=259.467, rew=38241.65]                          


Epoch #42: test_reward: 3068.039864 ± 0.000000, best_reward: 3343.539912 ± 0.000000 in #37


Epoch #43: 501it [00:19, 25.95it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.874, env_step=21500, gradient_step=21500, in_dist=4028.636, len=12020, loss/actor=-259.098, loss/alpha=0.118, loss/cql1=0.563, loss/cql2=0.565, loss/critic1=1.007, loss/critic2=1.012, n/ep=0, n/st=1, q_dataset=259.137, rew=38241.65]                          


Epoch #43: test_reward: 3329.262058 ± 0.000000, best_reward: 3343.539912 ± 0.000000 in #37


Epoch #44: 501it [00:19, 25.84it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.778, env_step=22000, gradient_step=22000, in_dist=4044.946, len=12020, loss/actor=-259.423, loss/alpha=0.089, loss/cql1=0.556, loss/cql2=0.557, loss/critic1=0.853, loss/critic2=0.862, n/ep=0, n/st=1, q_dataset=259.453, rew=38241.65]                          


Epoch #44: test_reward: 3386.169130 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #45: 501it [00:19, 25.83it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.937, env_step=22500, gradient_step=22500, in_dist=4049.673, len=12020, loss/actor=-259.927, loss/alpha=0.171, loss/cql1=0.549, loss/cql2=0.550, loss/critic1=1.159, loss/critic2=1.123, n/ep=0, n/st=1, q_dataset=259.953, rew=38241.65]                          


Epoch #45: test_reward: 3343.834013 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #46: 501it [00:19, 25.10it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.197, env_step=23000, gradient_step=23000, in_dist=4042.665, len=12020, loss/actor=-259.396, loss/alpha=0.055, loss/cql1=0.559, loss/cql2=0.558, loss/critic1=0.787, loss/critic2=0.784, n/ep=0, n/st=1, q_dataset=259.426, rew=38241.65]                          


Epoch #46: test_reward: 1375.247207 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #47: 501it [00:19, 25.32it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.903, env_step=23500, gradient_step=23500, in_dist=4016.415, len=12020, loss/actor=-260.106, loss/alpha=-0.054, loss/cql1=0.580, loss/cql2=0.580, loss/critic1=0.965, loss/critic2=0.960, n/ep=0, n/st=1, q_dataset=260.126, rew=38241.65]                         


Epoch #47: test_reward: 3354.566548 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #48: 501it [00:19, 25.81it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.513, env_step=24000, gradient_step=24000, in_dist=4038.033, len=12020, loss/actor=-260.707, loss/alpha=-0.016, loss/cql1=0.557, loss/cql2=0.558, loss/critic1=0.784, loss/critic2=0.790, n/ep=0, n/st=1, q_dataset=260.744, rew=38241.65]                         


Epoch #48: test_reward: 2841.954248 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #49: 501it [00:19, 25.81it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.473, env_step=24500, gradient_step=24500, in_dist=4064.769, len=12020, loss/actor=-260.664, loss/alpha=0.025, loss/cql1=0.543, loss/cql2=0.544, loss/critic1=0.939, loss/critic2=0.950, n/ep=0, n/st=1, q_dataset=260.706, rew=38241.65]                          


Epoch #49: test_reward: 3341.594367 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #50: 501it [00:19, 25.95it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.215, env_step=25000, gradient_step=25000, in_dist=4063.421, len=12020, loss/actor=-260.709, loss/alpha=-0.013, loss/cql1=0.548, loss/cql2=0.549, loss/critic1=0.939, loss/critic2=0.893, n/ep=0, n/st=1, q_dataset=260.741, rew=38241.65]                         


Epoch #50: test_reward: 3321.025541 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #51: 501it [00:19, 25.90it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=466.959, env_step=25500, gradient_step=25500, in_dist=4050.266, len=12020, loss/actor=-260.726, loss/alpha=-0.037, loss/cql1=0.556, loss/cql2=0.555, loss/critic1=1.170, loss/critic2=1.170, n/ep=0, n/st=1, q_dataset=260.761, rew=38241.65]                         


Epoch #51: test_reward: 3341.565135 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #52: 501it [00:19, 25.39it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.600, env_step=26000, gradient_step=26000, in_dist=4062.610, len=12020, loss/actor=-260.291, loss/alpha=-0.010, loss/cql1=0.547, loss/cql2=0.547, loss/critic1=1.203, loss/critic2=1.192, n/ep=0, n/st=1, q_dataset=260.322, rew=38241.65]                         


Epoch #52: test_reward: 3342.627401 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #53: 501it [00:19, 25.27it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.703, env_step=26500, gradient_step=26500, in_dist=4069.676, len=12020, loss/actor=-260.050, loss/alpha=0.001, loss/cql1=0.543, loss/cql2=0.545, loss/critic1=0.783, loss/critic2=0.751, n/ep=0, n/st=1, q_dataset=260.093, rew=38241.65]                          


Epoch #53: test_reward: 1492.540540 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #54: 501it [00:19, 25.43it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=467.043, env_step=27000, gradient_step=27000, in_dist=4078.824, len=12020, loss/actor=-260.636, loss/alpha=0.079, loss/cql1=0.534, loss/cql2=0.535, loss/critic1=1.260, loss/critic2=1.218, n/ep=0, n/st=1, q_dataset=260.683, rew=38241.65]                          


Epoch #54: test_reward: 2264.157316 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #55: 501it [00:19, 25.53it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.654, env_step=27500, gradient_step=27500, in_dist=4068.528, len=12020, loss/actor=-261.283, loss/alpha=0.214, loss/cql1=0.540, loss/cql2=0.540, loss/critic1=0.763, loss/critic2=0.765, n/ep=0, n/st=1, q_dataset=261.313, rew=38241.65]                          


Epoch #55: test_reward: 1335.382725 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #56: 501it [00:19, 25.54it/s, ac_loss=0.143, alpha=0.020, dataset_in_dist=467.413, env_step=28000, gradient_step=28000, in_dist=4044.246, len=11889, loss/actor=-261.280, loss/alpha=0.103, loss/cql1=0.555, loss/cql2=0.555, loss/critic1=0.806, loss/critic2=0.763, n/ep=0, n/st=1, q_dataset=261.316, rew=38298.76]                          


Epoch #56: test_reward: 3309.720924 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #57: 501it [00:19, 25.66it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.768, env_step=28500, gradient_step=28500, in_dist=4050.495, len=11889, loss/actor=-260.966, loss/alpha=-0.030, loss/cql1=0.555, loss/cql2=0.555, loss/critic1=0.919, loss/critic2=0.908, n/ep=0, n/st=1, q_dataset=260.992, rew=38298.76]                         


Epoch #57: test_reward: 3348.735849 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #58: 501it [00:19, 25.86it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.346, env_step=29000, gradient_step=29000, in_dist=4048.985, len=11889, loss/actor=-260.918, loss/alpha=-0.013, loss/cql1=0.558, loss/cql2=0.557, loss/critic1=1.025, loss/critic2=1.035, n/ep=0, n/st=1, q_dataset=260.934, rew=38298.76]                         


Epoch #58: test_reward: 3297.755711 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #59: 501it [00:19, 25.76it/s, ac_loss=0.148, alpha=0.020, dataset_in_dist=466.668, env_step=29500, gradient_step=29500, in_dist=4047.167, len=11889, loss/actor=-261.264, loss/alpha=0.047, loss/cql1=0.554, loss/cql2=0.554, loss/critic1=0.628, loss/critic2=0.633, n/ep=0, n/st=1, q_dataset=261.292, rew=38298.76]                          


Epoch #59: test_reward: 3319.819019 ± 0.000000, best_reward: 3386.169130 ± 0.000000 in #44


Epoch #60: 501it [00:19, 25.72it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.602, env_step=30000, gradient_step=30000, in_dist=4025.341, len=11889, loss/actor=-261.825, loss/alpha=0.063, loss/cql1=0.566, loss/cql2=0.566, loss/critic1=0.907, loss/critic2=0.895, n/ep=0, n/st=1, q_dataset=261.851, rew=38298.76]                          


Epoch #60: test_reward: 3408.019040 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #61: 501it [00:19, 25.66it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=466.714, env_step=30500, gradient_step=30500, in_dist=4058.751, len=11889, loss/actor=-261.739, loss/alpha=0.099, loss/cql1=0.548, loss/cql2=0.548, loss/critic1=1.146, loss/critic2=1.118, n/ep=0, n/st=1, q_dataset=261.757, rew=38298.76]                          


Epoch #61: test_reward: 3399.702625 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #62: 501it [00:19, 25.73it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.186, env_step=31000, gradient_step=31000, in_dist=4025.823, len=11889, loss/actor=-261.426, loss/alpha=0.038, loss/cql1=0.565, loss/cql2=0.566, loss/critic1=1.121, loss/critic2=1.144, n/ep=0, n/st=1, q_dataset=261.451, rew=38298.76]                          


Epoch #62: test_reward: 2215.363640 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #63: 501it [00:19, 25.70it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.180, env_step=31500, gradient_step=31500, in_dist=4054.840, len=3570, loss/actor=-261.669, loss/alpha=0.121, loss/cql1=0.553, loss/cql2=0.554, loss/critic1=1.032, loss/critic2=1.067, n/ep=0, n/st=1, q_dataset=261.676, rew=11546.17]                           


Epoch #63: test_reward: 3355.534643 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #64: 501it [00:19, 25.77it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=466.916, env_step=32000, gradient_step=32000, in_dist=4026.987, len=3570, loss/actor=-262.020, loss/alpha=0.108, loss/cql1=0.566, loss/cql2=0.566, loss/critic1=0.992, loss/critic2=1.016, n/ep=0, n/st=1, q_dataset=262.039, rew=11546.17]                          


Epoch #64: test_reward: 1934.784045 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #65: 501it [00:19, 25.79it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.413, env_step=32500, gradient_step=32500, in_dist=4055.579, len=3570, loss/actor=-261.880, loss/alpha=-0.137, loss/cql1=0.559, loss/cql2=0.559, loss/critic1=0.902, loss/critic2=0.882, n/ep=0, n/st=1, q_dataset=261.898, rew=11546.17]                         


Epoch #65: test_reward: 3330.378433 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #66: 501it [00:19, 25.78it/s, ac_loss=0.148, alpha=0.020, dataset_in_dist=466.519, env_step=33000, gradient_step=33000, in_dist=4024.131, len=3570, loss/actor=-262.028, loss/alpha=-0.070, loss/cql1=0.573, loss/cql2=0.572, loss/critic1=0.794, loss/critic2=0.777, n/ep=0, n/st=1, q_dataset=262.055, rew=11546.17]                         


Epoch #66: test_reward: 3322.092442 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #67: 501it [00:19, 25.84it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.280, env_step=33500, gradient_step=33500, in_dist=4045.549, len=3570, loss/actor=-262.273, loss/alpha=0.065, loss/cql1=0.554, loss/cql2=0.555, loss/critic1=0.845, loss/critic2=0.818, n/ep=0, n/st=1, q_dataset=262.292, rew=11546.17]                          


Epoch #67: test_reward: 3340.163344 ± 0.000000, best_reward: 3408.019040 ± 0.000000 in #60


Epoch #68: 501it [00:19, 25.87it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=466.819, env_step=34000, gradient_step=34000, in_dist=4056.560, len=3570, loss/actor=-262.860, loss/alpha=-0.005, loss/cql1=0.553, loss/cql2=0.552, loss/critic1=0.896, loss/critic2=0.882, n/ep=0, n/st=1, q_dataset=262.880, rew=11546.17]                         


Epoch #68: test_reward: 3426.092740 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #69: 501it [00:19, 25.70it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.445, env_step=34500, gradient_step=34500, in_dist=4057.847, len=3032, loss/actor=-262.448, loss/alpha=0.144, loss/cql1=0.544, loss/cql2=0.545, loss/critic1=1.120, loss/critic2=1.099, n/ep=0, n/st=1, q_dataset=262.476, rew=9820.14]                           


Epoch #69: test_reward: 3351.880069 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #70: 501it [00:19, 25.92it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.640, env_step=35000, gradient_step=35000, in_dist=4079.826, len=3032, loss/actor=-262.452, loss/alpha=-0.044, loss/cql1=0.539, loss/cql2=0.540, loss/critic1=1.032, loss/critic2=1.031, n/ep=0, n/st=1, q_dataset=262.475, rew=9820.14]                         


Epoch #70: test_reward: 3333.387724 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #71: 501it [00:19, 25.90it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=466.995, env_step=35500, gradient_step=35500, in_dist=4024.688, len=3032, loss/actor=-262.581, loss/alpha=-0.175, loss/cql1=0.574, loss/cql2=0.575, loss/critic1=0.746, loss/critic2=0.735, n/ep=0, n/st=1, q_dataset=262.615, rew=9820.14]                         


Epoch #71: test_reward: 3347.693346 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #72: 501it [00:19, 25.94it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.433, env_step=36000, gradient_step=36000, in_dist=4043.107, len=3032, loss/actor=-262.809, loss/alpha=-0.146, loss/cql1=0.562, loss/cql2=0.563, loss/critic1=0.980, loss/critic2=1.013, n/ep=0, n/st=1, q_dataset=262.828, rew=9820.14]                         


Epoch #72: test_reward: 3364.229527 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #73: 501it [00:19, 25.92it/s, ac_loss=0.142, alpha=0.020, dataset_in_dist=468.044, env_step=36500, gradient_step=36500, in_dist=4039.661, len=3032, loss/actor=-263.154, loss/alpha=-0.120, loss/cql1=0.565, loss/cql2=0.565, loss/critic1=1.039, loss/critic2=1.051, n/ep=0, n/st=1, q_dataset=263.163, rew=9820.14]                         


Epoch #73: test_reward: 3346.281260 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #74: 501it [00:19, 25.86it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.129, env_step=37000, gradient_step=37000, in_dist=4062.171, len=3032, loss/actor=-263.329, loss/alpha=0.096, loss/cql1=0.541, loss/cql2=0.540, loss/critic1=0.833, loss/critic2=0.822, n/ep=0, n/st=1, q_dataset=263.344, rew=9820.14]                          


Epoch #74: test_reward: 3360.922654 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #75: 501it [00:19, 25.68it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.203, env_step=37500, gradient_step=37500, in_dist=4050.508, len=2961, loss/actor=-263.624, loss/alpha=-0.001, loss/cql1=0.555, loss/cql2=0.555, loss/critic1=0.784, loss/critic2=0.785, n/ep=0, n/st=1, q_dataset=263.634, rew=9648.83]                         


Epoch #75: test_reward: 1945.793941 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #76: 501it [00:19, 25.76it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.999, env_step=38000, gradient_step=38000, in_dist=4043.885, len=2961, loss/actor=-263.602, loss/alpha=0.046, loss/cql1=0.556, loss/cql2=0.556, loss/critic1=0.784, loss/critic2=0.776, n/ep=0, n/st=1, q_dataset=263.626, rew=9648.83]                         


Epoch #76: test_reward: 3420.357616 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #77: 501it [00:19, 25.75it/s, ac_loss=0.143, alpha=0.020, dataset_in_dist=468.155, env_step=38500, gradient_step=38500, in_dist=4053.903, len=2961, loss/actor=-263.721, loss/alpha=-0.050, loss/cql1=0.553, loss/cql2=0.553, loss/critic1=1.233, loss/critic2=1.232, n/ep=0, n/st=1, q_dataset=263.742, rew=9648.83]                         


Epoch #77: test_reward: 1867.930458 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #78: 501it [00:19, 25.92it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.370, env_step=39000, gradient_step=39000, in_dist=4050.653, len=2961, loss/actor=-263.845, loss/alpha=-0.040, loss/cql1=0.559, loss/cql2=0.558, loss/critic1=0.760, loss/critic2=0.768, n/ep=0, n/st=1, q_dataset=263.849, rew=9648.83]                         


Epoch #78: test_reward: 3338.304031 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #79: 501it [00:19, 25.85it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.370, env_step=39500, gradient_step=39500, in_dist=4042.553, len=2961, loss/actor=-263.981, loss/alpha=-0.004, loss/cql1=0.558, loss/cql2=0.558, loss/critic1=0.829, loss/critic2=0.826, n/ep=0, n/st=1, q_dataset=263.999, rew=9648.83]                         


Epoch #79: test_reward: 3350.044179 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #80: 501it [00:20, 24.91it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.569, env_step=40000, gradient_step=40000, in_dist=4064.099, len=2961, loss/actor=-263.930, loss/alpha=0.151, loss/cql1=0.542, loss/cql2=0.542, loss/critic1=1.235, loss/critic2=1.230, n/ep=0, n/st=1, q_dataset=263.938, rew=9648.83]                          


Epoch #80: test_reward: 3343.659443 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #81: 501it [00:20, 24.58it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=467.110, env_step=40500, gradient_step=40500, in_dist=4023.533, len=2961, loss/actor=-263.931, loss/alpha=-0.121, loss/cql1=0.576, loss/cql2=0.577, loss/critic1=1.284, loss/critic2=1.306, n/ep=0, n/st=1, q_dataset=263.933, rew=9648.83]                         


Epoch #81: test_reward: 3371.501238 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #82: 501it [00:19, 25.16it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.424, env_step=41000, gradient_step=41000, in_dist=4058.556, len=2961, loss/actor=-264.898, loss/alpha=-0.022, loss/cql1=0.552, loss/cql2=0.553, loss/critic1=1.397, loss/critic2=1.377, n/ep=0, n/st=1, q_dataset=264.906, rew=9648.83]                         


Epoch #82: test_reward: 3344.017623 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #83: 501it [00:19, 25.56it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=467.131, env_step=41500, gradient_step=41500, in_dist=4058.524, len=2961, loss/actor=-264.294, loss/alpha=0.175, loss/cql1=0.546, loss/cql2=0.546, loss/critic1=1.068, loss/critic2=1.045, n/ep=0, n/st=1, q_dataset=264.303, rew=9648.83]                          


Epoch #83: test_reward: 3087.211831 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #84: 501it [00:19, 25.33it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.462, env_step=42000, gradient_step=42000, in_dist=4055.806, len=2961, loss/actor=-264.425, loss/alpha=-0.007, loss/cql1=0.556, loss/cql2=0.556, loss/critic1=1.321, loss/critic2=1.342, n/ep=0, n/st=1, q_dataset=264.432, rew=9648.83]                         


Epoch #84: test_reward: 3356.376158 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #85: 501it [00:19, 25.42it/s, ac_loss=0.145, alpha=0.020, dataset_in_dist=467.304, env_step=42500, gradient_step=42500, in_dist=4036.969, len=2961, loss/actor=-264.015, loss/alpha=-0.077, loss/cql1=0.568, loss/cql2=0.569, loss/critic1=0.979, loss/critic2=0.950, n/ep=0, n/st=1, q_dataset=264.015, rew=9648.83]                         


Epoch #85: test_reward: 3410.736932 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #86: 501it [00:19, 25.62it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=466.918, env_step=43000, gradient_step=43000, in_dist=4049.343, len=2961, loss/actor=-264.711, loss/alpha=-0.018, loss/cql1=0.557, loss/cql2=0.556, loss/critic1=1.071, loss/critic2=1.053, n/ep=0, n/st=1, q_dataset=264.721, rew=9648.83]                         


Epoch #86: test_reward: 3358.345085 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #87: 501it [00:19, 25.54it/s, ac_loss=0.146, alpha=0.020, dataset_in_dist=466.973, env_step=43500, gradient_step=43500, in_dist=4054.919, len=2961, loss/actor=-264.954, loss/alpha=-0.065, loss/cql1=0.558, loss/cql2=0.558, loss/critic1=1.002, loss/critic2=1.006, n/ep=0, n/st=1, q_dataset=264.956, rew=9648.83]                         


Epoch #87: test_reward: 2821.690538 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #88: 501it [00:19, 25.36it/s, ac_loss=0.148, alpha=0.020, dataset_in_dist=466.538, env_step=44000, gradient_step=44000, in_dist=4041.435, len=2961, loss/actor=-264.855, loss/alpha=0.024, loss/cql1=0.561, loss/cql2=0.562, loss/critic1=1.634, loss/critic2=1.606, n/ep=0, n/st=1, q_dataset=264.866, rew=9648.83]                          


Epoch #88: test_reward: 2187.605292 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #89: 501it [00:20, 24.72it/s, ac_loss=0.144, alpha=0.020, dataset_in_dist=467.549, env_step=44500, gradient_step=44500, in_dist=4033.108, len=2961, loss/actor=-265.001, loss/alpha=0.065, loss/cql1=0.569, loss/cql2=0.570, loss/critic1=1.500, loss/critic2=1.491, n/ep=0, n/st=1, q_dataset=265.009, rew=9648.83]                          


Epoch #89: test_reward: 3358.978655 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #90: 501it [00:19, 25.49it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.029, env_step=45000, gradient_step=45000, in_dist=4064.223, len=2961, loss/actor=-264.900, loss/alpha=0.174, loss/cql1=0.545, loss/cql2=0.545, loss/critic1=1.762, loss/critic2=1.757, n/ep=0, n/st=1, q_dataset=264.899, rew=9648.83]                          


Epoch #90: test_reward: 3372.040505 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #91: 501it [00:19, 25.48it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.698, env_step=45500, gradient_step=45500, in_dist=4059.292, len=2961, loss/actor=-264.964, loss/alpha=0.070, loss/cql1=0.549, loss/cql2=0.548, loss/critic1=1.421, loss/critic2=1.426, n/ep=0, n/st=1, q_dataset=264.959, rew=9648.83]                          


Epoch #91: test_reward: 3336.119897 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #92: 501it [00:19, 25.17it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.405, env_step=46000, gradient_step=46000, in_dist=4041.052, len=2961, loss/actor=-265.005, loss/alpha=0.000, loss/cql1=0.564, loss/cql2=0.564, loss/critic1=1.507, loss/critic2=1.449, n/ep=0, n/st=1, q_dataset=265.001, rew=9648.83]                          


Epoch #92: test_reward: 3366.315231 ± 0.000000, best_reward: 3426.092740 ± 0.000000 in #68


Epoch #93: 501it [00:19, 25.49it/s, ac_loss=0.143, alpha=0.021, dataset_in_dist=467.684, env_step=46500, gradient_step=46500, in_dist=4052.436, len=2961, loss/actor=-265.002, loss/alpha=-0.033, loss/cql1=0.557, loss/cql2=0.558, loss/critic1=1.181, loss/critic2=1.145, n/ep=0, n/st=1, q_dataset=265.007, rew=9648.83]                         


Epoch #93: test_reward: 3495.840031 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #94: 501it [00:19, 25.15it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.877, env_step=47000, gradient_step=47000, in_dist=4042.721, len=2961, loss/actor=-265.331, loss/alpha=-0.070, loss/cql1=0.562, loss/cql2=0.561, loss/critic1=1.148, loss/critic2=1.148, n/ep=0, n/st=1, q_dataset=265.342, rew=9648.83]                         


Epoch #94: test_reward: 3365.245251 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #95: 501it [00:19, 25.88it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=467.038, env_step=47500, gradient_step=47500, in_dist=4051.214, len=2961, loss/actor=-265.787, loss/alpha=0.069, loss/cql1=0.555, loss/cql2=0.555, loss/critic1=1.641, loss/critic2=1.652, n/ep=0, n/st=1, q_dataset=265.786, rew=9648.83]                          


Epoch #95: test_reward: 2834.401790 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #96: 501it [00:19, 25.87it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.649, env_step=48000, gradient_step=48000, in_dist=4015.129, len=2961, loss/actor=-265.165, loss/alpha=-0.101, loss/cql1=0.580, loss/cql2=0.580, loss/critic1=1.154, loss/critic2=1.156, n/ep=0, n/st=1, q_dataset=265.151, rew=9648.83]                         


Epoch #96: test_reward: 3358.655453 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #97: 501it [00:19, 25.80it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.024, env_step=48500, gradient_step=48500, in_dist=4023.856, len=2961, loss/actor=-265.477, loss/alpha=0.031, loss/cql1=0.568, loss/cql2=0.566, loss/critic1=0.849, loss/critic2=0.845, n/ep=0, n/st=1, q_dataset=265.493, rew=9648.83]                          


Epoch #97: test_reward: 3350.574134 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #98: 501it [00:19, 25.58it/s, ac_loss=0.143, alpha=0.021, dataset_in_dist=467.419, env_step=49000, gradient_step=49000, in_dist=4046.557, len=2961, loss/actor=-265.865, loss/alpha=0.132, loss/cql1=0.556, loss/cql2=0.557, loss/critic1=0.978, loss/critic2=0.984, n/ep=0, n/st=1, q_dataset=265.849, rew=9648.83]                          


Epoch #98: test_reward: 3333.661331 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #99: 501it [00:19, 25.33it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.401, env_step=49500, gradient_step=49500, in_dist=4039.818, len=2961, loss/actor=-265.780, loss/alpha=-0.043, loss/cql1=0.568, loss/cql2=0.569, loss/critic1=0.743, loss/critic2=0.721, n/ep=0, n/st=1, q_dataset=265.763, rew=9648.83]                         


Epoch #99: test_reward: 3366.736405 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #100: 501it [00:19, 25.82it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.678, env_step=50000, gradient_step=50000, in_dist=4034.933, len=2961, loss/actor=-266.212, loss/alpha=-0.126, loss/cql1=0.573, loss/cql2=0.573, loss/critic1=1.161, loss/critic2=1.116, n/ep=0, n/st=1, q_dataset=266.207, rew=9648.83]                         


Epoch #100: test_reward: 3345.638088 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #101: 501it [00:19, 25.71it/s, ac_loss=0.147, alpha=0.020, dataset_in_dist=466.631, env_step=50500, gradient_step=50500, in_dist=4046.841, len=2961, loss/actor=-265.780, loss/alpha=-0.140, loss/cql1=0.566, loss/cql2=0.567, loss/critic1=1.861, loss/critic2=1.853, n/ep=0, n/st=1, q_dataset=265.784, rew=9648.83]                         


Epoch #101: test_reward: 3363.810660 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #102: 501it [00:19, 25.28it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.516, env_step=51000, gradient_step=51000, in_dist=4039.697, len=2961, loss/actor=-265.811, loss/alpha=0.005, loss/cql1=0.564, loss/cql2=0.564, loss/critic1=0.886, loss/critic2=0.885, n/ep=0, n/st=1, q_dataset=265.789, rew=9648.83]                          


Epoch #102: test_reward: 3350.076637 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #103: 501it [00:19, 25.35it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.893, env_step=51500, gradient_step=51500, in_dist=4038.575, len=2961, loss/actor=-266.103, loss/alpha=0.005, loss/cql1=0.563, loss/cql2=0.564, loss/critic1=0.870, loss/critic2=0.878, n/ep=0, n/st=1, q_dataset=266.091, rew=9648.83]                          


Epoch #103: test_reward: 3359.081073 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #104: 501it [00:19, 25.32it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.585, env_step=52000, gradient_step=52000, in_dist=4023.121, len=2961, loss/actor=-266.349, loss/alpha=-0.045, loss/cql1=0.573, loss/cql2=0.573, loss/critic1=0.851, loss/critic2=0.853, n/ep=0, n/st=1, q_dataset=266.343, rew=9648.83]                         


Epoch #104: test_reward: 2548.338608 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #105: 501it [00:20, 24.82it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.662, env_step=52500, gradient_step=52500, in_dist=4059.345, len=2961, loss/actor=-266.142, loss/alpha=0.014, loss/cql1=0.554, loss/cql2=0.553, loss/critic1=1.312, loss/critic2=1.319, n/ep=0, n/st=1, q_dataset=266.124, rew=9648.83]                          


Epoch #105: test_reward: 3350.195876 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #106: 501it [00:19, 25.12it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.034, env_step=53000, gradient_step=53000, in_dist=4062.777, len=2961, loss/actor=-266.519, loss/alpha=0.188, loss/cql1=0.545, loss/cql2=0.545, loss/critic1=1.374, loss/critic2=1.371, n/ep=0, n/st=1, q_dataset=266.517, rew=9648.83]                          


Epoch #106: test_reward: 3124.432973 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #107: 501it [00:19, 25.70it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.669, env_step=53500, gradient_step=53500, in_dist=4033.188, len=2961, loss/actor=-266.531, loss/alpha=-0.016, loss/cql1=0.566, loss/cql2=0.565, loss/critic1=1.341, loss/critic2=1.319, n/ep=0, n/st=1, q_dataset=266.530, rew=9648.83]                         


Epoch #107: test_reward: 3367.080920 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #108: 501it [00:19, 25.81it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.988, env_step=54000, gradient_step=54000, in_dist=4083.500, len=2961, loss/actor=-266.811, loss/alpha=0.138, loss/cql1=0.533, loss/cql2=0.533, loss/critic1=1.086, loss/critic2=1.087, n/ep=0, n/st=1, q_dataset=266.808, rew=9648.83]                          


Epoch #108: test_reward: 3360.034894 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #109: 501it [00:19, 25.65it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.027, env_step=54500, gradient_step=54500, in_dist=4030.178, len=2961, loss/actor=-266.502, loss/alpha=0.088, loss/cql1=0.563, loss/cql2=0.563, loss/critic1=1.283, loss/critic2=1.280, n/ep=0, n/st=1, q_dataset=266.507, rew=9648.83]                          


Epoch #109: test_reward: 3354.597457 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #110: 501it [00:19, 25.91it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.557, env_step=55000, gradient_step=55000, in_dist=4037.229, len=2961, loss/actor=-266.685, loss/alpha=-0.015, loss/cql1=0.566, loss/cql2=0.565, loss/critic1=0.696, loss/critic2=0.699, n/ep=0, n/st=1, q_dataset=266.670, rew=9648.83]                         


Epoch #110: test_reward: 3377.314785 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #111: 501it [00:19, 25.93it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.405, env_step=55500, gradient_step=55500, in_dist=4043.112, len=2961, loss/actor=-266.766, loss/alpha=-0.050, loss/cql1=0.562, loss/cql2=0.563, loss/critic1=1.124, loss/critic2=1.084, n/ep=0, n/st=1, q_dataset=266.762, rew=9648.83]                         


Epoch #111: test_reward: 3326.620678 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #112: 501it [00:19, 25.77it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=466.965, env_step=56000, gradient_step=56000, in_dist=4044.353, len=2961, loss/actor=-267.332, loss/alpha=-0.007, loss/cql1=0.561, loss/cql2=0.562, loss/critic1=1.329, loss/critic2=1.357, n/ep=0, n/st=1, q_dataset=267.314, rew=9648.83]                         


Epoch #112: test_reward: 3353.175652 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #113: 501it [00:19, 25.49it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.931, env_step=56500, gradient_step=56500, in_dist=4051.700, len=2961, loss/actor=-266.910, loss/alpha=-0.110, loss/cql1=0.560, loss/cql2=0.561, loss/critic1=1.534, loss/critic2=1.519, n/ep=0, n/st=1, q_dataset=266.902, rew=9648.83]                         


Epoch #113: test_reward: 3370.170652 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #114: 501it [00:19, 25.82it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.550, env_step=57000, gradient_step=57000, in_dist=4061.276, len=2961, loss/actor=-267.499, loss/alpha=0.213, loss/cql1=0.542, loss/cql2=0.543, loss/critic1=2.260, loss/critic2=2.242, n/ep=0, n/st=1, q_dataset=267.492, rew=9648.83]                          


Epoch #114: test_reward: 3348.202778 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #115: 501it [00:19, 25.84it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.797, env_step=57500, gradient_step=57500, in_dist=4035.851, len=2961, loss/actor=-267.233, loss/alpha=-0.049, loss/cql1=0.570, loss/cql2=0.571, loss/critic1=1.075, loss/critic2=1.057, n/ep=0, n/st=1, q_dataset=267.215, rew=9648.83]                         


Epoch #115: test_reward: 3341.857346 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #116: 501it [00:19, 25.41it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.489, env_step=58000, gradient_step=58000, in_dist=4029.461, len=20415, loss/actor=-267.420, loss/alpha=-0.101, loss/cql1=0.575, loss/cql2=0.575, loss/critic1=1.485, loss/critic2=1.492, n/ep=0, n/st=1, q_dataset=267.406, rew=66236.97]                         


Epoch #116: test_reward: 3394.199153 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #117: 501it [00:19, 25.76it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.259, env_step=58500, gradient_step=58500, in_dist=4049.136, len=20415, loss/actor=-267.706, loss/alpha=0.006, loss/cql1=0.561, loss/cql2=0.561, loss/critic1=0.991, loss/critic2=0.951, n/ep=0, n/st=1, q_dataset=267.696, rew=66236.97]                          


Epoch #117: test_reward: 3390.523432 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #118: 501it [00:19, 25.47it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.647, env_step=59000, gradient_step=59000, in_dist=4043.171, len=20415, loss/actor=-267.697, loss/alpha=-0.092, loss/cql1=0.564, loss/cql2=0.564, loss/critic1=0.539, loss/critic2=0.531, n/ep=0, n/st=1, q_dataset=267.695, rew=66236.97]                         


Epoch #118: test_reward: 3372.996890 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #119: 501it [00:19, 25.37it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.556, env_step=59500, gradient_step=59500, in_dist=4038.981, len=20415, loss/actor=-267.588, loss/alpha=-0.080, loss/cql1=0.568, loss/cql2=0.568, loss/critic1=1.743, loss/critic2=1.721, n/ep=0, n/st=1, q_dataset=267.584, rew=66236.97]                         


Epoch #119: test_reward: 3373.800269 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #120: 501it [00:19, 25.68it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.865, env_step=60000, gradient_step=60000, in_dist=4035.943, len=20415, loss/actor=-267.557, loss/alpha=0.029, loss/cql1=0.563, loss/cql2=0.564, loss/critic1=1.430, loss/critic2=1.424, n/ep=0, n/st=1, q_dataset=267.544, rew=66236.97]                          


Epoch #120: test_reward: 3362.403725 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #121: 501it [00:19, 25.08it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.068, env_step=60500, gradient_step=60500, in_dist=4040.502, len=20415, loss/actor=-267.741, loss/alpha=-0.267, loss/cql1=0.573, loss/cql2=0.573, loss/critic1=1.000, loss/critic2=1.019, n/ep=0, n/st=1, q_dataset=267.731, rew=66236.97]                         


Epoch #121: test_reward: 3351.330408 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #122: 501it [00:19, 25.12it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.940, env_step=61000, gradient_step=61000, in_dist=4059.152, len=20415, loss/actor=-268.165, loss/alpha=0.077, loss/cql1=0.549, loss/cql2=0.550, loss/critic1=1.460, loss/critic2=1.445, n/ep=0, n/st=1, q_dataset=268.169, rew=66236.97]                          


Epoch #122: test_reward: 3330.531732 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #123: 501it [00:19, 25.51it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.029, env_step=61500, gradient_step=61500, in_dist=4053.318, len=20415, loss/actor=-267.650, loss/alpha=0.105, loss/cql1=0.556, loss/cql2=0.557, loss/critic1=1.221, loss/critic2=1.223, n/ep=0, n/st=1, q_dataset=267.627, rew=66236.97]                          


Epoch #123: test_reward: 2873.346991 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #124: 501it [00:19, 25.72it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.555, env_step=62000, gradient_step=62000, in_dist=4040.491, len=20415, loss/actor=-267.990, loss/alpha=-0.058, loss/cql1=0.564, loss/cql2=0.566, loss/critic1=1.458, loss/critic2=1.442, n/ep=0, n/st=1, q_dataset=267.987, rew=66236.97]                         


Epoch #124: test_reward: 3429.690782 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #125: 501it [00:19, 25.71it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.255, env_step=62500, gradient_step=62500, in_dist=4043.552, len=20415, loss/actor=-267.984, loss/alpha=0.043, loss/cql1=0.562, loss/cql2=0.563, loss/critic1=1.894, loss/critic2=1.880, n/ep=0, n/st=1, q_dataset=267.971, rew=66236.97]                          


Epoch #125: test_reward: 3346.358893 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #126: 501it [00:19, 25.71it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.734, env_step=63000, gradient_step=63000, in_dist=4040.916, len=20415, loss/actor=-268.105, loss/alpha=0.110, loss/cql1=0.563, loss/cql2=0.564, loss/critic1=1.206, loss/critic2=1.213, n/ep=0, n/st=1, q_dataset=268.074, rew=66236.97]                          


Epoch #126: test_reward: 3373.572136 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #127: 501it [00:19, 25.46it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.710, env_step=63500, gradient_step=63500, in_dist=4040.664, len=20415, loss/actor=-268.502, loss/alpha=0.031, loss/cql1=0.563, loss/cql2=0.564, loss/critic1=1.453, loss/critic2=1.470, n/ep=0, n/st=1, q_dataset=268.469, rew=66236.97]                          


Epoch #127: test_reward: 3382.715615 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #128: 501it [00:19, 25.78it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.032, env_step=64000, gradient_step=64000, in_dist=4025.159, len=20415, loss/actor=-268.603, loss/alpha=0.230, loss/cql1=0.566, loss/cql2=0.565, loss/critic1=1.129, loss/critic2=1.107, n/ep=0, n/st=1, q_dataset=268.576, rew=66236.97]                          


Epoch #128: test_reward: 3359.647328 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #129: 501it [00:19, 25.79it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.353, env_step=64500, gradient_step=64500, in_dist=4024.266, len=20415, loss/actor=-268.415, loss/alpha=-0.070, loss/cql1=0.574, loss/cql2=0.575, loss/critic1=1.179, loss/critic2=1.183, n/ep=0, n/st=1, q_dataset=268.402, rew=66236.97]                         


Epoch #129: test_reward: 3340.630039 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #130: 501it [00:19, 25.81it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.068, env_step=65000, gradient_step=65000, in_dist=4022.063, len=20415, loss/actor=-267.884, loss/alpha=-0.091, loss/cql1=0.576, loss/cql2=0.577, loss/critic1=1.766, loss/critic2=1.764, n/ep=0, n/st=1, q_dataset=267.870, rew=66236.97]                         


Epoch #130: test_reward: 3340.296846 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #131: 501it [00:19, 25.83it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.766, env_step=65500, gradient_step=65500, in_dist=4036.726, len=20415, loss/actor=-268.326, loss/alpha=0.075, loss/cql1=0.563, loss/cql2=0.563, loss/critic1=1.124, loss/critic2=1.105, n/ep=0, n/st=1, q_dataset=268.316, rew=66236.97]                          


Epoch #131: test_reward: 3346.134891 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #132: 501it [00:19, 25.58it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.357, env_step=66000, gradient_step=66000, in_dist=4019.538, len=20415, loss/actor=-268.436, loss/alpha=-0.042, loss/cql1=0.575, loss/cql2=0.576, loss/critic1=1.281, loss/critic2=1.275, n/ep=0, n/st=1, q_dataset=268.418, rew=66236.97]                         


Epoch #132: test_reward: 3358.107545 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #133: 501it [00:19, 25.75it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.258, env_step=66500, gradient_step=66500, in_dist=4021.270, len=20415, loss/actor=-268.312, loss/alpha=-0.025, loss/cql1=0.573, loss/cql2=0.572, loss/critic1=1.113, loss/critic2=1.110, n/ep=0, n/st=1, q_dataset=268.293, rew=66236.97]                         


Epoch #133: test_reward: 3351.205303 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #134: 501it [00:19, 25.79it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.427, env_step=67000, gradient_step=67000, in_dist=4033.590, len=20415, loss/actor=-268.501, loss/alpha=0.063, loss/cql1=0.568, loss/cql2=0.567, loss/critic1=1.477, loss/critic2=1.469, n/ep=0, n/st=1, q_dataset=268.482, rew=66236.97]                          


Epoch #134: test_reward: 3347.472959 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #135: 501it [00:19, 25.64it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.589, env_step=67500, gradient_step=67500, in_dist=4039.933, len=20415, loss/actor=-269.393, loss/alpha=0.001, loss/cql1=0.566, loss/cql2=0.566, loss/critic1=0.826, loss/critic2=0.809, n/ep=0, n/st=1, q_dataset=269.369, rew=66236.97]                          


Epoch #135: test_reward: 3359.966938 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #136: 501it [00:19, 25.59it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.972, env_step=68000, gradient_step=68000, in_dist=4025.853, len=20415, loss/actor=-268.574, loss/alpha=0.049, loss/cql1=0.574, loss/cql2=0.573, loss/critic1=1.173, loss/critic2=1.145, n/ep=0, n/st=1, q_dataset=268.546, rew=66236.97]                          


Epoch #136: test_reward: 3347.473127 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #137: 501it [00:19, 25.78it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.106, env_step=68500, gradient_step=68500, in_dist=4034.710, len=20415, loss/actor=-268.739, loss/alpha=0.129, loss/cql1=0.564, loss/cql2=0.564, loss/critic1=1.614, loss/critic2=1.612, n/ep=0, n/st=1, q_dataset=268.711, rew=66236.97]                          


Epoch #137: test_reward: 3350.850539 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #138: 501it [00:19, 25.80it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.471, env_step=69000, gradient_step=69000, in_dist=4015.013, len=20415, loss/actor=-268.524, loss/alpha=-0.119, loss/cql1=0.585, loss/cql2=0.586, loss/critic1=1.331, loss/critic2=1.320, n/ep=0, n/st=1, q_dataset=268.493, rew=66236.97]                         


Epoch #138: test_reward: 3336.799401 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #139: 501it [00:19, 25.74it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.190, env_step=69500, gradient_step=69500, in_dist=4049.396, len=20415, loss/actor=-269.202, loss/alpha=0.160, loss/cql1=0.555, loss/cql2=0.556, loss/critic1=1.031, loss/critic2=1.008, n/ep=0, n/st=1, q_dataset=269.194, rew=66236.97]                          


Epoch #139: test_reward: 3335.772344 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #140: 501it [00:19, 25.68it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.156, env_step=70000, gradient_step=70000, in_dist=4046.795, len=20415, loss/actor=-269.202, loss/alpha=0.029, loss/cql1=0.559, loss/cql2=0.560, loss/critic1=1.883, loss/critic2=1.864, n/ep=0, n/st=1, q_dataset=269.184, rew=66236.97]                          


Epoch #140: test_reward: 3349.276411 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #141: 501it [00:19, 25.54it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.660, env_step=70500, gradient_step=70500, in_dist=4030.800, len=20415, loss/actor=-268.583, loss/alpha=0.000, loss/cql1=0.570, loss/cql2=0.570, loss/critic1=1.548, loss/critic2=1.527, n/ep=0, n/st=1, q_dataset=268.564, rew=66236.97]                          


Epoch #141: test_reward: 3345.853745 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #142: 501it [00:19, 25.66it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.541, env_step=71000, gradient_step=71000, in_dist=4038.507, len=20415, loss/actor=-269.288, loss/alpha=-0.052, loss/cql1=0.568, loss/cql2=0.568, loss/critic1=1.432, loss/critic2=1.434, n/ep=0, n/st=1, q_dataset=269.266, rew=66236.97]                         


Epoch #142: test_reward: 2012.085017 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #143: 501it [00:19, 25.44it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.791, env_step=71500, gradient_step=71500, in_dist=4049.166, len=20415, loss/actor=-269.571, loss/alpha=0.085, loss/cql1=0.558, loss/cql2=0.558, loss/critic1=1.243, loss/critic2=1.251, n/ep=0, n/st=1, q_dataset=269.541, rew=66236.97]                          


Epoch #143: test_reward: 3363.656178 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #144: 501it [00:19, 25.49it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.368, env_step=72000, gradient_step=72000, in_dist=4034.092, len=20415, loss/actor=-269.661, loss/alpha=-0.084, loss/cql1=0.571, loss/cql2=0.571, loss/critic1=1.413, loss/critic2=1.384, n/ep=0, n/st=1, q_dataset=269.624, rew=66236.97]                         


Epoch #144: test_reward: 3323.144271 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #145: 501it [00:19, 25.62it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.959, env_step=72500, gradient_step=72500, in_dist=4024.355, len=20415, loss/actor=-269.510, loss/alpha=0.011, loss/cql1=0.573, loss/cql2=0.573, loss/critic1=1.022, loss/critic2=1.015, n/ep=0, n/st=1, q_dataset=269.483, rew=66236.97]                          


Epoch #145: test_reward: 3337.946224 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #146: 501it [00:19, 25.73it/s, ac_loss=0.150, alpha=0.021, dataset_in_dist=466.108, env_step=73000, gradient_step=73000, in_dist=4043.920, len=20415, loss/actor=-269.259, loss/alpha=0.079, loss/cql1=0.562, loss/cql2=0.562, loss/critic1=1.029, loss/critic2=1.054, n/ep=0, n/st=1, q_dataset=269.233, rew=66236.97]                         


Epoch #146: test_reward: 3353.111913 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #147: 501it [00:19, 25.32it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.488, env_step=73500, gradient_step=73500, in_dist=4034.475, len=20415, loss/actor=-269.284, loss/alpha=0.079, loss/cql1=0.565, loss/cql2=0.565, loss/critic1=1.401, loss/critic2=1.416, n/ep=0, n/st=1, q_dataset=269.268, rew=66236.97]                          


Epoch #147: test_reward: 3382.421866 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #148: 501it [00:20, 24.69it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.010, env_step=74000, gradient_step=74000, in_dist=4020.122, len=20415, loss/actor=-269.291, loss/alpha=0.077, loss/cql1=0.575, loss/cql2=0.574, loss/critic1=0.599, loss/critic2=0.606, n/ep=0, n/st=1, q_dataset=269.257, rew=66236.97]                          


Epoch #148: test_reward: 3367.400026 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #149: 501it [00:20, 25.03it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.831, env_step=74500, gradient_step=74500, in_dist=4028.932, len=20415, loss/actor=-269.549, loss/alpha=-0.080, loss/cql1=0.574, loss/cql2=0.573, loss/critic1=1.355, loss/critic2=1.326, n/ep=0, n/st=1, q_dataset=269.528, rew=66236.97]                         


Epoch #149: test_reward: 3332.649372 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #150: 501it [00:20, 24.66it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.973, env_step=75000, gradient_step=75000, in_dist=4023.187, len=20415, loss/actor=-269.921, loss/alpha=-0.012, loss/cql1=0.574, loss/cql2=0.575, loss/critic1=1.592, loss/critic2=1.606, n/ep=0, n/st=1, q_dataset=269.897, rew=66236.97]                         


Epoch #150: test_reward: 3392.864274 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #151: 501it [00:20, 24.50it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.014, env_step=75500, gradient_step=75500, in_dist=4041.854, len=20415, loss/actor=-270.054, loss/alpha=0.035, loss/cql1=0.566, loss/cql2=0.567, loss/critic1=1.539, loss/critic2=1.527, n/ep=0, n/st=1, q_dataset=270.018, rew=66236.97]                          


Epoch #151: test_reward: 3351.249146 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #152: 501it [00:20, 24.93it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.659, env_step=76000, gradient_step=76000, in_dist=4031.238, len=20415, loss/actor=-269.796, loss/alpha=0.051, loss/cql1=0.567, loss/cql2=0.568, loss/critic1=1.361, loss/critic2=1.340, n/ep=0, n/st=1, q_dataset=269.779, rew=66236.97]                          


Epoch #152: test_reward: 3335.784753 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #153: 501it [00:20, 25.02it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.868, env_step=76500, gradient_step=76500, in_dist=4008.140, len=20415, loss/actor=-269.876, loss/alpha=-0.098, loss/cql1=0.586, loss/cql2=0.585, loss/critic1=1.181, loss/critic2=1.178, n/ep=0, n/st=1, q_dataset=269.845, rew=66236.97]                         


Epoch #153: test_reward: 3337.527460 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #154: 501it [00:19, 25.45it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.062, env_step=77000, gradient_step=77000, in_dist=4031.212, len=20415, loss/actor=-270.239, loss/alpha=0.060, loss/cql1=0.567, loss/cql2=0.568, loss/critic1=1.269, loss/critic2=1.279, n/ep=0, n/st=1, q_dataset=270.217, rew=66236.97]                          


Epoch #154: test_reward: 3334.528585 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #155: 501it [00:19, 25.38it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.117, env_step=77500, gradient_step=77500, in_dist=4036.715, len=20415, loss/actor=-269.671, loss/alpha=0.064, loss/cql1=0.564, loss/cql2=0.565, loss/critic1=2.011, loss/critic2=1.993, n/ep=0, n/st=1, q_dataset=269.648, rew=66236.97]                          


Epoch #155: test_reward: 3355.360783 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #156: 501it [00:19, 25.21it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.205, env_step=78000, gradient_step=78000, in_dist=4045.782, len=20415, loss/actor=-270.269, loss/alpha=-0.120, loss/cql1=0.566, loss/cql2=0.566, loss/critic1=0.939, loss/critic2=0.967, n/ep=0, n/st=1, q_dataset=270.262, rew=66236.97]                         


Epoch #156: test_reward: 3362.797415 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #157: 501it [00:20, 25.03it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.557, env_step=78500, gradient_step=78500, in_dist=4026.625, len=20415, loss/actor=-270.129, loss/alpha=-0.003, loss/cql1=0.574, loss/cql2=0.573, loss/critic1=0.964, loss/critic2=0.951, n/ep=0, n/st=1, q_dataset=270.095, rew=66236.97]                         


Epoch #157: test_reward: 3318.293037 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #158: 501it [00:19, 25.29it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.255, env_step=79000, gradient_step=79000, in_dist=4042.880, len=20415, loss/actor=-270.256, loss/alpha=-0.049, loss/cql1=0.566, loss/cql2=0.565, loss/critic1=1.334, loss/critic2=1.333, n/ep=0, n/st=1, q_dataset=270.236, rew=66236.97]                         


Epoch #158: test_reward: 3349.431278 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #159: 501it [00:19, 25.54it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.209, env_step=79500, gradient_step=79500, in_dist=4050.054, len=20415, loss/actor=-270.266, loss/alpha=-0.085, loss/cql1=0.558, loss/cql2=0.558, loss/critic1=1.635, loss/critic2=1.593, n/ep=0, n/st=1, q_dataset=270.258, rew=66236.97]                         


Epoch #159: test_reward: 3328.713285 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #160: 501it [00:19, 25.83it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.504, env_step=80000, gradient_step=80000, in_dist=4035.749, len=20415, loss/actor=-270.512, loss/alpha=-0.086, loss/cql1=0.567, loss/cql2=0.568, loss/critic1=1.944, loss/critic2=1.924, n/ep=0, n/st=1, q_dataset=270.505, rew=66236.97]                         


Epoch #160: test_reward: 3388.745959 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #161: 501it [00:19, 25.80it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.576, env_step=80500, gradient_step=80500, in_dist=4026.791, len=20415, loss/actor=-270.712, loss/alpha=-0.087, loss/cql1=0.576, loss/cql2=0.577, loss/critic1=1.869, loss/critic2=1.841, n/ep=0, n/st=1, q_dataset=270.687, rew=66236.97]                         


Epoch #161: test_reward: 3349.483196 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #162: 501it [00:19, 25.80it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.462, env_step=81000, gradient_step=81000, in_dist=4032.607, len=20415, loss/actor=-270.425, loss/alpha=0.011, loss/cql1=0.569, loss/cql2=0.570, loss/critic1=1.478, loss/critic2=1.455, n/ep=0, n/st=1, q_dataset=270.405, rew=66236.97]                          


Epoch #162: test_reward: 3331.969647 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #163: 501it [00:19, 25.93it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.238, env_step=81500, gradient_step=81500, in_dist=4038.113, len=20415, loss/actor=-270.595, loss/alpha=0.085, loss/cql1=0.565, loss/cql2=0.565, loss/critic1=1.022, loss/critic2=1.011, n/ep=0, n/st=1, q_dataset=270.563, rew=66236.97]                          


Epoch #163: test_reward: 3354.525669 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #164: 501it [00:19, 25.88it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.998, env_step=82000, gradient_step=82000, in_dist=4071.786, len=20415, loss/actor=-270.448, loss/alpha=0.130, loss/cql1=0.542, loss/cql2=0.543, loss/critic1=2.348, loss/critic2=2.349, n/ep=0, n/st=1, q_dataset=270.434, rew=66236.97]                          


Epoch #164: test_reward: 3335.216373 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #165: 501it [00:19, 25.43it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.960, env_step=82500, gradient_step=82500, in_dist=4025.840, len=20415, loss/actor=-271.100, loss/alpha=-0.157, loss/cql1=0.578, loss/cql2=0.577, loss/critic1=1.532, loss/critic2=1.495, n/ep=0, n/st=1, q_dataset=271.078, rew=66236.97]                         


Epoch #165: test_reward: 3332.464888 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #166: 501it [00:19, 25.71it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.290, env_step=83000, gradient_step=83000, in_dist=4040.617, len=20415, loss/actor=-270.682, loss/alpha=-0.190, loss/cql1=0.571, loss/cql2=0.571, loss/critic1=1.805, loss/critic2=1.794, n/ep=0, n/st=1, q_dataset=270.658, rew=66236.97]                         


Epoch #166: test_reward: 3333.593396 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #167: 501it [00:19, 25.70it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.498, env_step=83500, gradient_step=83500, in_dist=4019.979, len=20415, loss/actor=-270.702, loss/alpha=0.077, loss/cql1=0.573, loss/cql2=0.573, loss/critic1=1.221, loss/critic2=1.217, n/ep=0, n/st=1, q_dataset=270.672, rew=66236.97]                          


Epoch #167: test_reward: 3364.735835 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #168: 501it [00:19, 25.40it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=467.077, env_step=84000, gradient_step=84000, in_dist=4028.698, len=26128, loss/actor=-270.646, loss/alpha=0.012, loss/cql1=0.572, loss/cql2=0.571, loss/critic1=1.504, loss/critic2=1.520, n/ep=0, n/st=1, q_dataset=270.615, rew=85252.16]                          


Epoch #168: test_reward: 3362.698601 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #169: 501it [00:19, 25.21it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.748, env_step=84500, gradient_step=84500, in_dist=4049.660, len=26128, loss/actor=-270.957, loss/alpha=0.187, loss/cql1=0.560, loss/cql2=0.560, loss/critic1=1.909, loss/critic2=1.854, n/ep=0, n/st=1, q_dataset=270.928, rew=85252.16]                          


Epoch #169: test_reward: 3351.259408 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #170: 501it [00:19, 25.19it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.289, env_step=85000, gradient_step=85000, in_dist=4031.357, len=26128, loss/actor=-270.821, loss/alpha=0.006, loss/cql1=0.572, loss/cql2=0.573, loss/critic1=1.840, loss/critic2=1.840, n/ep=0, n/st=1, q_dataset=270.780, rew=85252.16]                          


Epoch #170: test_reward: 3349.812292 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #171: 501it [00:19, 25.74it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.806, env_step=85500, gradient_step=85500, in_dist=4014.407, len=26128, loss/actor=-270.861, loss/alpha=-0.134, loss/cql1=0.585, loss/cql2=0.587, loss/critic1=0.958, loss/critic2=0.941, n/ep=0, n/st=1, q_dataset=270.822, rew=85252.16]                         


Epoch #171: test_reward: 3373.145789 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #172: 501it [00:19, 25.71it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.804, env_step=86000, gradient_step=86000, in_dist=4015.397, len=26128, loss/actor=-271.175, loss/alpha=-0.178, loss/cql1=0.589, loss/cql2=0.589, loss/critic1=2.092, loss/critic2=2.097, n/ep=0, n/st=1, q_dataset=271.146, rew=85252.16]                         


Epoch #172: test_reward: 2525.776223 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #173: 501it [00:19, 25.69it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.909, env_step=86500, gradient_step=86500, in_dist=4052.094, len=26128, loss/actor=-271.465, loss/alpha=-0.009, loss/cql1=0.560, loss/cql2=0.560, loss/critic1=1.401, loss/critic2=1.400, n/ep=0, n/st=1, q_dataset=271.425, rew=85252.16]                         


Epoch #173: test_reward: 3339.686167 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #174: 501it [00:19, 25.60it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.658, env_step=87000, gradient_step=87000, in_dist=4018.092, len=26128, loss/actor=-271.216, loss/alpha=0.020, loss/cql1=0.575, loss/cql2=0.575, loss/critic1=1.909, loss/critic2=1.903, n/ep=0, n/st=1, q_dataset=271.192, rew=85252.16]                          


Epoch #174: test_reward: 3350.098484 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #175: 501it [00:19, 25.80it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.530, env_step=87500, gradient_step=87500, in_dist=4012.196, len=26128, loss/actor=-271.010, loss/alpha=-0.092, loss/cql1=0.583, loss/cql2=0.584, loss/critic1=1.540, loss/critic2=1.534, n/ep=0, n/st=1, q_dataset=270.989, rew=85252.16]                         


Epoch #175: test_reward: 3336.274415 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #176: 501it [00:19, 25.64it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.404, env_step=88000, gradient_step=88000, in_dist=4038.179, len=26128, loss/actor=-271.459, loss/alpha=-0.072, loss/cql1=0.571, loss/cql2=0.570, loss/critic1=1.352, loss/critic2=1.348, n/ep=0, n/st=1, q_dataset=271.415, rew=85252.16]                         


Epoch #176: test_reward: 3313.308862 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #177: 501it [00:19, 25.94it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.114, env_step=88500, gradient_step=88500, in_dist=4020.372, len=26128, loss/actor=-271.345, loss/alpha=0.012, loss/cql1=0.579, loss/cql2=0.580, loss/critic1=1.682, loss/critic2=1.651, n/ep=0, n/st=1, q_dataset=271.295, rew=85252.16]                          


Epoch #177: test_reward: 3348.974474 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #178: 501it [00:19, 25.82it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.878, env_step=89000, gradient_step=89000, in_dist=4041.064, len=26128, loss/actor=-271.520, loss/alpha=-0.092, loss/cql1=0.570, loss/cql2=0.571, loss/critic1=1.182, loss/critic2=1.195, n/ep=0, n/st=1, q_dataset=271.486, rew=85252.16]                         


Epoch #178: test_reward: 3264.402375 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #179: 501it [00:20, 24.64it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.160, env_step=89500, gradient_step=89500, in_dist=4024.978, len=26128, loss/actor=-272.167, loss/alpha=0.036, loss/cql1=0.574, loss/cql2=0.575, loss/critic1=1.656, loss/critic2=1.660, n/ep=0, n/st=1, q_dataset=272.123, rew=85252.16]                          


Epoch #179: test_reward: 3373.069134 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #180: 501it [00:19, 25.27it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.745, env_step=90000, gradient_step=90000, in_dist=4029.535, len=26128, loss/actor=-271.763, loss/alpha=0.056, loss/cql1=0.573, loss/cql2=0.573, loss/critic1=1.637, loss/critic2=1.624, n/ep=0, n/st=1, q_dataset=271.732, rew=85252.16]                          


Epoch #180: test_reward: 2928.171330 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #181: 501it [00:19, 25.96it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.238, env_step=90500, gradient_step=90500, in_dist=4005.930, len=26128, loss/actor=-271.594, loss/alpha=-0.255, loss/cql1=0.593, loss/cql2=0.593, loss/critic1=1.789, loss/critic2=1.762, n/ep=0, n/st=1, q_dataset=271.557, rew=85252.16]                         


Epoch #181: test_reward: 3378.583695 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #182: 501it [00:19, 25.96it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.717, env_step=91000, gradient_step=91000, in_dist=3994.054, len=26128, loss/actor=-271.788, loss/alpha=-0.057, loss/cql1=0.592, loss/cql2=0.591, loss/critic1=2.143, loss/critic2=2.126, n/ep=0, n/st=1, q_dataset=271.760, rew=85252.16]                         


Epoch #182: test_reward: 3345.556280 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #183: 501it [00:19, 25.93it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.881, env_step=91500, gradient_step=91500, in_dist=4040.379, len=26128, loss/actor=-271.761, loss/alpha=-0.083, loss/cql1=0.569, loss/cql2=0.569, loss/critic1=1.154, loss/critic2=1.183, n/ep=0, n/st=1, q_dataset=271.732, rew=85252.16]                         


Epoch #183: test_reward: 3232.849214 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #184: 501it [00:19, 25.95it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.462, env_step=92000, gradient_step=92000, in_dist=4023.195, len=26128, loss/actor=-272.340, loss/alpha=0.119, loss/cql1=0.568, loss/cql2=0.568, loss/critic1=1.646, loss/critic2=1.643, n/ep=0, n/st=1, q_dataset=272.304, rew=85252.16]                         


Epoch #184: test_reward: 3335.783347 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #185: 501it [00:19, 25.96it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.418, env_step=92500, gradient_step=92500, in_dist=4005.884, len=26128, loss/actor=-271.881, loss/alpha=-0.037, loss/cql1=0.587, loss/cql2=0.587, loss/critic1=1.777, loss/critic2=1.768, n/ep=0, n/st=1, q_dataset=271.839, rew=85252.16]                         


Epoch #185: test_reward: 2587.661881 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #186: 501it [00:19, 25.92it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.584, env_step=93000, gradient_step=93000, in_dist=3997.457, len=26128, loss/actor=-271.846, loss/alpha=-0.091, loss/cql1=0.594, loss/cql2=0.594, loss/critic1=2.494, loss/critic2=2.498, n/ep=0, n/st=1, q_dataset=271.810, rew=85252.16]                         


Epoch #186: test_reward: 1972.661536 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #187: 501it [00:19, 25.97it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.431, env_step=93500, gradient_step=93500, in_dist=3996.046, len=26128, loss/actor=-271.705, loss/alpha=-0.013, loss/cql1=0.591, loss/cql2=0.590, loss/critic1=1.852, loss/critic2=1.848, n/ep=0, n/st=1, q_dataset=271.659, rew=85252.16]                         


Epoch #187: test_reward: 3382.980280 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #188: 501it [00:19, 25.92it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=466.883, env_step=94000, gradient_step=94000, in_dist=4012.008, len=26128, loss/actor=-272.439, loss/alpha=-0.036, loss/cql1=0.580, loss/cql2=0.581, loss/critic1=1.360, loss/critic2=1.318, n/ep=0, n/st=1, q_dataset=272.415, rew=85252.16]                         


Epoch #188: test_reward: 3340.512199 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #189: 501it [00:19, 25.99it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.177, env_step=94500, gradient_step=94500, in_dist=4018.383, len=26128, loss/actor=-272.053, loss/alpha=0.094, loss/cql1=0.577, loss/cql2=0.576, loss/critic1=1.341, loss/critic2=1.345, n/ep=0, n/st=1, q_dataset=272.009, rew=85252.16]                          


Epoch #189: test_reward: 3359.968185 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #190: 501it [00:19, 25.91it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.792, env_step=95000, gradient_step=95000, in_dist=4038.528, len=26128, loss/actor=-272.170, loss/alpha=0.053, loss/cql1=0.562, loss/cql2=0.563, loss/critic1=1.723, loss/critic2=1.710, n/ep=0, n/st=1, q_dataset=272.133, rew=85252.16]                          


Epoch #190: test_reward: 3340.366484 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #191: 501it [00:19, 25.29it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.137, env_step=95500, gradient_step=95500, in_dist=4017.260, len=26128, loss/actor=-271.838, loss/alpha=0.052, loss/cql1=0.575, loss/cql2=0.575, loss/critic1=1.839, loss/critic2=1.855, n/ep=0, n/st=1, q_dataset=271.799, rew=85252.16]                          


Epoch #191: test_reward: 3340.485508 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #192: 501it [00:19, 25.22it/s, ac_loss=0.146, alpha=0.021, dataset_in_dist=467.025, env_step=96000, gradient_step=96000, in_dist=4014.873, len=26128, loss/actor=-271.632, loss/alpha=-0.157, loss/cql1=0.584, loss/cql2=0.585, loss/critic1=1.825, loss/critic2=1.820, n/ep=0, n/st=1, q_dataset=271.610, rew=85252.16]                         


Epoch #192: test_reward: 3344.545177 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #193: 501it [00:20, 24.88it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.779, env_step=96500, gradient_step=96500, in_dist=4029.365, len=26128, loss/actor=-271.816, loss/alpha=-0.025, loss/cql1=0.573, loss/cql2=0.573, loss/critic1=1.871, loss/critic2=1.887, n/ep=0, n/st=1, q_dataset=271.789, rew=85252.16]                         


Epoch #193: test_reward: 3348.020798 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #194: 501it [00:19, 25.70it/s, ac_loss=0.148, alpha=0.021, dataset_in_dist=466.700, env_step=97000, gradient_step=97000, in_dist=4047.584, len=26128, loss/actor=-272.462, loss/alpha=0.002, loss/cql1=0.560, loss/cql2=0.560, loss/critic1=1.534, loss/critic2=1.513, n/ep=0, n/st=1, q_dataset=272.433, rew=85252.16]                          


Epoch #194: test_reward: 3376.930135 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #195: 501it [00:19, 25.78it/s, ac_loss=0.145, alpha=0.021, dataset_in_dist=467.665, env_step=97500, gradient_step=97500, in_dist=4041.127, len=26128, loss/actor=-271.613, loss/alpha=-0.102, loss/cql1=0.570, loss/cql2=0.571, loss/critic1=1.681, loss/critic2=1.689, n/ep=0, n/st=1, q_dataset=271.570, rew=85252.16]                         


Epoch #195: test_reward: 3340.413967 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #196: 501it [00:19, 25.85it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.701, env_step=98000, gradient_step=98000, in_dist=4024.804, len=26128, loss/actor=-272.637, loss/alpha=0.040, loss/cql1=0.572, loss/cql2=0.572, loss/critic1=1.856, loss/critic2=1.794, n/ep=0, n/st=1, q_dataset=272.604, rew=85252.16]                          


Epoch #196: test_reward: 3371.509502 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #197: 501it [00:19, 25.99it/s, ac_loss=0.144, alpha=0.021, dataset_in_dist=467.476, env_step=98500, gradient_step=98500, in_dist=4017.366, len=26128, loss/actor=-272.428, loss/alpha=0.037, loss/cql1=0.579, loss/cql2=0.578, loss/critic1=2.071, loss/critic2=2.055, n/ep=0, n/st=1, q_dataset=272.388, rew=85252.16]                          


Epoch #197: test_reward: 3337.762505 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #198: 501it [00:19, 25.98it/s, ac_loss=0.149, alpha=0.021, dataset_in_dist=466.299, env_step=99000, gradient_step=99000, in_dist=4044.593, len=26128, loss/actor=-272.210, loss/alpha=0.027, loss/cql1=0.564, loss/cql2=0.563, loss/critic1=1.336, loss/critic2=1.314, n/ep=0, n/st=1, q_dataset=272.172, rew=85252.16]                          


Epoch #198: test_reward: 3370.706546 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #199: 501it [00:19, 26.04it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.691, env_step=99500, gradient_step=99500, in_dist=4034.594, len=26128, loss/actor=-272.369, loss/alpha=0.079, loss/cql1=0.567, loss/cql2=0.567, loss/critic1=1.417, loss/critic2=1.428, n/ep=0, n/st=1, q_dataset=272.333, rew=85252.16]                          


Epoch #199: test_reward: 3364.718145 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93


Epoch #200: 501it [00:19, 25.94it/s, ac_loss=0.147, alpha=0.021, dataset_in_dist=466.922, env_step=100000, gradient_step=100000, in_dist=4006.097, len=26128, loss/actor=-272.560, loss/alpha=-0.058, loss/cql1=0.590, loss/cql2=0.589, loss/critic1=1.979, loss/critic2=1.972, n/ep=0, n/st=1, q_dataset=272.519, rew=85252.16]                         


Epoch #200: test_reward: 3358.332979 ± 0.000000, best_reward: 3495.840031 ± 0.000000 in #93
