In [15]:
from collections import defaultdict
from matplotlib import pyplot as plt
import torch
from torch import nn
import torchrl
import torchrl.envs as torch_envs
from tqdm import tqdm
import gymnasium as gym
import tensordict
from tensordict import nn as dict_nn
import torchsummary

from spaceship_env import SpaceshipEnv

In [16]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
device = torch.device("cuda")
lr = 3e-4
max_grad_norm = 1.0

frames_per_batch = 10000
total_frames = 100_000_000

sub_batch_size = 512
num_epochs = 10
clip_epsilon = 0.2
gamma = 0.99
lmbda = 0.95
entropy_eps = 1e-4

In [18]:
def make_norm_transforms(env: gym.Env):
    transforms = []
    for key, space in env.observation_space.items():
        if key in ["position", "target", "velocity", "rotation"]:
            transforms.append(torch_envs.transforms.ObservationNorm(loc=space.low, scale=1 / (space.high-space.low), in_keys=key, out_keys=key, standard_normal=False))
    return torch_envs.transforms.Compose(*transforms)
        

In [19]:
make_norm_transforms(SpaceshipEnv())

Compose(
        ObservationNorm(keys=['position']),
        ObservationNorm(keys=['rotation']),
        ObservationNorm(keys=['target']),
        ObservationNorm(keys=['velocity']))

In [20]:
env = SpaceshipEnv()
for _ in range(5):
    print(env.step(2))

reward: -0.0426319411239956
({'position': array([100.  ,  99.84]), 'target': array([500, 500]), 'velocity': array([ 0.  , -0.16]), 'rotation': array([0, 1, 0, 0]), 'step_count': 1}, np.float64(-0.0426319411239956), False, False, None)
reward: -0.06528196247338515
({'position': array([100.  ,  99.52]), 'target': array([500, 500]), 'velocity': array([ 0.  , -0.32]), 'rotation': array([0, 1, 0, 0]), 'step_count': 2}, np.float64(-0.06528196247338515), False, False, None)
reward: -0.08796356324530175
({'position': array([100.  ,  99.04]), 'target': array([500, 500]), 'velocity': array([ 0.  , -0.48]), 'rotation': array([0, 1, 0, 0]), 'step_count': 3}, np.float64(-0.08796356324530175), False, False, None)
reward: -0.11069014535506155
({'position': array([100. ,  98.4]), 'target': array([500, 500]), 'velocity': array([ 0.  , -0.64]), 'rotation': array([0, 1, 0, 0]), 'step_count': 4}, np.float64(-0.11069014535506155), False, False, None)
reward: -0.1334749734704635
({'position': array([100. , 

In [21]:
gym.register('Spaceship_Target', entry_point="spaceship_env:SpaceshipEnv")

env = torch_envs.GymEnv('Spaceship_Target', device=device)
print(env.observation_spec.keys())
env = torch_envs.transforms.TransformedEnv(base_env=env, 
                                             transform=torch_envs.Compose([
                                                 make_norm_transforms(env),
                                                 torch_envs.transforms.CatTensors(["position", "target", "velocity", "rotation"], "observation")
                                                 ]))

logged_env = torch_envs.GymEnv('Spaceship_Target', device=torch.device('cuda'), return_pixels=True)
logged_env = torch_envs.transforms.TransformedEnv(base_env=logged_env, 
                                             transform=torch_envs.Compose([
                                                 make_norm_transforms(env),
                                                 torch_envs.transforms.CatTensors(["position", "target", "velocity", "rotation"], "observation")
                                                 ]))

_CompositeSpecKeysView(keys=['position', 'rotation', 'step_count', 'target', 'velocity'])


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [22]:
logged_env.observation_spec.keys()

_CompositeSpecKeysView(keys=['step_count', 'pixels', 'observation'])

In [23]:
env.rollout(3)['observation']

reward: 0.009248833446946526
reward: 0.014910308187103833
reward: 0.020515055219362557


tensor([[7.1429e-02, 1.0000e-01, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         2.4214e-01, 2.0600e-01, 0.0000e+00, 0.0000e+00],
        [7.1543e-02, 1.0000e-01, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         2.4214e-01, 2.0600e-01, 8.0000e-04, 0.0000e+00],
        [7.1629e-02, 1.0016e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
         2.4214e-01, 2.0600e-01, 6.0000e-04, 8.0000e-04]], device='cuda:0')

In [24]:
from torchrl.modules.tensordict_module import ProbabilisticActor

actor = nn.Sequential(
    nn.Linear(env.observation_spec['observation'].shape[0], 128, device=device),
    nn.Tanh(),
    nn.LazyLinear(128, device=device),
    nn.Tanh(),
    nn.LazyLinear(128, device=device),
    nn.Tanh(),
    nn.LazyLinear(3, device=device),
)

policy_module = dict_nn.TensorDictModule(actor, in_keys=["observation"], out_keys=["logits"])
policy_module = ProbabilisticActor(module=policy_module,
                                    spec=env.action_spec,
                                    in_keys=["logits"],
                                    distribution_class=torch.distributions.OneHotCategorical,
                                    return_log_prob=True)

In [25]:
import torchsummary
torchsummary.summary(actor, (10,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 128]           1,408
              Tanh-2                  [-1, 128]               0
            Linear-3                  [-1, 128]          16,512
              Tanh-4                  [-1, 128]               0
            Linear-5                  [-1, 128]          16,512
              Tanh-6                  [-1, 128]               0
            Linear-7                    [-1, 3]             387
Total params: 34,819
Trainable params: 34,819
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.13
Estimated Total Size (MB): 0.14
----------------------------------------------------------------


In [26]:
from torchrl.modules import ValueOperator

value_net = nn.Sequential(
    nn.Linear(env.observation_spec['observation'].shape[0], 128, device=device),
    nn.Tanh(),
    nn.LazyLinear(128, device=device),
    nn.Tanh(),
    nn.LazyLinear(1, device=device),
)

value_module = ValueOperator(value_net, in_keys=["observation"])

In [27]:
print(policy_module(env.reset())['action'])
print(value_module(env.reset()))

tensor([1., 0., 0.], device='cuda:0')
TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.bool, is_shared=True),
        observation: Tensor(shape=torch.Size([10]), device=cuda:0, dtype=torch.float32, is_shared=True),
        state_value: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.float32, is_shared=True),
        step_count: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.float64, is_shared=True),
        terminated: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.bool, is_shared=True),
        truncated: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.bool, is_shared=True)},
    batch_size=torch.Size([]),
    device=cuda:0,
    is_shared=True)


In [28]:
out = env.rollout(1000, lambda x, y: torch.tensor([1, 0, 0], dtype=torch.long))
print(out['observation'])
sum(out['done']), len(out['observation'])

reward: 0.04430362183039811
reward: 0.057193123048842984
reward: 0.07643068616187222
reward: 0.09562928622762686
reward: 0.11478151906683633
reward: 0.13387992991967937
reward: 0.15291700539135664
reward: 0.1718851653326314
reward: 0.19077675464697874
reward: 0.16003244695190474
reward: 0.1292447791582419
reward: 0.09841207867194654
reward: 0.06753272075801156
reward: 0.03660513156531642
reward: 0.0056277910457806675
reward: -0.025400764242123806
reward: -0.05648193845506942
reward: -0.08761707392091969
reward: -0.11880744873779107
reward: -0.1500542745242712
reward: -0.1722797533163239
reward: -0.19432719205952062
reward: -0.2162181423940297
reward: -0.23797420947956208
reward: -0.25961702454274616
reward: -0.27186372440194206
reward: -0.284055195895221
reward: -0.2962021312357168
reward: -0.30831518906041605
reward: -0.320404987378357
reward: -0.33248209665417755
reward: -0.28876223443149035
reward: -0.24504519551490248
reward: -0.2013360360967929
reward: -0.1576398062532751
reward: 

(tensor([0], device='cuda:0'), 1000)

In [29]:
for row in out['observation']:
    print([round(float(num), 3) for num in row])

[0.072, 0.1, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.002]
[0.072, 0.101, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.002]
[0.072, 0.102, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.003]
[0.072, 0.102, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.004]
[0.072, 0.103, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.005]
[0.072, 0.104, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.006]
[0.072, 0.106, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.006]
[0.072, 0.107, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.007]
[0.072, 0.109, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.008]
[0.072, 0.111, 0.0, 0.0, 0.0, 1.0, 0.836, 0.911, 0.0, 0.009]
[0.072, 0.112, 0.0, 0.0, 1.0, 0.0, 0.836, 0.911, -0.001, 0.009]
[0.071, 0.114, 0.0, 0.0, 1.0, 0.0, 0.836, 0.911, -0.002, 0.008]
[0.071, 0.116, 0.0, 0.0, 1.0, 0.0, 0.836, 0.911, -0.002, 0.008]
[0.071, 0.117, 0.0, 0.0, 1.0, 0.0, 0.836, 0.911, -0.003, 0.008]
[0.07, 0.119, 0.0, 0.0, 1.0, 0.0, 0.836, 0.911, -0.004, 0.008]
[0.069, 0.12, 0.0, 0.0, 1.0, 0.0, 0.836, 0.911, -0.005, 0.008]
[0.069, 0.

In [30]:
import torch.nn.functional as F
def simple_rollout(env, action_index, steps):
    td = env.reset()
    results = []
    
    # Get the number of classes from the environment spec
    # usually env.action_spec.shape[-1] for OneHot specs
    n_actions = env.action_spec.shape[-1]
    
    # Create the One-Hot tensor: e.g., 2 -> [0, 0, 1]
    action_one_hot = F.one_hot(torch.tensor(action_index), n_actions)
    
    for _ in range(steps):
        td['action'] = action_one_hot
        td = env.step(td)
        
        # Note: To print the index (2) instead of the vector, use argmax
        print(f"Action: {td['action'].argmax().item()} | Reward: {td[('next', 'reward')].item()}")
        
        results.append(td.clone())
        td = env.step_mdp(td)
    return torch.stack(results)

simple_rollout(env, torch.tensor(2), 100)

reward: -0.19451681362173245
Action: 2 | Reward: -0.19451680779457092
reward: -0.14955124265228717
Action: 2 | Reward: -0.14955124258995056
reward: -0.11823247448029399
Action: 2 | Reward: -0.11823247373104095
reward: -0.08693478564953208
Action: 2 | Reward: -0.08693478256464005
reward: -0.055659447467867704
Action: 2 | Reward: -0.055659446865320206
reward: -0.024407757775091608
Action: 2 | Reward: -0.024407757446169853
reward: 0.006818956679603135
Action: 2 | Reward: 0.006818956695497036
reward: 0.03801933779029158
Action: 2 | Reward: 0.0380193367600441
reward: 0.06919199356903412
Action: 2 | Reward: 0.06919199228286743
reward: 0.10033549553422912
Action: 2 | Reward: 0.100335493683815
reward: 0.07629793074831649
Action: 2 | Reward: 0.07629793137311935
reward: 0.05218473028072483
Action: 2 | Reward: 0.052184730768203735
reward: 0.027989750631468708
Action: 2 | Reward: 0.02798975072801113
reward: 0.0037069197863581933
Action: 2 | Reward: 0.00370691972784698
reward: -0.0206697536240263
A

  action_one_hot = F.one_hot(torch.tensor(action_index), n_actions)


TensorDict(
    fields={
        action: Tensor(shape=torch.Size([100, 3]), device=cuda:0, dtype=torch.int64, is_shared=True),
        done: Tensor(shape=torch.Size([100, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([100, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                observation: Tensor(shape=torch.Size([100, 10]), device=cuda:0, dtype=torch.float32, is_shared=True),
                reward: Tensor(shape=torch.Size([100, 1]), device=cuda:0, dtype=torch.float32, is_shared=True),
                step_count: Tensor(shape=torch.Size([100, 1]), device=cuda:0, dtype=torch.float64, is_shared=True),
                terminated: Tensor(shape=torch.Size([100, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                truncated: Tensor(shape=torch.Size([100, 1]), device=cuda:0, dtype=torch.bool, is_shared=True)},
            batch_size=torch.Size([100]),
      

In [31]:
from torchrl.objectives.value import GAE
from torchrl.objectives import ClipPPOLoss

advantage_module = GAE(gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True, device=device)
loss_module = ClipPPOLoss(actor_network=policy_module, critic_network=value_module, clip_epsilon=clip_epsilon, entropy_coeff=entropy_eps)
optim = torch.optim.Adam(loss_module.parameters(), lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, total_frames // frames_per_batch, 0.0)

In [32]:
from torchrl.collectors import SyncDataCollector
from torchrl.data import ReplayBuffer, LazyTensorStorage, SamplerWithoutReplacement

collector = SyncDataCollector(env, policy_module, frames_per_batch=frames_per_batch, total_frames=total_frames, device=device)
replay_buffer = ReplayBuffer(storage=LazyTensorStorage(max_size=frames_per_batch), sampler=SamplerWithoutReplacement())

In [33]:
from torchrl.record import VideoRecorder
from torchrl.record.loggers.csv import CSVLogger

logger = CSVLogger(exp_name="Spaceship_Target", log_dir="target1_videos", video_format="mp4")
logged_env = torch_envs.transforms.TransformedEnv(logged_env, VideoRecorder(logger, tag="run_video", in_keys=['pixels']))  # should just use render

  instance: EnvBase = super(_EnvPostInit, self).__call__(*args, **kwargs)


In [34]:
out = logged_env.rollout(1000, lambda x, y, z: torch.tensor([0, 1, 0]))
logged_env.transform[-1].dump()

reward: 0.011011064811942958
reward: 0.04202045772474977
reward: 0.07302691894156198
reward: 0.10402917923356521
reward: 0.1350259559347811
reward: 0.1660159488586629
reward: 0.19699783611248015
reward: 0.22797026978453525
reward: 0.2589318714780749
reward: 0.28988122766432955
reward: 0.3208168848254094
reward: 0.35173734435578574
reward: 0.38264105718876773
reward: 0.413526418111704
reward: 0.44439175973057266
reward: 0.47523534604111406
reward: 0.5060553655596659
reward: 0.5368499239623202
reward: 0.5676170361758597
reward: 0.5983546178580827
reward: 0.6290604761984844
reward: 0.6597322999627308
reward: 0.6903676486958186
reward: 0.7209639409891139
reward: 0.7515184417054384
reward: 0.7820282480438505
reward: 0.8124902743114943
reward: 0.8429012352536618
reward: 0.8732576277746533
reward: 0.9035557108608724
reward: 0.9337914834933654
reward: 0.9639606603092798
reward: 0.9940586447399054
reward: 1.0240804993164034
reward: 1.0540209127922762
reward: 1.083874163683141
reward: 1.11363407



In [35]:
type(env.render())

numpy.ndarray

In [36]:
for data in collector:
    print(data)
    break

reward: 0.10093666586849667
reward: 0.07209362769872017
reward: 0.04319244328979216
reward: 0.014228418589964962
reward: -0.014803036818845993
reward: -0.04390639721811265
reward: -0.07308600860362625
reward: -0.10234607694901808
reward: -0.13169065693880255
reward: -0.15908802581807108
reward: -0.18630187495506045
reward: -0.2133624287678654
reward: -0.22942709365499867
reward: -0.2454174770120929
reward: -0.26134899818351665
reward: -0.27723702465280925
reward: -0.2930968503256611
reward: -0.30894367432205694
reward: -0.32479258027791497
reward: -0.34065851615562925
reward: -0.3000283820387308
reward: -0.25943990569661024
reward: -0.21890568548894046
reward: -0.17843836893782647
reward: -1.1497414718566885
reward: 0.007912998974170988
reward: 0.03581969245361179
reward: 0.0637153321876824
reward: 0.0915951392237094
reward: 0.11945429091654793
reward: 0.14728790773296965
reward: 0.17509103978869225
reward: 0.20285865305405773
reward: 0.2305856151621137
reward: 0.2582666807501515
rewar

In [None]:
from tensordict.nn import set_interaction_type, InteractionType

logs = defaultdict(list)
pbar = tqdm(total=total_frames)
eval_str = ""

for i, tensordict_data in enumerate(collector):
    for _ in range(num_epochs):
        advantage_module(tensordict_data)
        data_view = tensordict_data.reshape(-1)
        replay_buffer.extend(data_view.cpu()) # not exactly sure why cpu
        for _ in range(frames_per_batch // sub_batch_size):
            subdata = replay_buffer.sample(sub_batch_size)
            loss_vals = loss_module(subdata.to(device))
            loss_value = loss_vals["loss_objective"] + loss_vals["loss_critic"] + loss_vals["loss_entropy"]
            loss_value.backward()
            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
            optim.step()
            optim.zero_grad()
            
        logs["reward"].append(tensordict_data["next", "reward"].mean().item())
        pbar.update(tensordict_data.numel())
        
    cum_reward_str = (
        f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
    )
    logs["step_count"].append(tensordict_data["step_count"].max().item())
    stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    
    if i % 5 == 0:
        with set_interaction_type(InteractionType.DETERMINISTIC), torch.no_grad():  # magic
            eval_rollout = logged_env.rollout(600, policy_module)
            logged_env.transform[-1].dump()
            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(
                eval_rollout["next", "reward"].sum().item()
            )
            logs["eval step_count"].append(eval_rollout["step_count"].max().item())
            eval_str = (
                f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
                f"eval step-count: {logs['eval step_count'][-1]}"
            )
            del eval_rollout
            
    pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))
    scheduler.step()


  0%|          | 0/100000000 [00:00<?, ?it/s]

reward: 0.5481806423995538
reward: 0.5605021663857764
reward: 0.5728124967109194
reward: 0.5851013500879881
reward: 0.5973583216542865
reward: 0.6095728764104265
reward: 0.6217343404473175
reward: 0.6338318919616008
reward: 0.6458545520604637
reward: 0.6577911753573421
reward: 0.6788864347601409
reward: 0.7000463398834276
reward: 0.7212644578139135
reward: 0.7425343140168843
reward: 0.7638493840693358
reward: 0.7852030851035177
reward: 0.8065887669401798
reward: 0.8279997028894515
reward: 0.8494290801958208
reward: 0.8708699901021525
reward: 0.8923154175060363
reward: 0.8579779900319732
reward: 0.8236220024213811
reward: 0.7892335291666583
reward: 0.7547984664464262
reward: 0.7203025569514101
reward: 0.6857314157647306
reward: 0.6510705572543287
reward: 0.6163054229295915
reward: 0.5814214102083614
reward: 0.5464039020344295
reward: 0.5112382972793649
reward: 0.5365130325359461
reward: 0.5616924715056067
reward: 0.5867711249169277
reward: 0.6117433388512286
reward: 0.6366032843683822
r

  0%|          | 100000/100000000 [00:12<1:25:08, 19555.93it/s]

reward: 0.009342180354889946
reward: 0.03866806132471853
reward: 0.06796521622935564
reward: 0.09722089072562552
reward: 0.12642185732207603
reward: 0.15555426035629563
reward: 0.18460344816777807
reward: 0.21355378880230028
reward: 0.24238846506494918
reward: 0.27108924407605633
reward: 0.29963621564853904
reward: 0.3280074927594987
reward: 0.3561788660863451
reward: 0.38412340295859776
reward: 0.4118109790656327
reward: 0.43920772876331643
reward: 0.4662753967199061
reward: 0.4929705697853294
reward: 0.5192437631735237
reward: 0.5450383290875392
reward: 0.5702891485147447
reward: 0.594921057743694
reward: 0.6188469498202306
reward: 0.6419654772411671
reward: 0.6641582652432145
reward: 0.6852865247113046
reward: 0.7051869298440949
reward: 0.723666598602053
reward: 0.7404969849342331
reward: 0.7554064639915476
reward: 0.7680713714685242
reward: 0.7781052581206319
reward: 0.7850461623129839
reward: 0.788341824794505
reward: 0.7873330326900789
reward: 0.781235781139027
reward: 0.76912382

eval cumulative reward: -99.9337 (init: -99.9337), eval step-count: 102.0, average reward=-0.0638 (init=-0.0638), step count (max): 890.0, lr policy:  0.0003:   0%|          | 100000/100000000 [00:13<1:25:08, 19555.93it/s]

reward: -0.7502472417202063
reward: -0.7895610023750501
reward: -0.8285980804670945
reward: -0.8673541080598512
reward: -0.9058260344427205
reward: -0.9440120677709204
reward: -0.9819116096862028
reward: -1.0195251843183126
reward: -1.05685436301681
reward: -1.0939016860908832
reward: -1.13067058274576
reward: -1.167165290302716
reward: -1.1642345331584651
reward: -1.1603903464970413
reward: -1.1557370721233753
reward: -1.1503732714150292
reward: -1.1443920952099516
reward: -1.1378816434034877
reward: -1.1309253111393702
reward: -1.1236021194040813
reward: -1.11598702857177
reward: -1.1081512340287418
reward: -1.1001624434593982
reward: -1.092085135722652
reward: -1.0839974878795524
reward: -1.0759378207779455
reward: -1.0279585543549847
reward: -0.9799745225778094
reward: -0.9319874817571987
reward: -0.8839991098344256
reward: -0.8359991703677957
reward: -0.787999228797083
reward: -0.739999285308091
reward: -0.6919993400738321
reward: -0.6439993932559488
reward: -0.595999445005989
rew

eval cumulative reward: -99.9337 (init: -99.9337), eval step-count: 102.0, average reward= 0.0458 (init=-0.0638), step count (max): 1206.0, lr policy:  0.0003:   0%|          | 200000/100000000 [00:26<1:27:54, 18921.58it/s]

reward: -0.09798293571531376
reward: -0.1027211942896971
reward: -0.10701732065562868
reward: -0.11092585115978232
reward: -0.11450125874524794
reward: -0.11779791957728741
reward: -0.12087008460958247
reward: -0.1237718552895217
reward: -0.12655716260426467
reward: -0.1292797486763044
reward: -0.12848441607090452
reward: -0.15795410883818303
reward: -0.18742758133704002
reward: -0.21690974883444816
reward: -0.24640547370334462
reward: -0.2776586288908998
reward: -0.3089209812562523
reward: -0.3401934924667903
reward: -0.3714770897307293
reward: -0.4027726630625855
reward: -0.4340810627735963
reward: -0.46540309719856865
reward: -0.4508490627971887
reward: -0.4363826503105364
reward: -0.42202198073999286
reward: -0.4077850168955728
reward: -0.39368959314775703
reward: -0.379753443740495
reward: -0.36599422970456497
reward: -0.352429564403605
reward: -0.339077037738049
reward: -0.32595423902492526
reward: -0.3130787785640419
reward: -0.3004683078934948
reward: -0.25616968697048625
rewar

eval cumulative reward: -99.9337 (init: -99.9337), eval step-count: 102.0, average reward= 0.0090 (init=-0.0638), step count (max): 816.0, lr policy:  0.0003:   0%|          | 300000/100000000 [00:39<1:25:41, 19389.42it/s] 

reward: -0.5269630348835913
reward: -0.5533217965882138
reward: -0.5795668294625271
reward: -1.6057044640625109
reward: 0.009610188738018003
reward: 0.03921698176525426
reward: 0.06881782251074949
reward: 0.09841013869508315
reward: 0.12799133568511412
reward: 0.15755878974495105
reward: 0.18710984115164714
reward: 0.21664178714315246
reward: 0.2461518746648612
reward: 0.2756372928796482
reward: 0.3050951654045868
reward: 0.30991549704121596
reward: 0.3147879079263342
reward: 0.3196940676218198
reward: 0.3246154735309125
reward: 0.32953344084696407
reward: 0.3344290921976647
reward: 0.3392833470435297
reward: 0.3440769108930067
reward: 0.3487902644005708
reward: 0.3534036524186541
reward: 0.3578970730792177
reward: 0.38420827760949816
reward: 0.41058765644896894
reward: 0.43702883489729105
reward: 0.4635254511991438
reward: 0.49007114168068955
reward: 0.5166595255077454
reward: 0.5432841889962692
reward: 0.5699386694009553
reward: 0.5966164381023471
reward: 0.6233108831068331
reward: 0

eval cumulative reward: -99.9337 (init: -99.9337), eval step-count: 102.0, average reward= 0.0076 (init=-0.0638), step count (max): 619.0, lr policy:  0.0003:   0%|          | 400000/100000000 [00:51<1:25:07, 19498.88it/s]

reward: -0.16643128694271753
reward: -0.1775609875093782
reward: -0.18881061072207814
reward: -0.20024107282943426
reward: -0.21191169199960655
reward: -0.22387999548200618
reward: -0.23620153079615486
reward: -0.24892968030530047
reward: -0.2621154789264776
reward: -0.21995145619881715
reward: -0.19105991562573188
reward: -0.16237926568639874
reward: -0.1339237041660504
reward: -0.10570858535844876
reward: -0.07775052917559266
reward: -0.050067541562504925
reward: -0.022679147447325736
reward: 0.004393462416917862
reward: 0.031127269227514274
reward: 0.05749724689014969
reward: 0.027062052021409756
reward: -0.003909603927228111
reward: -0.035426633920764894
reward: -0.06749436426984758
reward: -0.10011442486971193
reward: -0.13328469029994489
reward: -0.166999274893082
reward: -0.2012485835614019
reward: -0.23601941870871274
reward: -0.2712951420056023
reward: -0.30705588823990576
reward: -0.34327882694150014
reward: -0.37993846609469983
reward: -0.36059823967510996
reward: -0.3418133

eval cumulative reward: -99.9337 (init: -99.9337), eval step-count: 102.0, average reward=-0.0140 (init=-0.0638), step count (max): 901.0, lr policy:  0.0003:   0%|          | 500000/100000000 [01:04<1:24:39, 19589.04it/s]

reward: 0.4883316201202409
reward: 0.446756943072828
reward: 0.4051963799929319
reward: 0.36362426598569614
reward: 0.3220142991694226
reward: 0.33674264373088175
reward: 0.3513566161767416
reward: 0.36581769685339144
reward: 0.38008604639872295
reward: 0.3941203472776158
reward: 0.40787763689882195
reward: 0.42131313211554033
reward: 0.43438004501397326
reward: 0.4470293900334714
reward: 0.4592097826465952
reward: 0.4708672300669532
reward: 0.425805282122227
reward: 0.3802518862819028
reward: 0.340561427484795
reward: 0.3068564299185051
reward: 0.2728212684604657
reward: 0.2384532795240262
reward: 0.20375145266241423
reward: 0.16871649391252244
reward: 0.13335087201025334
reward: 0.09765884584478275
reward: 0.06164647181920198
reward: 0.02532159014331455
reward: 0.001076565947631395
reward: -0.022092675666379926
reward: -0.04426904717441256
reward: -0.06553876496286762
reward: -0.0859907785250785
reward: -0.10571622430578681
reward: -0.12480790654443179
reward: -0.14335980623872996
re

eval cumulative reward: -99.9337 (init: -99.9337), eval step-count: 102.0, average reward=-0.0140 (init=-0.0638), step count (max): 901.0, lr policy:  0.0003:   1%|          | 600000/100000000 [01:16<1:25:25, 19394.66it/s]

reward: 0.008208933832150024
reward: 0.03641458110176363
reward: 0.0646144712538128
reward: 0.09280612470440132
reward: 0.12098704904561618
reward: 0.1491547352161585
reward: 0.17730665362754827
reward: 0.20544025023552462
reward: 0.23355294254604783
reward: 0.2616421155450571
reward: 0.28970511754083667
reward: 0.3177392559075026
reward: 0.3457417927177325
reward: 0.3737099402524228
reward: 0.4016408563744741
reward: 0.42953163975335973
reward: 0.45737932492654343
reward: 0.48518087718315417
reward: 0.5129331872546107
reward: 0.5406330657961126
reward: 0.5682772376420574
reward: 0.5958623358175286
reward: 0.6233848952869896
reward: 0.6508413464202463
reward: 0.6782280081545545
reward: 0.7055410808304976
reward: 0.7327766386778807
reward: 0.759930621926422
reward: 0.7869988285144337
reward: 0.8139769053669704
reward: 0.8408603392130924
reward: 0.8676444469099032
reward: 0.8943243652389059
reward: 0.9208950401379296
reward: 0.9473512153294311
reward: 0.9736874203033437
reward: 0.9998979

eval cumulative reward:  141.1579 (init: -99.9337), eval step-count: 125.0, average reward=-0.0342 (init=-0.0638), step count (max): 939.0, lr policy:  0.0003:   1%|          | 600000/100000000 [01:18<1:25:25, 19394.66it/s]

reward: 0.3905984208904597
reward: 0.35648554344531264
reward: 0.3018526084774122
reward: 0.24791854867276678
reward: 0.1946999570118199
reward: 0.142210905447862
reward: 0.09046287978161857
reward: 0.03946473613844122
reward: -0.010777321017656272
reward: -0.06025973971753712
reward: -0.10898160084852675
reward: -0.1569445789919895
reward: -0.20415288491905387
reward: -0.23603946298485898
reward: -0.2684606348223836
reward: -0.3014135378714224
reward: -0.33489413404006574
reward: -0.36889721761253663
reward: -0.40341642923646537
reward: -0.438444276022983
reward: -0.47397215773069584
reward: -0.5099903989405401
reward: -0.5464882870637842
reward: -0.5834541159611889
reward: -0.5650518261817566
reward: -0.546841435484391
reward: -0.5288354143365874
reward: -0.5110462660118787
reward: -0.493486545319756
reward: -0.4761688764026153
reward: -0.45910596964039113
reward: -0.44231063769572243
reward: -0.42579581072497813
reward: -0.4095745507731574
reward: -0.39366006536356635
reward: -0.378

eval cumulative reward:  141.1579 (init: -99.9337), eval step-count: 125.0, average reward= 0.0592 (init=-0.0638), step count (max): 764.0, lr policy:  0.0003:   1%|          | 700000/100000000 [01:31<1:28:29, 18700.70it/s]

reward: -0.6004449304382969
reward: -0.578665727665124
reward: -0.5569883862875185
reward: -0.5401223013079058
reward: -0.523362024234635
reward: -0.5067241112787289
reward: -0.4902249654269478
reward: -0.5236138969081233
reward: -0.5570059563119428
reward: -0.5904004947974405
reward: -0.6237968706185513
reward: -0.6571944512989485
reward: -0.6905926156948254
reward: -0.7239907559354622
reward: -0.757388279232887
reward: -0.7907846095534448
reward: -0.824179189145589
reward: -0.8575714799197085
reward: -0.858802218986582
reward: -0.8598901428248963
reward: -0.8608612307583454
reward: -0.8617407163366188
reward: -0.8625531099333773
reward: -0.8633222210638368
reward: -0.8640711802676344
reward: -0.864822460425835
reward: -0.8655978974016869
reward: -0.8664187099132533
reward: -0.8673055185625689
reward: -0.896093376293476
reward: -0.92484761298059
reward: -0.9535741262616957
reward: -0.9822786896862578
reward: -1.0109669411427893
reward: -1.0396443723278133
reward: -1.0683163192512757
r

KeyboardInterrupt: 

eval cumulative reward:  141.1579 (init: -99.9337), eval step-count: 125.0, average reward= 0.0592 (init=-0.0638), step count (max): 764.0, lr policy:  0.0003:   1%|          | 700000/100000000 [01:50<1:28:29, 18700.70it/s]

: 

In [None]:
plt.figure(figsize=(10, 10))
plt.subplot(2, 2, 1)
plt.plot(logs["reward"])
plt.title("training rewards (average)")
plt.subplot(2, 2, 2)
plt.plot(logs["step_count"])
plt.title("Max step count (training)")
plt.subplot(2, 2, 3)
plt.plot(logs["eval reward (sum)"])
plt.title("Return (test)")
plt.subplot(2, 2, 4)
plt.plot(logs["eval step_count"])
plt.title("Max step count (test)")
plt.show()