In [1]:
import torch
import gym
from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.sampler import LocalSampler
from garage.torch.algos import TRPO

# from garage.torch.policies import GaussianMLPPolicy
from policies.gaussian_mlp_policy import GaussianMLPPolicy

from garage.torch.value_functions import GaussianMLPValueFunction
from garage.trainer import Trainer

from TRPO_DRSOM import TRPO_DRSOM

In [None]:
@wrap_experiment(log_dir='drsom_test/momentum-opt-NPG')
def trpo_pendulum(ctxt=None, seed=1):
    """Train a Gaussian MLP policy with TRPO_DRSOM on MountainCarContinuous-v0.

    The function keeps its historical name (it also appears as the prefix of
    every log line), although the environment currently in use is
    MountainCarContinuous-v0 rather than Pendulum-v0.

    Args:
        ctxt: Experiment context injected by ``wrap_experiment`` and handed
            to ``Trainer``.
        seed (int): Value passed to ``set_seed`` before anything else is
            constructed.
    """
    set_seed(seed)

    env = GymEnv('MountainCarContinuous-v0')
    # env = GymEnv('Pendulum-v0')

    trainer = Trainer(ctxt)

    # Policy and value function use the same architecture:
    # two hidden layers of 32 units with tanh, and a linear output.
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[32, 32],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    # Episodes are capped at the length declared by the environment spec.
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length)

    # NOTE(review): the recorded run of this notebook ends with NaN policy
    # parameters and NaN returns; the cause is not visible in this cell and
    # should be investigated in TRPO_DRSOM / its optimizer.
    algo = TRPO_DRSOM(env_spec=env.spec,
                      policy=policy,
                      value_function=value_function,
                      sampler=sampler,
                      discount=0.99,
                      center_adv=False)

    trainer.setup(algo, env)
    trainer.train(n_epochs=200, batch_size=1024)

In [3]:
# Launch the experiment twice. range() only accepts integers; the previous
# float argument (2.) raised TypeError before any run could start.
# NOTE(review): both runs use the same seed (1234); vary it per iteration if
# independent repetitions are intended.
for _ in range(2):
    trpo_pendulum(seed=1234)

2022-09-02 00:54:50 | [trpo_pendulum] Logging to drsom_test/momentum-opt-NPG




2022-09-02 00:54:51 | [trpo_pendulum] Obtaining samples...
old policy para is
tensor([ 0.0000, -0.0881,  0.1847,  ..., -0.2844,  0.1172,  0.0000])
----------------------------------
params now is:
tensor([ 0.0000, -0.0881,  0.1847,  ..., -0.2844,  0.1172,  0.0000])
----------------------------------
g norm is:
tensor(0.7056)
-------------------
G is:
tensor([[2.2476e+00, 1.6441e-08],
        [1.6441e-08, 1.0483e-15]])
-------------------
alpha is: 
tensor([-5.0000e-01, -3.6575e-09])
-------------------
decent step is:
tensor([ 0.0197, -0.0448,  0.0171,  ...,  0.0346,  0.1038,  1.1073])
--------------------
loss before is:
tensor(8.6540, grad_fn=<NegBackward>)
--------------------
loss now is:
tensor(9.3054, grad_fn=<NegBackward>)
--------------------
2022-09-02 00:54:52 | [trpo_pendulum] epoch #0 | Saving snapshot...
2022-09-02 00:54:52 | [trpo_pendulum] epoch #0 | Saved
2022-09-02 00:54:52 | [trpo_pendulum] epoch #0 | Time 1.22 s
2022-09-02 00:54:52 | [trpo_pendulum] epoch #0 | EpochT

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:05 | [trpo_pendulum] epoch #35 | Saving snapshot...
2022-09-02 00:59:05 | [trpo_pendulum] epoch #35 | Saved
2022-09-02 00:59:05 | [trpo_pendulum] epoch #35 | Time 39.11 s
2022-09-02 00:59:05 | [trpo_pendulum] epoch #35 | EpochTime 1.10 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   35
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       71928
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:06 | [trpo_pendulum] epoch #36 | Saving snapshot...
2022-09-02 00:59:06 | [trpo_pendulum] epoch #36 | Saved
2022-09-02 00:59:06 | [trpo_pendulum] epoch #36 | Time 40.37 s
2022-09-02 00:59:06 | [trpo_pendulum] epoch #36 | EpochTime 1.25 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   36
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       73926
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:07 | [trpo_pendulum] epoch #37 | Saving snapshot...
2022-09-02 00:59:08 | [trpo_pendulum] epoch #37 | Saved
2022-09-02 00:59:08 | [trpo_pendulum] epoch #37 | Time 41.44 s
2022-09-02 00:59:08 | [trpo_pendulum] epoch #37 | EpochTime 1.06 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   37
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       75924
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:09 | [trpo_pendulum] epoch #38 | Saving snapshot...
2022-09-02 00:59:09 | [trpo_pendulum] epoch #38 | Saved
2022-09-02 00:59:09 | [trpo_pendulum] epoch #38 | Time 42.55 s
2022-09-02 00:59:09 | [trpo_pendulum] epoch #38 | EpochTime 1.11 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   38
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       77922
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:10 | [trpo_pendulum] epoch #39 | Saving snapshot...
2022-09-02 00:59:10 | [trpo_pendulum] epoch #39 | Saved
2022-09-02 00:59:10 | [trpo_pendulum] epoch #39 | Time 43.64 s
2022-09-02 00:59:10 | [trpo_pendulum] epoch #39 | EpochTime 1.08 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   39
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       79920
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:11 | [trpo_pendulum] epoch #40 | Saving snapshot...
2022-09-02 00:59:11 | [trpo_pendulum] epoch #40 | Saved
2022-09-02 00:59:11 | [trpo_pendulum] epoch #40 | Time 44.80 s
2022-09-02 00:59:11 | [trpo_pendulum] epoch #40 | EpochTime 1.16 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   40
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       81918
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:12 | [trpo_pendulum] epoch #41 | Saving snapshot...
2022-09-02 00:59:12 | [trpo_pendulum] epoch #41 | Saved
2022-09-02 00:59:12 | [trpo_pendulum] epoch #41 | Time 45.92 s
2022-09-02 00:59:12 | [trpo_pendulum] epoch #41 | EpochTime 1.11 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   41
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       83916
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:13 | [trpo_pendulum] epoch #42 | Saving snapshot...
2022-09-02 00:59:13 | [trpo_pendulum] epoch #42 | Saved
2022-09-02 00:59:13 | [trpo_pendulum] epoch #42 | Time 47.07 s
2022-09-02 00:59:13 | [trpo_pendulum] epoch #42 | EpochTime 1.14 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   42
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       85914
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:14 | [trpo_pendulum] epoch #43 | Saving snapshot...
2022-09-02 00:59:14 | [trpo_pendulum] epoch #43 | Saved
2022-09-02 00:59:14 | [trpo_pendulum] epoch #43 | Time 48.21 s
2022-09-02 00:59:14 | [trpo_pendulum] epoch #43 | EpochTime 1.14 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   43
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       87912
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:15 | [trpo_pendulum] epoch #44 | Saving snapshot...
2022-09-02 00:59:15 | [trpo_pendulum] epoch #44 | Saved
2022-09-02 00:59:15 | [trpo_pendulum] epoch #44 | Time 49.27 s
2022-09-02 00:59:15 | [trpo_pendulum] epoch #44 | EpochTime 1.05 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   44
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       89910
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:16 | [trpo_pendulum] epoch #45 | Saving snapshot...
2022-09-02 00:59:16 | [trpo_pendulum] epoch #45 | Saved
2022-09-02 00:59:16 | [trpo_pendulum] epoch #45 | Time 50.38 s
2022-09-02 00:59:16 | [trpo_pendulum] epoch #45 | EpochTime 1.11 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   45
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       91908
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:17 | [trpo_pendulum] epoch #46 | Saving snapshot...
2022-09-02 00:59:18 | [trpo_pendulum] epoch #46 | Saved
2022-09-02 00:59:18 | [trpo_pendulum] epoch #46 | Time 51.41 s
2022-09-02 00:59:18 | [trpo_pendulum] epoch #46 | EpochTime 1.03 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   46
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       93906
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:19 | [trpo_pendulum] epoch #47 | Saving snapshot...
2022-09-02 00:59:19 | [trpo_pendulum] epoch #47 | Saved
2022-09-02 00:59:19 | [trpo_pendulum] epoch #47 | Time 52.52 s
2022-09-02 00:59:19 | [trpo_pendulum] epoch #47 | EpochTime 1.10 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   47
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       95904
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:20 | [trpo_pendulum] epoch #48 | Saving snapshot...
2022-09-02 00:59:20 | [trpo_pendulum] epoch #48 | Saved
2022-09-02 00:59:20 | [trpo_pendulum] epoch #48 | Time 53.70 s
2022-09-02 00:59:20 | [trpo_pendulum] epoch #48 | EpochTime 1.17 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   48
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       97902
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:21 | [trpo_pendulum] epoch #49 | Saving snapshot...
2022-09-02 00:59:21 | [trpo_pendulum] epoch #49 | Saved
2022-09-02 00:59:21 | [trpo_pendulum] epoch #49 | Time 54.91 s
2022-09-02 00:59:21 | [trpo_pendulum] epoch #49 | EpochTime 1.21 s
----------------------------------  -----
Evaluation/AverageDiscountedReturn    nan
Evaluation/AverageReturn              nan
Evaluation/Iteration                   49
Evaluation/MaxReturn                  nan
Evaluation/MinReturn                  nan
Evaluation/NumEpisodes                  2
Evaluation/StdReturn                  nan
Evaluation/TerminationRate              0
TotalEnvSteps                       99900
----------------------------------  -----
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [nan, nan]])

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:22 | [trpo_pendulum] epoch #50 | Saving snapshot...
2022-09-02 00:59:22 | [trpo_pendulum] epoch #50 | Saved
2022-09-02 00:59:22 | [trpo_pendulum] epoch #50 | Time 56.09 s
2022-09-02 00:59:22 | [trpo_pendulum] epoch #50 | EpochTime 1.18 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    50
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       101898
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:23 | [trpo_pendulum] epoch #51 | Saving snapshot...
2022-09-02 00:59:23 | [trpo_pendulum] epoch #51 | Saved
2022-09-02 00:59:23 | [trpo_pendulum] epoch #51 | Time 57.16 s
2022-09-02 00:59:23 | [trpo_pendulum] epoch #51 | EpochTime 1.07 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    51
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       103896
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:24 | [trpo_pendulum] epoch #52 | Saving snapshot...
2022-09-02 00:59:24 | [trpo_pendulum] epoch #52 | Saved
2022-09-02 00:59:24 | [trpo_pendulum] epoch #52 | Time 58.36 s
2022-09-02 00:59:24 | [trpo_pendulum] epoch #52 | EpochTime 1.19 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    52
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       105894
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:25 | [trpo_pendulum] epoch #53 | Saving snapshot...
2022-09-02 00:59:25 | [trpo_pendulum] epoch #53 | Saved
2022-09-02 00:59:25 | [trpo_pendulum] epoch #53 | Time 59.40 s
2022-09-02 00:59:25 | [trpo_pendulum] epoch #53 | EpochTime 1.04 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    53
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       107892
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:27 | [trpo_pendulum] epoch #54 | Saving snapshot...
2022-09-02 00:59:27 | [trpo_pendulum] epoch #54 | Saved
2022-09-02 00:59:27 | [trpo_pendulum] epoch #54 | Time 60.57 s
2022-09-02 00:59:27 | [trpo_pendulum] epoch #54 | EpochTime 1.16 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    54
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       109890
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:28 | [trpo_pendulum] epoch #55 | Saving snapshot...
2022-09-02 00:59:28 | [trpo_pendulum] epoch #55 | Saved
2022-09-02 00:59:28 | [trpo_pendulum] epoch #55 | Time 61.62 s
2022-09-02 00:59:28 | [trpo_pendulum] epoch #55 | EpochTime 1.05 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    55
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       111888
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:29 | [trpo_pendulum] epoch #56 | Saving snapshot...
2022-09-02 00:59:29 | [trpo_pendulum] epoch #56 | Saved
2022-09-02 00:59:29 | [trpo_pendulum] epoch #56 | Time 62.69 s
2022-09-02 00:59:29 | [trpo_pendulum] epoch #56 | EpochTime 1.07 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    56
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       113886
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.


2022-09-02 00:59:30 | [trpo_pendulum] epoch #57 | Saving snapshot...
2022-09-02 00:59:30 | [trpo_pendulum] epoch #57 | Saved
2022-09-02 00:59:30 | [trpo_pendulum] epoch #57 | Time 63.81 s
2022-09-02 00:59:30 | [trpo_pendulum] epoch #57 | EpochTime 1.11 s
----------------------------------  ------
Evaluation/AverageDiscountedReturn     nan
Evaluation/AverageReturn               nan
Evaluation/Iteration                    57
Evaluation/MaxReturn                   nan
Evaluation/MinReturn                   nan
Evaluation/NumEpisodes                   2
Evaluation/StdReturn                   nan
Evaluation/TerminationRate               0
TotalEnvSteps                       115884
----------------------------------  ------
old policy para is
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
params now is:
tensor([nan, nan, nan,  ..., nan, nan, nan])
----------------------------------
g norm is:
tensor(nan)
-------------------
G is:
tensor([[nan, nan],
        [

KeyboardInterrupt: 