In [1]:
# Imports
from stable_baselines3 import PPO, DQN
import stable_baselines3.ppo.policies as ppo
import stable_baselines3.dqn.policies as dqn
from stable_baselines3.common.env_util import make_vec_env
import gym
import numpy as np
import random 
from gym.envs.box2d.lunar_lander import heuristic as lunar_heuristic
from stable_baselines3.common.monitor import Monitor
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.vec_env import DummyVecEnv
from imitation.algorithms import bc
from stable_baselines3.common.evaluation import evaluate_policy
from parameter_initialization_deep_rl.common.helpers import create_folder
from stable_baselines3.common.callbacks import EvalCallback
from torch.nn.modules.activation import Sigmoid, Tanh, ReLU, LeakyReLU
from torch import nn
import torch as th

In [2]:
def heuristic_naive(obs: np.ndarray):
    """Pick a discrete action from a hand-written, partly random rule set.

    Only the first row of ``obs`` is inspected, so this is meant for a
    vectorised environment holding a single env.

    Args:
        obs: Batched observation; ``obs[0]`` must be indexable at positions
            1, 3 and 4 (which positions are read depends on the branch taken).
            NOTE(review): presumably these are height, vertical speed and
            angle of LunarLander -- confirm against the env documentation.

    Returns:
        Array of shape (1,) holding the chosen action (0, 1, 2 or 3).
    """
    state = obs[0]

    def _choose() -> int:
        # Below the 0.3 threshold on state[1]: decide on state[3] alone.
        if state[1] < 0.3:
            return 2 if state[3] < -0.05 else 0

        # Otherwise exactly one random number is drawn per call; with
        # probability 0.4 (and negative state[3]) action 2 is chosen.
        draw = random.random()
        if draw < 0.4 and state[3] < 0:
            return 2

        # Fall back to a dead-band rule on state[4]; inside the band do nothing.
        if state[4] < -0.1:
            return 1
        if state[4] > 0.1:
            return 3
        return 0

    return np.full((1), _choose())

def heuristic_expert(o: np.ndarray):
    """Return the action chosen by gym's built-in LunarLander heuristic.

    Args:
        o: Batched observation for a single environment, shape (1, 8);
            only ``o[0]`` is passed to the heuristic.

    Returns:
        Array of shape (1,) holding the heuristic's action.
    """
    # The heuristic needs an environment object as its first argument. The
    # original built a brand-new LunarLander env on every call, i.e. once per
    # environment step of a rollout; build it once and reuse it instead.
    # NOTE(review): this assumes the heuristic only reads configuration from
    # the env and does not step or mutate it -- confirm against the gym source.
    env = getattr(heuristic_expert, "_env", None)
    if env is None:
        env = gym.make("LunarLander-v2")
        heuristic_expert._env = env
    return np.full((1), lunar_heuristic(env, o[0]))

In [3]:
# One trial is run per entry, in this order.
# NOTE(review): 38 is listed twice, so two trials use the same seed and the
# same f"{log_dir}/{seed}" folder name -- confirm whether that is intended.
seeds = [
    95, 39, 1, 38, 83,
    38, 36, 50, 14, 100,
    67, 34, 68, 47, 70,
    57, 64, 98, 93, 25,
]

In [4]:
# Root folder for this experiment's outputs: each trial uses f"{log_dir}/{seed}"
# and the pre-training scores are saved to f"{log_dir}/zero_scores".
log_dir = "./logs/dqn/bc_naive"

In [5]:
# Hyperparameter
# NOTE(review): the next cell assigns `hyperparameter` again, so when the
# notebook is run top to bottom this dict is replaced before anything reads it.
# Keys such as n_steps / gae_lambda / n_epochs / ent_coef look like settings
# for an on-policy algorithm rather than for the DQN student -- presumably a
# leftover; confirm and remove if so.
hyperparameter = {
    "n_steps": 16,
    "gae_lambda": 0.98,
    "gamma": 0.99,
    "n_epochs": 4,
    "ent_coef": 0.0,
}

In [6]:
# Keyword arguments that are unpacked into the DQN constructor
# (`DQN(..., **hyperparameter)`) for every trial.
hyperparameter = {
    "learning_rate": 6.3e-4,
    "batch_size": 128,
    "buffer_size": 50000,
    "learning_starts": 0,
    "gamma": 0.99,
    "target_update_interval": 250,
    "train_freq": 4,
    "gradient_steps": -1,
    "exploration_fraction": 0.12,
    "exploration_final_eps": 0.1,
}

In [None]:
# Settings for the trials below (same values as the former inline literals).
N_DEMO_EPISODES = 200      # episodes collected from the scripted demonstrator
N_BC_EPOCHS = 2            # behavior-cloning epochs
N_EVAL_EPISODES = 20       # episodes per evaluation (pre-training and callback)
EVAL_FREQ = 3750           # forwarded to EvalCallback as eval_freq
TOTAL_TIMESTEPS = 150_000  # integer form of the former float literal 1.5e5


def _linear_layers(module):
    """Return every nn.Linear inside `module`, in `module.modules()` order."""
    return [layer for layer in module.modules() if isinstance(layer, nn.Linear)]


def _copy_linear_parameters(target_layers, source_layers):
    """Copy weight and bias of each source layer into the matching target layer.

    Layers are paired by position. A plain zip() would silently stop at the
    shorter list and leave the remaining target layers at their random
    initialisation, so count and shapes are validated before anything is copied.

    Raises:
        ValueError: if the number of layers or any weight shape differs.
    """
    if len(target_layers) != len(source_layers):
        raise ValueError(
            f"Cannot copy parameters: {len(source_layers)} source layers "
            f"vs {len(target_layers)} target layers"
        )
    for index, (target, source) in enumerate(zip(target_layers, source_layers)):
        if target.weight.shape != source.weight.shape:
            raise ValueError(
                f"Cannot copy parameters: layer {index} has shape "
                f"{tuple(source.weight.shape)} in the source but "
                f"{tuple(target.weight.shape)} in the target"
            )
    with th.no_grad():
        for target, source in zip(target_layers, source_layers):
            target.weight.copy_(source.weight)
            target.bias.copy_(source.bias)


# Running sum of the mean pre-training reward over all trials
reward_sum = 0

for seed in seeds:
    # heuristic_naive draws from Python's global `random`; seed it so the
    # collected demonstrations are reproducible for this trial.
    random.seed(seed)

    # Create environments and set seed
    env_train = gym.make("LunarLander-v2")
    env_train.seed(seed)
    env_test = Monitor(gym.make("LunarLander-v2"))
    env_test.seed(seed)

    # Collect rollouts using the naive scripted heuristic as demonstrator
    rollouts = rollout.rollout(
        heuristic_naive,
        DummyVecEnv([lambda: RolloutInfoWrapper(env_train)]),
        rollout.make_sample_until(min_timesteps=None, min_episodes=N_DEMO_EPISODES),
    )
    # Flatten the trajectories to obtain individual transitions
    transitions = rollout.flatten_trajectories(rollouts)

    # Set up student
    student = DQN(
        policy=dqn.MlpPolicy,
        env=env_train,
        policy_kwargs=dict(net_arch=[32, 32]),
        seed=seed,
        verbose=0,
        **hyperparameter,
    )
    # Get the linear layers of the student's q_net
    student_policy = student.policy
    student_layers = _linear_layers(student.q_net)

    # Set up behavior cloning agent.
    # NOTE(review): no policy is passed, so BC uses the library default. If that
    # default uses a different activation function or layer layout than the DQN
    # Q-network, the copied weights will not reproduce the cloned behaviour --
    # confirm both networks match.
    bc_trainer = bc.BC(
        observation_space=env_train.observation_space,
        action_space=env_train.action_space,
        demonstrations=transitions,
    )
    # Pretrain the policy with behavior cloning
    bc_trainer.train(n_epochs=N_BC_EPOCHS)

    # Get network parameters of the cloned policy after training
    policy = bc_trainer.policy
    expert_shared_layers = _linear_layers(policy.mlp_extractor)
    expert_output_layers = _linear_layers(policy.action_net)
    expert_layers = expert_shared_layers + expert_output_layers

    # Copy the cloned parameters into the student's q_net, then sync the target net
    _copy_linear_parameters(student_layers, expert_layers)
    student_policy.q_net_target.load_state_dict(student_policy.q_net.state_dict())

    # Test performance before training (after pretraining)
    reward_before_training, _ = evaluate_policy(student.policy, env_test, N_EVAL_EPISODES)
    reward_sum += reward_before_training
    print(f"0 Timesteps: {reward_before_training}")

    # Create log folder
    trial_log_dir = create_folder(f"{log_dir}/{seed}")
    # Create eval callback
    eval_callback = EvalCallback(
        env_test,
        log_path=trial_log_dir,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
        deterministic=True,
        render=False,
    )
    # Train the pretrained policy using the regular learning algorithm
    student.learn(
        total_timesteps=TOTAL_TIMESTEPS,
        callback=eval_callback,
    )

    # Release the environments of this trial before the next one is created
    env_train.close()
    env_test.close()

# Persist the summed pre-training reward of all trials
np.savez(f"{log_dir}/zero_scores", scores=[reward_sum])

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.39     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


494batch [00:01, 396.81batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000849 |
|    entropy        | 0.849     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 55.4      |
|    loss           | 1.16      |
|    neglogp        | 1.16      |
|    prob_true_act  | 0.454     |
|    samples_so_far | 16032     |
---------------------------------


934batch [00:02, 391.93batch/s]
974batch [00:02, 387.61batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000865 |
|    entropy        | 0.865     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 69.6      |
|    loss           | 0.878     |
|    neglogp        | 0.879     |
|    prob_true_act  | 0.494     |
|    samples_so_far | 32032     |
---------------------------------


1494batch [00:03, 395.30batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 1500     |
|    ent_loss       | -0.00073 |
|    entropy        | 0.73     |
|    epoch          | 1        |
|    l2_loss        | 0        |
|    l2_norm        | 84.4     |
|    loss           | 0.77     |
|    neglogp        | 0.77     |
|    prob_true_act  | 0.569    |
|    samples_so_far | 48032    |
--------------------------------


1894batch [00:04, 393.34batch/s]
1898batch [00:04, 388.80batch/s][A


0 Timesteps: -118.13965919999998
Eval num_timesteps=3750, episode_reward=-199.18 +/- 135.36
Episode length: 268.50 +/- 130.92
New best mean reward!
Eval num_timesteps=7500, episode_reward=-176.71 +/- 110.25
Episode length: 260.20 +/- 150.94
New best mean reward!
Eval num_timesteps=11250, episode_reward=-154.02 +/- 81.63
Episode length: 555.55 +/- 342.85
New best mean reward!
Eval num_timesteps=15000, episode_reward=-61.68 +/- 36.38
Episode length: 929.90 +/- 210.45
New best mean reward!
Eval num_timesteps=18750, episode_reward=-58.81 +/- 20.23
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=22500, episode_reward=-53.25 +/- 19.77
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=26250, episode_reward=-34.78 +/- 23.70
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=30000, episode_reward=-45.57 +/- 32.07
Episode length: 958.90 +/- 179.15
Eval num_timesteps=33750, episode_reward=-59.27 +/- 62.06
Episode length: 994

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.38     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


469batch [00:01, 376.52batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000903 |
|    entropy        | 0.903     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 53.4      |
|    loss           | 0.883     |
|    neglogp        | 0.884     |
|    prob_true_act  | 0.483     |
|    samples_so_far | 16032     |
---------------------------------


897batch [00:02, 380.56batch/s]
975batch [00:02, 375.45batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000856 |
|    entropy        | 0.856     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 65.9      |
|    loss           | 0.632     |
|    neglogp        | 0.633     |
|    prob_true_act  | 0.556     |
|    samples_so_far | 32032     |
---------------------------------


1475batch [00:03, 379.79batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000846 |
|    entropy        | 0.846     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 80.3      |
|    loss           | 0.672     |
|    neglogp        | 0.673     |
|    prob_true_act  | 0.561     |
|    samples_so_far | 48032     |
---------------------------------


1819batch [00:04, 379.77batch/s]
1838batch [00:04, 372.49batch/s][A


0 Timesteps: -153.27564109999997
Eval num_timesteps=3750, episode_reward=-126.34 +/- 17.98
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=7500, episode_reward=-114.53 +/- 169.10
Episode length: 683.70 +/- 283.26
New best mean reward!
Eval num_timesteps=11250, episode_reward=-127.64 +/- 78.26
Episode length: 354.55 +/- 174.77
Eval num_timesteps=15000, episode_reward=-163.98 +/- 52.18
Episode length: 288.15 +/- 71.55
Eval num_timesteps=18750, episode_reward=-181.32 +/- 33.66
Episode length: 435.15 +/- 125.15
Eval num_timesteps=22500, episode_reward=-131.85 +/- 63.68
Episode length: 777.25 +/- 274.58
Eval num_timesteps=26250, episode_reward=-34.28 +/- 23.25
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=30000, episode_reward=-57.44 +/- 87.72
Episode length: 670.75 +/- 283.50
Eval num_timesteps=33750, episode_reward=-27.86 +/- 33.23
Episode length: 979.85 +/- 87.83
New best mean reward!
Eval num_timesteps=37500, episode_reward=-24.88 +/

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.39     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


471batch [00:01, 377.39batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000873 |
|    entropy        | 0.873     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 54.3      |
|    loss           | 1.11      |
|    neglogp        | 1.11      |
|    prob_true_act  | 0.446     |
|    samples_so_far | 16032     |
---------------------------------


872batch [00:02, 339.80batch/s]
986batch [00:02, 363.40batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000847 |
|    entropy        | 0.847     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 67.8      |
|    loss           | 0.833     |
|    neglogp        | 0.834     |
|    prob_true_act  | 0.541     |
|    samples_so_far | 32032     |
---------------------------------


1489batch [00:04, 380.19batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000912 |
|    entropy        | 0.912     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 81.5      |
|    loss           | 0.991     |
|    neglogp        | 0.991     |
|    prob_true_act  | 0.458     |
|    samples_so_far | 48032     |
---------------------------------


1794batch [00:04, 365.72batch/s]
1816batch [00:05, 359.80batch/s][A


0 Timesteps: -442.88972805000003
Eval num_timesteps=3750, episode_reward=-139.11 +/- 27.72
Episode length: 979.55 +/- 89.14
New best mean reward!
Eval num_timesteps=7500, episode_reward=-131.15 +/- 80.79
Episode length: 412.70 +/- 101.32
New best mean reward!
Eval num_timesteps=11250, episode_reward=-162.83 +/- 63.71
Episode length: 301.40 +/- 105.31
Eval num_timesteps=15000, episode_reward=-224.83 +/- 116.57
Episode length: 598.50 +/- 287.80
Eval num_timesteps=18750, episode_reward=-82.74 +/- 33.46
Episode length: 772.55 +/- 348.63
New best mean reward!
Eval num_timesteps=22500, episode_reward=-58.80 +/- 25.14
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=26250, episode_reward=-85.14 +/- 32.95
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-112.42 +/- 41.37
Episode length: 972.05 +/- 121.83
Eval num_timesteps=33750, episode_reward=-71.64 +/- 18.82
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=37500, episode_reward=-42.86 +/-

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.39     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


489batch [00:01, 374.69batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000901 |
|    entropy        | 0.901     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 54.8      |
|    loss           | 0.857     |
|    neglogp        | 0.858     |
|    prob_true_act  | 0.479     |
|    samples_so_far | 16032     |
---------------------------------


900batch [00:02, 366.83batch/s]
975batch [00:02, 366.52batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000823 |
|    entropy        | 0.823     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 65.6      |
|    loss           | 0.861     |
|    neglogp        | 0.862     |
|    prob_true_act  | 0.526     |
|    samples_so_far | 32032     |
---------------------------------


1499batch [00:04, 364.16batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000842 |
|    entropy        | 0.842     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 77.8      |
|    loss           | 0.669     |
|    neglogp        | 0.67      |
|    prob_true_act  | 0.562     |
|    samples_so_far | 48032     |
---------------------------------


1801batch [00:04, 372.44batch/s]
1838batch [00:05, 367.06batch/s][A


0 Timesteps: -235.84904410000004
Eval num_timesteps=3750, episode_reward=-113.08 +/- 34.58
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=7500, episode_reward=-180.02 +/- 32.24
Episode length: 586.20 +/- 130.82
Eval num_timesteps=11250, episode_reward=-137.13 +/- 43.02
Episode length: 544.45 +/- 179.57
Eval num_timesteps=15000, episode_reward=-29.45 +/- 22.50
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=18750, episode_reward=-36.62 +/- 25.56
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=22500, episode_reward=-59.38 +/- 24.84
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=26250, episode_reward=-51.66 +/- 24.47
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-69.98 +/- 19.77
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=33750, episode_reward=-34.16 +/- 64.67
Episode length: 972.85 +/- 118.34
Eval num_timesteps=37500, episode_reward=-23.10 +/- 25.36
Episode length: 1000.00 +/- 0.00
New best 

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.38     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


491batch [00:01, 367.05batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 500      |
|    ent_loss       | -0.001   |
|    entropy        | 1        |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 54.1     |
|    loss           | 0.891    |
|    neglogp        | 0.892    |
|    prob_true_act  | 0.46     |
|    samples_so_far | 16032    |
--------------------------------


936batch [00:02, 367.49batch/s]
973batch [00:02, 361.54batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000942 |
|    entropy        | 0.942     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 66.8      |
|    loss           | 0.803     |
|    neglogp        | 0.804     |
|    prob_true_act  | 0.499     |
|    samples_so_far | 32032     |
---------------------------------


1499batch [00:04, 375.09batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000842 |
|    entropy        | 0.842     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 83.4      |
|    loss           | 0.679     |
|    neglogp        | 0.679     |
|    prob_true_act  | 0.561     |
|    samples_so_far | 48032     |
---------------------------------


1916batch [00:05, 373.93batch/s]
1930batch [00:05, 361.38batch/s][A


0 Timesteps: -276.1607797
Eval num_timesteps=3750, episode_reward=-101.85 +/- 23.69
Episode length: 967.65 +/- 141.01
New best mean reward!
Eval num_timesteps=7500, episode_reward=-145.27 +/- 67.00
Episode length: 580.15 +/- 232.24
Eval num_timesteps=11250, episode_reward=-138.03 +/- 52.64
Episode length: 379.45 +/- 171.47
Eval num_timesteps=15000, episode_reward=-124.91 +/- 71.03
Episode length: 527.00 +/- 221.28
Eval num_timesteps=18750, episode_reward=-54.66 +/- 37.16
Episode length: 958.00 +/- 183.07
New best mean reward!
Eval num_timesteps=22500, episode_reward=-14.10 +/- 18.40
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=26250, episode_reward=-13.84 +/- 33.38
Episode length: 958.90 +/- 179.15
New best mean reward!
Eval num_timesteps=30000, episode_reward=-0.90 +/- 19.55
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=33750, episode_reward=-4.37 +/- 16.60
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=37500, episode_rewar

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.39     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.249    |
|    samples_so_far | 32       |
--------------------------------


496batch [00:01, 368.12batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000925 |
|    entropy        | 0.925     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 54.1      |
|    loss           | 0.759     |
|    neglogp        | 0.76      |
|    prob_true_act  | 0.517     |
|    samples_so_far | 16032     |
---------------------------------


872batch [00:02, 370.13batch/s]
986batch [00:02, 368.54batch/s][A

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 1000     |
|    ent_loss       | -0.00088 |
|    entropy        | 0.88     |
|    epoch          | 1        |
|    l2_loss        | 0        |
|    l2_norm        | 66.2     |
|    loss           | 0.728    |
|    neglogp        | 0.729    |
|    prob_true_act  | 0.541    |
|    samples_so_far | 32032    |
--------------------------------


1479batch [00:04, 371.63batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000804 |
|    entropy        | 0.804     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 79.3      |
|    loss           | 0.82      |
|    neglogp        | 0.821     |
|    prob_true_act  | 0.554     |
|    samples_so_far | 48032     |
---------------------------------


1783batch [00:04, 374.95batch/s]
1798batch [00:04, 364.85batch/s][A


0 Timesteps: -184.1612621
Eval num_timesteps=3750, episode_reward=-237.36 +/- 28.08
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=7500, episode_reward=-195.58 +/- 81.22
Episode length: 687.40 +/- 219.06
New best mean reward!
Eval num_timesteps=11250, episode_reward=-160.61 +/- 44.93
Episode length: 309.30 +/- 52.82
New best mean reward!
Eval num_timesteps=15000, episode_reward=-93.26 +/- 26.33
Episode length: 970.80 +/- 127.28
New best mean reward!
Eval num_timesteps=18750, episode_reward=-29.77 +/- 18.04
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=22500, episode_reward=-29.08 +/- 20.64
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=26250, episode_reward=-28.18 +/- 25.92
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=30000, episode_reward=-14.44 +/- 20.04
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=33750, episode_reward=-17.28 +/- 18.90
Episode l

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.39     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


467batch [00:01, 364.46batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000934 |
|    entropy        | 0.934     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 54.5      |
|    loss           | 0.995     |
|    neglogp        | 0.996     |
|    prob_true_act  | 0.464     |
|    samples_so_far | 16032     |
---------------------------------


912batch [00:02, 367.88batch/s]
987batch [00:02, 365.29batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000856 |
|    entropy        | 0.856     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 68.5      |
|    loss           | 0.719     |
|    neglogp        | 0.72      |
|    prob_true_act  | 0.564     |
|    samples_so_far | 32032     |
---------------------------------


1472batch [00:04, 368.91batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000755 |
|    entropy        | 0.755     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 85.6      |
|    loss           | 0.761     |
|    neglogp        | 0.761     |
|    prob_true_act  | 0.569     |
|    samples_so_far | 48032     |
---------------------------------


1847batch [00:05, 371.33batch/s]
1862batch [00:05, 360.25batch/s][A


0 Timesteps: -263.1966156
Eval num_timesteps=3750, episode_reward=-191.05 +/- 240.30
Episode length: 876.50 +/- 214.85
New best mean reward!
Eval num_timesteps=7500, episode_reward=-40.31 +/- 148.23
Episode length: 466.05 +/- 251.51
New best mean reward!
Eval num_timesteps=11250, episode_reward=-71.97 +/- 91.09
Episode length: 284.40 +/- 131.35
Eval num_timesteps=15000, episode_reward=-189.40 +/- 70.49
Episode length: 234.65 +/- 73.00
Eval num_timesteps=18750, episode_reward=-169.50 +/- 24.64
Episode length: 370.45 +/- 78.59
Eval num_timesteps=22500, episode_reward=-82.25 +/- 23.42
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=26250, episode_reward=-136.09 +/- 132.19
Episode length: 876.55 +/- 293.91
Eval num_timesteps=30000, episode_reward=-33.85 +/- 26.37
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=33750, episode_reward=-6.90 +/- 18.47
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=37500, episode_reward=-10.77 +/- 19.12
E

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.38     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


468batch [00:01, 369.73batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000866 |
|    entropy        | 0.866     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 53.8      |
|    loss           | 0.688     |
|    neglogp        | 0.689     |
|    prob_true_act  | 0.553     |
|    samples_so_far | 16032     |
---------------------------------


996batch [00:02, 371.03batch/s]
Epoch 0 of 2                   [A

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 1000     |
|    ent_loss       | -0.00082 |
|    entropy        | 0.82     |
|    epoch          | 1        |
|    l2_loss        | 0        |
|    l2_norm        | 65       |
|    loss           | 0.718    |
|    neglogp        | 0.719    |
|    prob_true_act  | 0.565    |
|    samples_so_far | 32032    |
--------------------------------


1488batch [00:04, 371.88batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 1500     |
|    ent_loss       | -0.00086 |
|    entropy        | 0.86     |
|    epoch          | 1        |
|    l2_loss        | 0        |
|    l2_norm        | 79.1     |
|    loss           | 0.843    |
|    neglogp        | 0.844    |
|    prob_true_act  | 0.507    |
|    samples_so_far | 48032    |
--------------------------------


1982batch [00:05, 374.56batch/s]
1998batch [00:05, 366.26batch/s][A


0 Timesteps: -214.58609955000003
Eval num_timesteps=3750, episode_reward=-261.35 +/- 73.35
Episode length: 930.05 +/- 210.34
New best mean reward!
Eval num_timesteps=7500, episode_reward=-148.70 +/- 37.61
Episode length: 407.55 +/- 121.13
New best mean reward!
Eval num_timesteps=11250, episode_reward=-109.46 +/- 33.87
Episode length: 617.50 +/- 297.98
New best mean reward!
Eval num_timesteps=15000, episode_reward=-155.29 +/- 52.06
Episode length: 517.35 +/- 307.75
Eval num_timesteps=18750, episode_reward=-198.68 +/- 56.67
Episode length: 669.05 +/- 252.08
Eval num_timesteps=22500, episode_reward=-57.56 +/- 22.69
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=26250, episode_reward=-58.34 +/- 19.98
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-67.94 +/- 29.30
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=33750, episode_reward=-66.51 +/- 20.68
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=37500, episode_reward=-44.14 +/- 

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.38     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


480batch [00:01, 362.34batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000929 |
|    entropy        | 0.929     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 54.7      |
|    loss           | 0.906     |
|    neglogp        | 0.907     |
|    prob_true_act  | 0.46      |
|    samples_so_far | 16032     |
---------------------------------


955batch [00:02, 361.80batch/s]
992batch [00:02, 357.38batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000779 |
|    entropy        | 0.779     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 66.2      |
|    loss           | 0.734     |
|    neglogp        | 0.735     |
|    prob_true_act  | 0.567     |
|    samples_so_far | 32032     |
---------------------------------


1473batch [00:04, 365.67batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000809 |
|    entropy        | 0.809     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 79.7      |
|    loss           | 0.913     |
|    neglogp        | 0.914     |
|    prob_true_act  | 0.507     |
|    samples_so_far | 48032     |
---------------------------------


1880batch [00:05, 361.61batch/s]
1910batch [00:05, 355.55batch/s][A


0 Timesteps: -283.29683975
Eval num_timesteps=3750, episode_reward=-90.20 +/- 24.58
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=7500, episode_reward=-18.50 +/- 32.81
Episode length: 322.35 +/- 182.76
New best mean reward!
Eval num_timesteps=11250, episode_reward=22.00 +/- 136.11
Episode length: 290.80 +/- 78.51
New best mean reward!
Eval num_timesteps=15000, episode_reward=-90.12 +/- 25.17
Episode length: 456.05 +/- 259.89
Eval num_timesteps=18750, episode_reward=-83.79 +/- 31.52
Episode length: 827.80 +/- 344.45
Eval num_timesteps=22500, episode_reward=-126.97 +/- 19.06
Episode length: 377.70 +/- 126.77
Eval num_timesteps=26250, episode_reward=-153.53 +/- 35.56
Episode length: 485.40 +/- 120.64
Eval num_timesteps=30000, episode_reward=-152.07 +/- 47.42
Episode length: 969.05 +/- 63.36
Eval num_timesteps=33750, episode_reward=-59.94 +/- 23.02
Episode length: 870.25 +/- 308.87
Eval num_timesteps=37500, episode_reward=-46.88 +/- 24.62
Episode length: 876.20 

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 40.5     |
|    loss           | 1.38     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


464batch [00:01, 362.07batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000896 |
|    entropy        | 0.896     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 53.5      |
|    loss           | 0.746     |
|    neglogp        | 0.747     |
|    prob_true_act  | 0.539     |
|    samples_so_far | 16032     |
---------------------------------


874batch [00:02, 368.91batch/s]
985batch [00:02, 365.92batch/s][A

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000901 |
|    entropy        | 0.901     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 67.3      |
|    loss           | 0.797     |
|    neglogp        | 0.797     |
|    prob_true_act  | 0.525     |
|    samples_so_far | 32032     |
---------------------------------


1468batch [00:04, 368.22batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000806 |
|    entropy        | 0.806     |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 82.9      |
|    loss           | 0.736     |
|    neglogp        | 0.736     |
|    prob_true_act  | 0.547     |
|    samples_so_far | 48032     |
---------------------------------


1766batch [00:04, 368.71batch/s]
1794batch [00:04, 362.83batch/s][A


0 Timesteps: -280.685424
Eval num_timesteps=3750, episode_reward=-201.17 +/- 32.41
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=7500, episode_reward=-123.88 +/- 78.31
Episode length: 386.30 +/- 150.24
New best mean reward!
Eval num_timesteps=11250, episode_reward=-193.07 +/- 72.92
Episode length: 324.95 +/- 150.75
Eval num_timesteps=15000, episode_reward=-187.12 +/- 35.38
Episode length: 339.55 +/- 112.22
Eval num_timesteps=18750, episode_reward=-156.22 +/- 63.14
Episode length: 566.20 +/- 265.76
Eval num_timesteps=22500, episode_reward=-53.20 +/- 30.54
Episode length: 990.55 +/- 41.19
New best mean reward!
Eval num_timesteps=26250, episode_reward=-46.58 +/- 19.76
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=30000, episode_reward=-52.96 +/- 17.08
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=33750, episode_reward=-137.89 +/- 50.14
Episode length: 549.65 +/- 251.62
Eval num_timesteps=37500, episode_reward=-100.38 +/- 47.76


In [None]:
# Aggregate the per-seed evaluation logs into summary statistics, save them
# next to the logs, and plot the averaged return curve.
from parameter_initialization_deep_rl.common.evaluate import (
    create_numpy_arr_from_logs,
    create_sample,
    mean_confidence_intervals
)
from parameter_initialization_deep_rl.common.helpers import plot_performance

# One evaluations.npz per seed (presumably written by the EvalCallback used
# during training -- confirm against the training cell).
log_dirs = [f"{log_dir}/{seed}/evaluations.npz" for seed in seeds]
avg_returns, n = create_numpy_arr_from_logs(log_dirs)

sample = create_sample(avg_returns)

# Save the performance score for the individual seeds
np.savez(f"{log_dir}/avg_perf_across_seeds", sample=sample)

performance_score = np.average(sample)

# Save the performance score, i.e., the averaged return across all seeds and
# evaluation periods, i.e., just one single number
np.savez(f"{log_dir}/perf_score", perf_score=performance_score)

AVG, H = mean_confidence_intervals(avg_returns, n)

# Prepend the mean return measured before any RL training so the curve starts
# at timestep 0. `reward_sum` is accumulated by an earlier cell.
# NOTE(review): AVG gains a leading entry here while H is saved as returned,
# so the two stored arrays presumably differ in length by one -- confirm that
# downstream consumers expect this.
reward_before_training = reward_sum / len(seeds)
AVG = np.append(reward_before_training, AVG)

# Save the average return and confidence intervals for all the individual
# evaluation trials averaged across all seeds
np.savez(f"{log_dir}/avg_perf_eval_trials", avg=AVG, h=H)

# Build the x-axis from the evaluation schedule actually recorded in the logs
# instead of a hard-coded range: the previous literal
# `np.arange(0, 62*16384, 16364)` had a mistyped step and did not match the
# evaluation interval of this run, so x and y could disagree in length.
# Evaluations are evenly spaced, so the first recorded timestep is the interval;
# index 0 corresponds to the pre-training point prepended above.
with np.load(log_dirs[0]) as first_eval_log:
    eval_interval = int(first_eval_log["timesteps"][0])
eval_timesteps = eval_interval * np.arange(len(AVG))

# NOTE(review): graph_label "Random" looks carried over from another
# experiment (log_dir points at a different run) -- confirm the intended label.
plot_performance(
    title="Performance",
    graph_label="Random",
    x=eval_timesteps,
    y=AVG,
    x_label="timesteps",
    y_label="Return"
)

In [None]:
# Print the averaged return curve: the pre-training return followed by the
# per-evaluation averages across seeds.
print(AVG)