In [6]:
# Imports
from stable_baselines3 import PPO
import stable_baselines3.ppo.policies as ppo
from stable_baselines3.common.env_util import make_vec_env
import gym
import numpy as np
import random 
from gym.envs.box2d.lunar_lander import heuristic as lunar_heuristic
from stable_baselines3.common.monitor import Monitor
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.vec_env import DummyVecEnv
from imitation.algorithms import bc
from stable_baselines3.common.evaluation import evaluate_policy
from parameter_initialization_deep_rl.common.helpers import create_folder
from stable_baselines3.common.callbacks import EvalCallback
from torch.nn.modules.activation import Sigmoid, Tanh, ReLU, LeakyReLU

In [7]:
def heuristic_naive(obs: np.ndarray):
    """
    Hand-written, partly random controller that maps one observation to one action.

    :param obs: (np.ndarray) batch of observations; only the first row is used
    :return: (np.ndarray) array of shape (1,) holding the chosen discrete action

    NOTE(review): indices 1, 3 and 4 are presumably the lander's height,
    vertical velocity and angle in LunarLander-v2, and the actions 0..3
    presumably noop / left / main / right engine - confirm against the
    environment documentation.
    """
    state = obs[0]

    if state[1] < 0.3:
        # Below the threshold on component 1: react to component 3 only.
        choice = 2 if state[3] < -0.05 else 0
    else:
        # Exactly one random draw per call on this path, even when the
        # second half of the condition ends up deciding the outcome.
        draw = random.random()
        if draw < 0.4 and state[3] < 0:
            choice = 2
        elif state[4] < -0.1:
            choice = 1
        elif state[4] > 0.1:
            choice = 3
        else:
            choice = 0

    return np.full(1, choice)

def heuristic_expert(o: np.ndarray):
    """
    Expert policy: query gym's built-in LunarLander heuristic for one observation.

    :param o: (np.ndarray) single observation in batch form, shape (1, 8);
        only ``o[0]`` is passed to the heuristic
    :return: (np.ndarray) array of shape (1,) holding the heuristic's action
    """
    # Build the helper environment once and keep it on the function object.
    # Calling gym.make() on every invocation created (and never closed) a new
    # environment for each step of every rollout.
    # NOTE(review): this assumes lunar_heuristic only reads static settings
    # from the env and does not mutate it; a fresh, never-stepped env was
    # passed before, so no episode state can have been relied upon - confirm.
    env = getattr(heuristic_expert, "_env", None)
    if env is None:
        env = gym.make("LunarLander-v2")
        heuristic_expert._env = env

    a = np.full((1), lunar_heuristic(env, o[0]))
    return a

In [8]:
# Seeds for the independent training runs (one run per list entry).
# NOTE(review): 38 is listed twice. Logs are written to "<log_dir>/<seed>",
# so both runs target the same folder and that seed is read twice when the
# results are aggregated - confirm whether this is intended.
seeds = [
    95, 39, 1, 38, 83,
    38, 36, 50, 14, 100,
    67, 34, 68, 47, 70,
    57, 64, 98, 93, 25,
]

In [9]:
# Root folder for this experiment's logs and aggregated results;
# per-seed evaluation logs are stored under "<log_dir>/<seed>".
log_dir = "./logs/ppo/bc_expert"

In [10]:
def linear_schedule(initial_value):
    """
    Build a schedule that scales ``initial_value`` linearly with the
    remaining training progress.

    :param initial_value: (float or str) value returned at progress 1;
        a string is converted with ``float``
    :return: (function) maps progress to ``progress * initial_value``
    """
    start = float(initial_value) if isinstance(initial_value, str) else initial_value

    def _scaled(progress):
        """
        :param progress: (float) remaining progress, going from 1 (start) to 0 (end)
        :return: (float) the scheduled value
        """
        return progress * start

    return _scaled

In [11]:
# Hyperparameter
# NOTE(review): no cell in this notebook reads this dict; the PPO constructor
# in the training loop passes its own literals (e.g. gamma=0.995, n_steps=5,
# ent_coef=0.00001), which differ from the values below. Confirm which set is
# the intended configuration.
hyperparameter = {
    "n_steps": 16,
    "gae_lambda": 0.98,
    "gamma": 0.99,
    "n_epochs": 4,
    "ent_coef": 0.0,
}

In [None]:
# Sum of the mean returns measured right after behavior cloning, before any
# PPO update; divided by the number of seeds in the evaluation cell.
reward_sum = 0

for seed in seeds:
    # create environments and set seed
    env_train = gym.make("LunarLander-v2")
    env_train.seed(seed)
    env_test = Monitor(gym.make("LunarLander-v2"))
    env_test.seed(seed)
    # Keep handles on the vectorised envs so they can be closed when this
    # seed is finished; previously every iteration left its envs open.
    demo_venv = DummyVecEnv([lambda: RolloutInfoWrapper(env_train)])
    student_venv = None

    try:
        # Collect rollouts using the expert
        rollouts = rollout.rollout(
            heuristic_expert,
            demo_venv,
            rollout.make_sample_until(min_timesteps=None, min_episodes=200),
        )
        # Flatten the trajectories to obtain individual transitions
        transitions = rollout.flatten_trajectories(rollouts)
        # Set up student
        student_venv = make_vec_env(env_id="LunarLander-v2", n_envs=8, seed=seed)
        student = PPO(
            policy=ppo.MlpPolicy,
            env=student_venv,
            gamma = 0.995,
            n_steps = 5,
            learning_rate = linear_schedule(0.00083),
            ent_coef = 0.00001,
            seed=seed,
            verbose=False,
        )
        # Set up behavior cloning agent; it trains student.policy in place,
        # so PPO continues from the cloned weights.
        bc_trainer = bc.BC(
            observation_space=env_train.observation_space,
            action_space=env_train.action_space,
            demonstrations=transitions,
            policy=student.policy,
        )
        # Pretrain the policy with behavior cloning
        bc_trainer.train(n_epochs=1)
        # Test performance before training
        reward_before_training, _ = evaluate_policy(student.policy, env_test, 20)
        reward_sum += reward_before_training
        print(f"0 Timesteps: {reward_before_training}")
        # Create log folder
        # NOTE(review): the folder is keyed by seed only, so a seed that occurs
        # more than once in `seeds` reuses the same folder - confirm intended.
        trial_log_dir = create_folder(f"{log_dir}/{seed}")
        # Create eval callback
        eval_callback = EvalCallback(
            env_test,
            log_path=trial_log_dir,
            n_eval_episodes=20,
            eval_freq=3125,
            deterministic=True,
            render=False
        )
        # Train the pretrained policy using the regular learning algorithm.
        # total_timesteps is passed as an int (was the float 1e6).
        student.learn(
            total_timesteps=1_000_000,
            callback=eval_callback
        )
    finally:
        # Release the environments of this seed, also when a step above fails.
        demo_venv.close()  # closes the wrapped env_train as well
        env_test.close()
        if student_venv is not None:
            student_venv.close()

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5 and n_envs=8)
0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 145      |
|    loss           | 1.38     |
|    neglogp        | 1.39     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


475batch [00:01, 333.30batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000659 |
|    entropy        | 0.659     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 184       |
|    loss           | 0.68      |
|    neglogp        | 0.681     |
|    prob_true_act  | 0.65      |
|    samples_so_far | 16032     |
---------------------------------


979batch [00:03, 234.33batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000434 |
|    entropy        | 0.434     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 216       |
|    loss           | 0.381     |
|    neglogp        | 0.382     |
|    prob_true_act  | 0.777     |
|    samples_so_far | 32032     |
---------------------------------


1479batch [00:05, 229.05batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000496 |
|    entropy        | 0.496     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 240       |
|    loss           | 0.711     |
|    neglogp        | 0.711     |
|    prob_true_act  | 0.638     |
|    samples_so_far | 48032     |
---------------------------------


1548batch [00:05, 227.84batch/s]
1557batch [00:05, 262.38batch/s][A


0 Timesteps: 203.70084575
Eval num_timesteps=25000, episode_reward=-98.93 +/- 67.77
Episode length: 411.05 +/- 77.91
New best mean reward!
Eval num_timesteps=50000, episode_reward=-38.42 +/- 113.07
Episode length: 788.30 +/- 307.76
New best mean reward!
Eval num_timesteps=75000, episode_reward=26.15 +/- 138.79
Episode length: 444.45 +/- 358.02
New best mean reward!
Eval num_timesteps=100000, episode_reward=71.57 +/- 116.56
Episode length: 271.40 +/- 239.14
New best mean reward!
Eval num_timesteps=125000, episode_reward=-13.89 +/- 106.45
Episode length: 746.55 +/- 389.03
Eval num_timesteps=150000, episode_reward=126.59 +/- 103.07
Episode length: 273.80 +/- 171.36
New best mean reward!
Eval num_timesteps=175000, episode_reward=137.94 +/- 119.79
Episode length: 328.30 +/- 205.03
New best mean reward!
Eval num_timesteps=200000, episode_reward=39.05 +/- 110.31
Episode length: 697.10 +/- 328.05
Eval num_timesteps=225000, episode_reward=78.00 +/- 158.15
Episode length: 474.40 +/- 299.09
Eval 

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00139 |
|    entropy        | 1.39     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 145      |
|    loss           | 1.38     |
|    neglogp        | 1.38     |
|    prob_true_act  | 0.25     |
|    samples_so_far | 32       |
--------------------------------


486batch [00:01, 296.32batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 500      |
|    ent_loss       | -0.0005  |
|    entropy        | 0.5      |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 177      |
|    loss           | 0.606    |
|    neglogp        | 0.606    |
|    prob_true_act  | 0.695    |
|    samples_so_far | 16032    |
--------------------------------


979batch [00:03, 325.55batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000621 |
|    entropy        | 0.621     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 203       |
|    loss           | 0.514     |
|    neglogp        | 0.515     |
|    prob_true_act  | 0.673     |
|    samples_so_far | 32032     |
---------------------------------


1471batch [00:04, 324.81batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1500      |
|    ent_loss       | -0.000353 |
|    entropy        | 0.353     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 224       |
|    loss           | 0.169     |
|    neglogp        | 0.169     |
|    prob_true_act  | 0.863     |
|    samples_so_far | 48032     |
---------------------------------


1570batch [00:05, 323.78batch/s]
1581batch [00:05, 301.88batch/s][A


0 Timesteps: 162.9970681
Eval num_timesteps=25000, episode_reward=44.05 +/- 177.52
Episode length: 478.35 +/- 298.38
New best mean reward!
Eval num_timesteps=50000, episode_reward=-46.17 +/- 90.04
Episode length: 279.30 +/- 111.42
Eval num_timesteps=75000, episode_reward=45.82 +/- 180.06
Episode length: 490.95 +/- 316.20
New best mean reward!
Eval num_timesteps=100000, episode_reward=92.64 +/- 115.04
Episode length: 174.65 +/- 76.63
New best mean reward!
Eval num_timesteps=125000, episode_reward=57.09 +/- 103.24
Episode length: 345.45 +/- 335.89
Eval num_timesteps=150000, episode_reward=41.86 +/- 111.84
Episode length: 186.95 +/- 187.72
Eval num_timesteps=175000, episode_reward=117.59 +/- 89.23
Episode length: 505.10 +/- 372.13
New best mean reward!
Eval num_timesteps=200000, episode_reward=78.61 +/- 126.10
Episode length: 384.10 +/- 380.07
Eval num_timesteps=225000, episode_reward=85.82 +/- 156.37
Episode length: 303.10 +/- 249.57
Eval num_timesteps=250000, episode_reward=132.55 +/- 1

In [None]:
from parameter_initialization_deep_rl.common.evaluate import (
    create_numpy_arr_from_logs,
    create_sample,
    mean_confidence_intervals
)
from parameter_initialization_deep_rl.common.helpers import plot_performance

# Gather the per-seed evaluation logs written by the EvalCallback.
log_dirs = [f"{log_dir}/{seed}/evaluations.npz" for seed in seeds]
avg_returns, n = create_numpy_arr_from_logs(log_dirs)

sample = create_sample(avg_returns)

# Save the performance score for the individual seeds
np.savez(f"{log_dir}/avg_perf_across_seeds",
            sample=sample)

performance_score = np.average(sample)

# Save the performance score, i.e., the averaged return across all seeds and evaluation periods, i.e., just one single number
np.savez(f"{log_dir}/perf_score",
            perf_score=performance_score)

AVG, H = mean_confidence_intervals(avg_returns, n)

# Prepend the mean return measured before PPO training (timestep 0).
# Note that H is not extended, so AVG is one entry longer than H.
reward_before_training = reward_sum / len(seeds)
AVG = np.append(reward_before_training, AVG)

# Save the average return and confidence intervals for all the individual evaluation trials averaged across all seeds
np.savez(f"{log_dir}/avg_perf_eval_trials", avg=AVG,
            h=H)

# Timesteps between two evaluations: the EvalCallback uses eval_freq=3125 and
# training runs 8 parallel environments, which gives 25000 timesteps (the
# training output reports the first evaluation at num_timesteps=25000).
eval_interval = 3125 * 8
# One x value per entry of AVG, starting at timestep 0. The previous
# np.arange(0, 62*16384, 16364) mixed two different step sizes and did not
# correspond to this notebook's evaluation schedule.
x = np.arange(len(AVG)) * eval_interval

# NOTE(review): graph_label is "Random" although the logs are stored under
# "bc_expert" - possibly carried over from another notebook; confirm.
plot_performance(
    title="Performance",
    graph_label="Random",
    x=x,
    y=AVG,
    x_label="timesteps",
    y_label="Return"
)

In [None]:
# Show the averaged returns: the pre-training value first, followed by the
# values returned by mean_confidence_intervals.
print(AVG)