In [1]:
import gym
from stable_baselines3 import SAC, DQN, PPO, TD3, A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise
from parameter_initialization_deep_rl.models.sac.policies import SACPolicy
from parameter_initialization_deep_rl.models.dqn.policies import DQNPolicy
from parameter_initialization_deep_rl.models.td3.policies import TD3Policy
from parameter_initialization_deep_rl.common.policies import ActorCriticPolicy
from parameter_initialization_deep_rl.common.evaluate import run_multiple_trials
from parameter_initialization_deep_rl.common.helpers import configure_policy, linear_schedule
from torch.nn.modules.activation import Sigmoid, Tanh, ReLU, LeakyReLU
import numpy as np

In [2]:
# Locate the configuration at which a previous experiment run stopped.
def get_last_index(policy_kwargs, config):
    """Return the index of the first entry of ``policy_kwargs`` equal to ``config``.

    Args:
        policy_kwargs: Sequence of configuration dictionaries to search.
        config: The configuration to look for; compared with ``==``.

    Returns:
        The zero-based position of the first matching entry.

    Raises:
        ValueError: If no entry of ``policy_kwargs`` equals ``config``.
            Previously the function silently returned ``None`` in that case,
            which made the caller's ``+ 1`` fail with an unrelated TypeError.
    """
    for index, candidate in enumerate(policy_kwargs):
        if candidate == config:
            return index
    raise ValueError(
        f"Configuration {config!r} was not found among the "
        f"{len(policy_kwargs)} available policy configurations."
    )

In [3]:
# Build the list of configuration dictionaries (weight_init, activation_fn, bias_init)
# that the experiment iterates over, then report how many there are.
policy_kwargs = configure_policy(include_bias=False)
n_configs = len(policy_kwargs)
print(n_configs)

32


In [4]:
# Resume marker: the configuration at which the previous session was left off
# (crash, manual stop, etc.). Training continues with the entry that follows
# this one in policy_kwargs.
last_config = {
    "weight_init": "kaiming_normal",
    "bias_init": "random_uniform",
    "activation_fn": ReLU,
}

In [5]:
# Hyperparameters passed to the algorithm for this particular environment.
hyperparameter = {
    "learning_rate": 7.3e-4,
    "buffer_size": 300_000,
    "batch_size": 256,
    "ent_coef": "auto",
    "gamma": 0.98,
    "tau": 0.02,
    "train_freq": 64,
    "gradient_steps": 64,
    "learning_starts": 10_000,
    "use_sde": True,
}

In [None]:
# Find where the previous session stopped; training resumes with the next entry.
last_index = get_last_index(policy_kwargs, last_config)
if last_index is None:
    # Guard against a failed lookup: `None + 1` would otherwise raise an
    # unrelated TypeError that hides the real problem.
    raise ValueError(
        "last_config does not match any entry of policy_kwargs; "
        "cannot determine where to resume."
    )

# Train every remaining configuration using the specified algorithm in the
# particular environment.
for base_config in policy_kwargs[last_index + 1:]:
    # Build a fresh dict rather than adding keys to the entry stored in
    # policy_kwargs. Mutating the stored entries would make them differ from
    # last_config on a later re-run of this cell, breaking the resume lookup.
    config = {
        **base_config,
        # Add network architecture to config
        "net_arch": [400, 300],
        # Set the initial log std (specific to SAC)
        "log_std_init": -3,
    }
    # Run the experiment for multiple runs using our own standard policies
    log_data = run_multiple_trials(
        env_name="BipedalWalker-v3",
        n_envs=1,
        algorithm=SAC,
        policy=SACPolicy,
        hyperparameter=hyperparameter,
        policy_kwargs=config,
        num_trials=5,
        train_timesteps=2e5,
        n_trial_rounds=10,
        verbose=False,
    )
    print(log_data)