In [8]:
import os
import gymnasium as gym

from stable_baselines3 import PPO, A2C, HER, SAC, TD3
from stable_baselines3.common.env_util import make_vec_env

# Budget shared by the training cells below. NOTE: despite its name,
# `num_episodes` is handed to `.learn()` as a number of environment timesteps.
num_episodes = 200_000
# Per-episode step cap forwarded to gym.make(max_episode_steps=...).
episode_steps = 100

# Saved models are written under "<cwd>/../models".
path = os.getcwd()
savepath = os.path.join(path, os.pardir)

In [None]:
# PPO
# One Pusher-v4 environment, truncated after `episode_steps` steps per episode,
# wrapped as a single-env VecEnv for Stable-Baselines3.
env = gym.make("Pusher-v4", render_mode='rgb_array', max_episode_steps=episode_steps)
vec_env = make_vec_env(lambda: env, n_envs=1)

# PPO hyperparameters. Every key is the name of the PPO keyword argument it is
# passed to; the RecurrentPPO cell further down reads this dict as well.
params = {
    'n_steps': 483,
    'gamma': 0.9733319984142623,
    'learning_rate': 6.249837257325398e-05,
    'ent_coef': 6.2177665117053e-05,
    'clip_range': 0.20674514787972498,
    'n_epochs': 18,
    'gae_lambda': 0.9180827852846369,
    'max_grad_norm': 0.7932345709898225,
    'vf_coef': 0.5888144289366819,
}

model_ppo = PPO(
    "MlpPolicy",
    vec_env,
    verbose=1,
    # NOTE(review): n_steps (483) is not a multiple of batch_size (32), so the
    # rollout cannot be split into equal mini-batches -- confirm this is intended.
    batch_size=32,
    tensorboard_log="./ppo_tensorboard_log/",
    **params,
)

# Train for the shared budget instead of a hard-coded literal so the run length
# always agrees with the value embedded in the saved file name (and with the
# other training cells, which all pass `num_episodes`).
model_ppo.learn(total_timesteps=num_episodes, log_interval=10, progress_bar=True)
# `num_episodes/1000` is true division, so the name contains e.g. "200.0K";
# left unchanged so previously saved files keep the same name.
model_ppo.save(f"{savepath}/models/ppo_{num_episodes/1000}K_{episode_steps}ep")

### Models to test:

- A2C
- HER
- SAC
- TD3

If time permits:
- TRPO
- RecurrentPPO
- ARS
- TQC

In [None]:
# A2C
# Fresh single-env Pusher-v4 VecEnv (same construction as the other cells).
env = gym.make(
    "Pusher-v4",
    render_mode='rgb_array',
    max_episode_steps=episode_steps,
)
vec_env = make_vec_env(lambda: env, n_envs=1)

# No hyperparameters are overridden; only the TensorBoard directory is set
# (the same directory the PPO run logs to).
model_a2c = A2C(
    policy="MlpPolicy",
    env=vec_env,
    tensorboard_log="./ppo_tensorboard_log/",
)
model_a2c.learn(
    total_timesteps=num_episodes,
    log_interval=10,
    progress_bar=True,
)
model_a2c.save(
    f"{savepath}/models/a2c_{num_episodes/1000}K_{episode_steps}ep"
)


In [None]:
# HER
# Builds a single-env Pusher-v4 VecEnv and tries to train a HER agent on it.
#
# NOTE(review): this call uses the algorithm-style `HER(...)` constructor
# (`model_class=`, `online_sampling=`). Newer Stable-Baselines3 releases appear
# to expose HER only as a replay buffer (`HerReplayBuffer`, passed through
# `replay_buffer_class=` of an off-policy algorithm) and require a
# goal-conditioned, Dict-observation environment -- confirm that this cell
# still runs against the installed SB3 version and that Pusher-v4 is a
# suitable environment before relying on it.
env = gym.make("Pusher-v4", render_mode='rgb_array', max_episode_steps=episode_steps)
vec_env = make_vec_env(lambda:env, n_envs=1)

model_her = HER("MlpPolicy", vec_env, 
                # Underlying off-policy learner, given here as a string.
                model_class='tqc', 
                # Goal-relabelling settings.
                n_sampled_goal=4, 
                goal_selection_strategy="future", 
                # Replay buffer / optimisation settings.
                buffer_size=1000000, 
                batch_size=2048, 
                gamma=0.95, 
                learning_rate=1e-3, 
                tau=0.05, 
                policy_kwargs=dict(n_critics=2, net_arch=[512, 512, 512]), 
                online_sampling=True, 
                verbose=1, 
                # Logs into the same TensorBoard directory as the PPO run.
                tensorboard_log="./ppo_tensorboard_log/")

# `num_episodes` is passed as the first positional argument of learn().
model_her.learn(num_episodes, log_interval=10, progress_bar=True)
# True division puts a float (e.g. "200.0K") into the file name.
model_her.save(f"{savepath}/models/her_{num_episodes/1000}K_{episode_steps}ep")

In [None]:
# SAC
# Fresh single-env Pusher-v4 VecEnv (same construction as the other cells).
env = gym.make(
    "Pusher-v4",
    render_mode='rgb_array',
    max_episode_steps=episode_steps,
)
vec_env = make_vec_env(lambda: env, n_envs=1)

# No hyperparameters are overridden; only the TensorBoard directory is set
# (the same directory the PPO run logs to).
model_sac = SAC(
    policy="MlpPolicy",
    env=vec_env,
    tensorboard_log="./ppo_tensorboard_log/",
)
model_sac.learn(
    total_timesteps=num_episodes,
    log_interval=10,
    progress_bar=True,
)
model_sac.save(
    f"{savepath}/models/sac_{num_episodes/1000}K_{episode_steps}ep"
)

In [None]:
# TD3
# Fresh single-env Pusher-v4 VecEnv (same construction as the other cells).
env = gym.make(
    "Pusher-v4",
    render_mode='rgb_array',
    max_episode_steps=episode_steps,
)
vec_env = make_vec_env(lambda: env, n_envs=1)

# No hyperparameters are overridden; only the TensorBoard directory is set
# (the same directory the PPO run logs to).
model_td3 = TD3(
    policy="MlpPolicy",
    env=vec_env,
    tensorboard_log="./ppo_tensorboard_log/",
)
model_td3.learn(
    total_timesteps=num_episodes,
    log_interval=10,
    progress_bar=True,
)
model_td3.save(
    f"{savepath}/models/td3_{num_episodes/1000}K_{episode_steps}ep"
)


### Stable-baselines3 contrib library:

In [11]:
from sb3_contrib import TRPO, RecurrentPPO, ARS, TQC

In [None]:
# TRPO
# Fresh single-env Pusher-v4 VecEnv (same construction as the other cells).
env = gym.make(
    "Pusher-v4",
    render_mode='rgb_array',
    max_episode_steps=episode_steps,
)
vec_env = make_vec_env(lambda: env, n_envs=1)

# No hyperparameters are overridden; only the TensorBoard directory is set
# (the same directory the PPO run logs to).
model_trpo = TRPO(
    policy="MlpPolicy",
    env=vec_env,
    tensorboard_log="./ppo_tensorboard_log/",
)
model_trpo.learn(
    total_timesteps=num_episodes,
    log_interval=10,
    progress_bar=True,
)
model_trpo.save(
    f"{savepath}/models/trpo_{num_episodes/1000}K_{episode_steps}ep"
)

In [None]:
# RecurrentPPO
# Fresh single-env Pusher-v4 VecEnv (same construction as the other cells).
env = gym.make(
    "Pusher-v4",
    render_mode='rgb_array',
    max_episode_steps=episode_steps,
)
vec_env = make_vec_env(lambda: env, n_envs=1)

# LSTM-policy PPO configured from the `params` dict defined in the PPO cell
# (that cell must have been run first), plus the same fixed batch size,
# verbosity and TensorBoard directory used there.
model_rppo = RecurrentPPO(
    policy="MlpLstmPolicy",
    env=vec_env,
    n_steps=params['n_steps'],
    batch_size=32,
    n_epochs=params['n_epochs'],
    learning_rate=params['learning_rate'],
    gamma=params['gamma'],
    gae_lambda=params['gae_lambda'],
    clip_range=params['clip_range'],
    ent_coef=params['ent_coef'],
    vf_coef=params['vf_coef'],
    max_grad_norm=params['max_grad_norm'],
    verbose=1,
    tensorboard_log="./ppo_tensorboard_log/",
)
model_rppo.learn(
    total_timesteps=num_episodes,
    log_interval=10,
    progress_bar=True,
)
model_rppo.save(
    f"{savepath}/models/rppo_{num_episodes/1000}K_{episode_steps}ep"
)

In [None]:
# ARS
# Fresh single-env Pusher-v4 VecEnv (same construction as the other cells).
env = gym.make(
    "Pusher-v4",
    render_mode='rgb_array',
    max_episode_steps=episode_steps,
)
vec_env = make_vec_env(lambda: env, n_envs=1)

# No hyperparameters are overridden; only the TensorBoard directory is set
# (the same directory the PPO run logs to).
model_ars = ARS(
    policy="MlpPolicy",
    env=vec_env,
    tensorboard_log="./ppo_tensorboard_log/",
)
model_ars.learn(
    total_timesteps=num_episodes,
    log_interval=10,
    progress_bar=True,
)
model_ars.save(
    f"{savepath}/models/ars_{num_episodes/1000}K_{episode_steps}ep"
)

In [None]:
# TQC
# Fresh single-env Pusher-v4 VecEnv (same construction as the other cells).
env = gym.make(
    "Pusher-v4",
    render_mode='rgb_array',
    max_episode_steps=episode_steps,
)
vec_env = make_vec_env(lambda: env, n_envs=1)

# No hyperparameters are overridden; only the TensorBoard directory is set
# (the same directory the PPO run logs to).
model_tqc = TQC(
    policy="MlpPolicy",
    env=vec_env,
    tensorboard_log="./ppo_tensorboard_log/",
)
model_tqc.learn(
    total_timesteps=num_episodes,
    log_interval=10,
    progress_bar=True,
)
model_tqc.save(
    f"{savepath}/models/tqc_{num_episodes/1000}K_{episode_steps}ep"
)

## Test model:

In [None]:
# Roll out and render one episode per seed with a trained policy.
seeds = [3559, 3216, 7890, 5242, 4924, 3588, 722, 8119]

# Select the policy to evaluate explicitly. No bare `model` is defined by any
# cell above (only model_ppo, model_a2c, ...), so on a fresh kernel this loop
# would otherwise fail with a NameError. Swap in any other trained `model_*`.
model = model_ppo

for seed in seeds:
    # Seed the (single-env) VecEnv, then reset to start a new episode.
    vec_env.seed(seed)
    state = vec_env.reset()

    done = False
    while not done:
        # Deterministic actions so a given seed reproduces the same rollout.
        action, _states = model.predict(state, deterministic=True)
        state, rewards, dones, info = vec_env.step(action)
        vec_env.render("human")

        # `dones` is an array with one flag per environment; reduce it
        # explicitly instead of relying on the truth value of an array.
        done = bool(dones.any())