In [2]:
# Import libraires
import gymnasium as gym
from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import (
    notebook_login,
)

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [3]:
# Create environment
env = gym.make('LunarLander-v2')

# Create agent
model = PPO(policy='MlpPolicy', 
            env=env, 
            n_steps=1024,
            n_epochs=4,
            gamma=.999,
            gae_lambda=0.98,
            batch_size=32,
            ent_coef=0.01,
            verbose=1,
            learning_rate=0.0001
        )

# train agent
model.learn(total_timesteps=10000000)

# Save trained model
model_name = "ppo-LunarLander-v2"
model.save(model_name)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 88.7     |
|    ep_rew_mean     | -179     |
| time/              |          |
|    fps             | 2566     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1024     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 95.1          |
|    ep_rew_mean          | -184          |
| time/                   |               |
|    fps                  | 1204          |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 2048          |
| train/                  |               |
|    approx_kl            | 0.00035014958 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2       

KeyboardInterrupt: 

In [26]:
# Save trained model
model_name = "ppo-LunarLander-v2"
model.save(model_name)

In [4]:
# Load trained model
model = PPO.load("ppo-LunarLander-v2")

In [5]:
# Evaluate and see results
eval_env = Monitor(gym.make('LunarLander-v2', render_mode="human"))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5, deterministic=True)
print(f"mean_reward={mean_reward:.2f}+/-{std_reward}")
eval_env.close()

mean_reward=236.31+/-93.6811385555405


In [None]:
package_to_hub(model = model,
               model_name="Belwen/ppo-LunarLander-v2",
               model_architecture="PPO",
               env_id="LunarLander-v2",
               eval_env=DummyVecEnv([lambda: Monitor(gym.make("LunarLander-v2", render_mode="rgb_array"))]),
               repo_id="Belwen/ppo-LunarLander-v2",
               commit_message="Upload trained ppo LunarLander")