---
## Initialization
---

In [8]:
import gymnasium as gym

import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder


---
## PPO Implementation
---

In [9]:
# Seed shared by the environments and the PPO model so that runs start from the
# same random state. NOTE: exact bit-for-bit repeatability is not guaranteed on GPU.
SEED = 0

# Build 16 copies of LunarLander-v2 behind a single vectorized environment.
env = make_vec_env("LunarLander-v2", n_envs=16, seed=SEED)

# PPO with an MLP policy. Every rollout gathers n_steps transitions from each of
# the 16 environments (1024 * 16 = 16384 timesteps per iteration) before updating.
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
    seed=SEED,
)

Using cuda device


In [10]:
# Run one million timesteps of PPO training, then persist the trained
# agent to disk under the name 'LunarLanderPPO'.
model.learn(total_timesteps=1_000_000).save('LunarLanderPPO')

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.5     |
|    ep_rew_mean     | -174     |
| time/              |          |
|    fps             | 3921     |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 16384    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 88.8         |
|    ep_rew_mean          | -139         |
| time/                   |              |
|    fps                  | 2323         |
|    iterations           | 2            |
|    time_elapsed         | 14           |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0076965694 |
|    clip_fraction        | 0.0633       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | -1.79e-06    |
|    learning_r

---
## Evaluation
---

In [11]:
# Evaluate on a fresh, single LunarLander-v2 environment (separate from the
# training environments), wrapped in Monitor before it is handed to evaluate_policy.
evalEnv = Monitor(gym.make("LunarLander-v2"))

try:
    # Ten deterministic episodes; returns the mean and standard deviation of episode reward.
    meanReward, stdReward = evaluate_policy(model, evalEnv, n_eval_episodes=10, deterministic=True)
finally:
    # Release the evaluation environment even if evaluation raises.
    evalEnv.close()

# Report both statistics with the same two-decimal precision.
print(f"mean reward= {meanReward:.2f} +/- {stdReward:.2f}")

mean reward= 250.32 +/- 18.524214364704232


---
## Recording
---

In [12]:
# Number of environment steps to record. Used both as the recorder's video_length
# and as the rollout loop bound so the two values cannot drift apart.
VIDEO_LENGTH = 1000

# Load the trained model back from the file written by model.save.
model = PPO.load('LunarLanderPPO')

# Single-environment vectorized wrapper; rgb_array rendering supplies the frames.
env = DummyVecEnv([lambda: gym.make("LunarLander-v2", render_mode="rgb_array")])

# Trigger recording at step 0 and write the clip into the videos/ folder.
env = VecVideoRecorder(
    env,
    'videos/',
    record_video_trigger=lambda step: step == 0,
    video_length=VIDEO_LENGTH,
    name_prefix="lunarlanderPPO",
)

try:
    # Resetting the environment starts the recording.
    obs = env.reset()

    for _step in range(VIDEO_LENGTH):
        # Act greedily with the trained policy and advance the environment.
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
finally:
    # Always close the recorder so the video file is finalised and the
    # environment is released, even if the rollout fails part-way.
    env.close()

Saving video to /content/videos/lunarlanderPPO-step-0-to-step-1000.mp4
Moviepy - Building video /content/videos/lunarlanderPPO-step-0-to-step-1000.mp4.
Moviepy - Writing video /content/videos/lunarlanderPPO-step-0-to-step-1000.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /content/videos/lunarlanderPPO-step-0-to-step-1000.mp4


