In [1]:
# swig is installed first because box2d-py may need to be built from source.
!apt-get install -y swig
# Pin gymnasium below 1.0: the training cell creates "LunarLander-v2", which
# gymnasium 1.0 retired in favour of "LunarLander-v3". The requirement is
# quoted so the shell treats neither "[box2d]" as a glob nor "<" as a redirect.
!pip install box2d-py "gymnasium[box2d]<1.0"




The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 38 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (843 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 128663 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubuntu1_all.deb ...
Unpacking swig (4.0.2-1ubuntu1) ...
Setting up swig4.0 (4

In [2]:
# Python dependencies for the training cell below: stable-baselines3 (PPO,
# vectorised envs, callbacks) and gymnasium are imported there directly.
!pip install stable-baselines3
!pip install gymnasium
# NOTE(review): imageio and pygame are not imported by any visible cell —
# presumably they back rendering / video recording; confirm they are needed.
!pip install imageio
!pip install pygame

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13->stable-baselines3)
  Downloading nvidia_cufft

In [3]:
import os
import argparse
from pathlib import Path
import warnings
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback, StopTrainingOnRewardThreshold
from gymnasium.wrappers import RecordVideo  

# Give XDG-aware libraries a runtime directory.
# NOTE(review): presumably needed for headless rendering on the host — confirm.
os.environ.update({"XDG_RUNTIME_DIR": "/tmp"})
# Suppress every Python warning for the remainder of the session.
warnings.simplefilter("ignore")

def ensure_dir(path):
    """Make sure the directory ``path`` exists, creating missing parents.

    Args:
        path: Directory location as a string or path-like object.

    Returns:
        None. An already existing directory is left untouched.
    """
    directory = Path(path)
    if not directory.is_dir():
        # exist_ok keeps this safe if the directory appears between the
        # check above and the creation call.
        directory.mkdir(parents=True, exist_ok=True)

def default_output_dir(save_dir):
    """Return ``<save_dir>/lunarlander_ppo``, creating it when necessary.

    Args:
        save_dir: Root directory under which run artefacts are stored.

    Returns:
        A ``pathlib.Path`` pointing at the (now existing) output directory.
    """
    target = Path(save_dir).joinpath("lunarlander_ppo")
    ensure_dir(target)
    return target

def make_lunar_env(render_mode=None, seed=0):
    """Build a zero-argument factory for a seeded ``LunarLander-v2`` env.

    The environment is not created here; it is created each time the returned
    callable is invoked.

    Args:
        render_mode: Forwarded to ``gym.make`` as ``render_mode``.
        seed: Seed passed to the environment's first ``reset`` call.

    Returns:
        A callable taking no arguments that returns the created environment.
    """
    def _init():
        lander = gym.make("LunarLander-v2", render_mode=render_mode)
        # Reset once with the seed so the env starts from a seeded state.
        lander.reset(seed=seed)
        return lander

    return _init

def make_vec_envs(n_envs=1, seed=0):
    """Create ``n_envs`` LunarLander envs wrapped as a monitored vector env.

    Args:
        n_envs: Number of environment copies to create.
        seed: Base seed; the i-th copy is seeded with ``seed + i``.

    Returns:
        A ``VecMonitor`` wrapping a ``DummyVecEnv`` of the created factories.
    """
    factories = []
    for offset in range(n_envs):
        factories.append(make_lunar_env(seed=seed + offset))
    return VecMonitor(DummyVecEnv(factories))

def parse_args(argv=None):
    """Parse the training options.

    The notebook kernel's own ``sys.argv`` is never consulted. When ``argv``
    is omitted the historical notebook default ``["--render_video"]`` is used,
    so existing ``parse_args()`` calls behave exactly as before. Passing an
    explicit list makes the declared options actually overridable, e.g.
    ``parse_args(["--timesteps", "200000"])``.

    Args:
        argv: Optional list of command-line style tokens to parse.

    Returns:
        ``argparse.Namespace`` with ``timesteps``, ``save_dir``,
        ``reward_threshold`` and ``render_video``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--timesteps", type=int, default=10_000_000)
    parser.add_argument("--save_dir", type=str, default="outputs")
    parser.add_argument("--reward_threshold", type=float, default=250.0)
    parser.add_argument("--render_video", action="store_true")
    if argv is None:
        # Notebook default: record a video after training.
        argv = ["--render_video"]
    return parser.parse_args(args=argv)

def _build_eval_callback(eval_env, out_dir, reward_threshold):
    """Create the evaluation callback that saves the best model and stops training at the reward threshold."""
    stop_callback = StopTrainingOnRewardThreshold(
        reward_threshold=reward_threshold,
        verbose=1
    )
    return EvalCallback(
        eval_env,
        # The stop check reads the parent's best mean reward, which only
        # changes when a new best model is found, so this is the right event.
        callback_on_new_best=stop_callback,
        best_model_save_path=str(out_dir / "best_model"),
        log_path=str(out_dir / "eval_logs"),
        eval_freq=100_000,
        n_eval_episodes=5,
        deterministic=True,
        render=False
    )


def _record_episode(model_path, video_dir):
    """Load the policy at ``model_path`` and record one deterministic episode into ``video_dir``."""
    env = gym.make("LunarLander-v2", render_mode="rgb_array")
    env = RecordVideo(env, str(video_dir), episode_trigger=lambda x: True)
    try:
        policy = PPO.load(str(model_path))
        obs, _ = env.reset(seed=0)
        done = False
        while not done:
            action, _ = policy.predict(obs, deterministic=True)
            obs, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
    finally:
        # Always close the recorder, even if loading or the rollout fails.
        env.close()


def main():
    """Train PPO on LunarLander-v2, save the models and optionally record a video.

    Steps:
      1. Parse options and create ``<save_dir>/lunarlander_ppo`` plus ``logs``.
      2. Train PPO with periodic checkpoints and periodic evaluation; training
         stops early once the best evaluation reward reaches the threshold.
      3. Save the final model as ``final_model.zip``.
      4. If ``--render_video`` is set, record one episode of the best evaluated
         model, or of the final model when no best model was written.
    """
    args = parse_args()
    out_dir = default_output_dir(args.save_dir)
    log_dir = out_dir / "logs"
    ensure_dir(log_dir)

    train_env = make_vec_envs(1)
    eval_env = make_vec_envs(1, seed=42)

    try:
        model = PPO(
            "MlpPolicy",
            train_env,
            verbose=1,
            tensorboard_log=str(log_dir),
            device="cpu",
            # NOTE(review): 10000 is not a multiple of 512, so the last
            # mini-batch of every epoch is smaller than batch_size. Values are
            # kept as-is to preserve the training setup.
            n_steps=10000,
            batch_size=512
        )

        # Passed directly to learn() so its counter advances on every
        # environment step. Nested under EvalCallback's "new best" event it
        # would only be stepped once per new best model and would essentially
        # never reach save_freq.
        checkpoint_callback = CheckpointCallback(
            save_freq=100_000,
            save_path=str(out_dir / "checkpoints"),
            name_prefix="ppo_lunar"
        )

        eval_callback = _build_eval_callback(eval_env, out_dir, args.reward_threshold)

        model.learn(
            total_timesteps=args.timesteps,
            callback=[checkpoint_callback, eval_callback]
        )
        final_model_path = out_dir / "final_model.zip"
        model.save(str(final_model_path))
    finally:
        train_env.close()
        eval_env.close()

    if args.render_video:
        video_path = out_dir / "video"
        ensure_dir(video_path)

        best_model_path = out_dir / "best_model" / "best_model.zip"
        if not best_model_path.exists():
            # No evaluation ran (e.g. timesteps below eval_freq), so there is
            # no best model on disk; record the final model instead.
            best_model_path = final_model_path

        _record_episode(best_model_path, video_path)

# Run the full train / save / record pipeline when this cell (or script) is
# executed directly rather than imported as a module.
if __name__ == "__main__":
    main()

2025-08-30 11:19:20.295425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756552760.489321      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756552760.544254      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using cpu device
Logging to outputs/lunarlander_ppo/logs/PPO_1
-----------------------------------
| rollout/           |            |
|    ep_len_mean     | 89.2       |
|    ep_rew_mean     | -186.90257 |
| time/              |            |
|    fps             | 1087       |
|    iterations      | 1          |
|    time_elapsed    | 9          |
|    total_timesteps | 10000      |
-----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 98.3        |
|    ep_rew_mean          | -153.09262  |
| time/                   |             |
|    fps                  | 1037        |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 20000       |
| train/                  |             |
|    approx_kl            | 0.006031879 |
|    clip_fraction        | 0.0268      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.

                                                               

Moviepy - Done !
Moviepy - video ready /kaggle/working/outputs/lunarlander_ppo/video/rl-video-episode-0.mp4
