<a href="https://colab.research.google.com/github/Chickenlover32/RL_MyGitHub/blob/main/Reinforcement_Learning_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reinforcement Learning Coursework

### Training and testing a PPO agent in Lunar Lander

Swathi Suresh,
CID: 02208023

This notebook presents the implementation of a Proximal Policy Optimisation (PPO) agent in the LunarLander environment. It includes training, evaluation, and the generation of the required figures and visualisations.

*   PPO Implementation : https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/ppo/ppo.py
*   Optimised Hyperparameters: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/ppo.yml

*   Tutorial for implementation: https://github.com/Stable-Baselines-Team/rl-colab-notebooks?tab=readme-ov-file
















### Setup and Imports

In [None]:
# Clear all user-defined names from the interactive namespace so the notebook
# starts from a clean state; -f skips the confirmation prompt.
%reset -f

In [None]:
# Mount Google Drive at /content/drive so that data written under that path
# (logs, model checkpoints, videos) is saved to Drive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Make the project folder on Google Drive the working directory; relative
# paths used later in the notebook (e.g. "videos") resolve against it.
%cd /content/drive/MyDrive/RL

In [None]:
# Install required dependencies
!apt-get update
!apt-get install -y swig ffmpeg
!pip install gymnasium[box2d]
!pip install stable-baselines3
!pip install tensorboard

In [None]:
# Import libraries

import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import gymnasium as gym            # Enviornment API
from stable_baselines3 import PPO  # PPO algorithm

from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor

from datetime import datetime
import tensorboard

import base64
from pathlib import Path
from IPython import display as ipythondisplay
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines3.common.callbacks import ProgressBarCallback

### Train Model

In [None]:
# Each run logs into its own folder on Drive, selected by run_id
# (change run_id to keep the logs of a previous run untouched).
run_id = 1
log_dir = f"/content/drive/MyDrive/RL/LunarLanderLogs{run_id}"
os.makedirs(log_dir, exist_ok=True)

# Logger writing to the console, a CSV file and TensorBoard event files,
# all placed inside log_dir.
output_formats = ["stdout", "csv", "tensorboard"]
new_logger = configure(log_dir, output_formats)


In [None]:
# Create Lunar Lander Environment
# Training runs headless: render_mode=None disables rendering entirely.
# 'human' mode would try to open a window (no display is available on Colab)
# and would draw every one of the training steps, slowing training massively.
# Videos of the agent are recorded separately with render_mode="rgb_array".
env = gym.make('LunarLander-v3', render_mode=None)
env = Monitor(env, log_dir)                           # Save logs in log_dir/monitor.csv

# Instantiate agent
model = PPO(
    policy="MlpPolicy",          # policy and value function networks are MLPs
    env=env,                     # single (non-vectorised) training environment
    learning_rate=3e-4,
    n_steps=1024,                # rollout length collected before each update
    batch_size=64,               # minibatch size used during the update
    n_epochs=4,                  # passes over each rollout when optimising
    gamma=0.999,                 # discount factor
    gae_lambda=0.98,             # GAE bias-variance trade-off
    clip_range=0.2,              # PPO policy clipping parameter
    clip_range_vf=None,          # no clipping of the value function
    normalize_advantage=True,
    ent_coef=0.01,               # entropy bonus weight
    vf_coef=0.5,                 # value loss weight
    max_grad_norm=0.5,           # gradient clipping threshold
    use_sde=False,
    tensorboard_log=log_dir,
    verbose=2,
    device="auto"
)


In [None]:
# Train model

# Route training metrics to the logger configured above (stdout/csv/tensorboard)
model.set_logger(new_logger)
model.learn(total_timesteps=1_000_000, callback=ProgressBarCallback())

# Persist the trained agent next to its logs
model.save(os.path.join(log_dir, "PPO_lunarlanderv3"))

# Release the training environment (and the Monitor's log file handle) now that
# training is done; evaluation below builds its own environment and reloads the
# model from disk. Re-run the environment/model cell before training again.
env.close()


In [9]:
# Re-save the trained model into the current run's log directory.
# The path is derived from log_dir (instead of a duplicated absolute path) so
# it stays correct when the run directory changes.
model.save(os.path.join(log_dir, "PPO_lunarlanderv3"))

### Evaluate Model


In [None]:
# Prepare everything needed to evaluate the saved agent

# Folder that holds the trained model and its logs
log_dir = "/content/drive/MyDrive/RL/LunarLanderLogs1"    # CHANGE LUNAR LANDER LOGS IF NEEDED

# Evaluation environment: frames are rendered off-screen as RGB arrays and the
# environment is wrapped in Monitor (no log file is passed here)
env_Test = Monitor(gym.make("LunarLander-v3", render_mode="rgb_array"))

# Restore the PPO agent from disk and attach the evaluation environment
model = PPO.load(os.path.join(log_dir, "PPO_lunarlanderv3.zip"), env=env_Test) # CHANGE LUNAR LANDER LOGS IF NEEDED

In [None]:
# Load the training log (progress.csv, written by the csv logger output) of the
# run selected by log_dir, so this cell follows the same run as the model and
# monitor.csv loaded elsewhere in this section.
file_path = os.path.join(log_dir, "progress.csv")
data = pd.read_csv(file_path)

In [None]:
# Number of metrics (columns) recorded in progress.csv
print(data.shape[1])

In [None]:
# 1) Plot mean episode reward vs timesteps (learning curve)

# Columns of progress.csv that the learning curve is built from
required_columns = {"rollout/ep_rew_mean", "time/total_timesteps"}

if not required_columns.issubset(data.columns):
    print("Required columns not found in progress.csv")
else:
    # Large, high-resolution canvas
    fig, ax = plt.subplots(figsize=(10, 6), dpi=150)

    ax.plot(
        data["time/total_timesteps"],
        data["rollout/ep_rew_mean"],
        color="blue",
        linewidth=2
    )

    ax.set_xlabel("Total Timesteps", fontsize=12)
    ax.set_ylabel("Average Mean Episode Reward", fontsize=12)
    ax.set_title("PPO on LunarLander: Mean Return vs Timesteps", fontsize=14)

    ax.grid(True, alpha=0.7)
    fig.tight_layout(pad=3)

    plt.show()


In [None]:
# 2) Distribution of rewards per episode

# Per-episode statistics are read from monitor.csv in the run directory.
# skiprows=1 drops the first line of the file so that the following line
# (the one naming the columns) is used as the header.
monitor_path = os.path.join(log_dir, "monitor.csv")
monitor_data = pd.read_csv(monitor_path, skiprows=1)

# Column "r" holds the reward of each episode
episode_rewards = monitor_data["r"]

# Histogram of the episode rewards
fig, ax = plt.subplots(figsize=(8, 5), dpi=150)
ax.hist(
    episode_rewards,
    bins=30,
    color="steelblue",
    edgecolor="black",
    alpha=0.8
)

ax.set_xlabel("Episode Reward")
ax.set_ylabel("Frequency")
ax.set_title("PPO on LunarLander: Distribution of Episode Rewards")
ax.grid(True, alpha=0.3)
fig.tight_layout(pad=3)
plt.show()


In [28]:
# Display setup to record video
# Start an Xvfb virtual X server in the background (trailing "&") as display :1
# with a 1024x768, 24-bit screen. The return value of os.system is ignored, so
# a failed launch (e.g. Xvfb not installed) is not reported here.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point graphical code started from this kernel at that virtual display.
os.environ['DISPLAY'] = ':1'

In [29]:
# Define functions for video recording
def show_videos(video_path="", prefix=""):
    """
    Embed every matching MP4 file of a folder in the notebook output.

    Each video is base64-encoded and inlined into an HTML <video> tag, so the
    rendered output does not depend on the file remaining on disk.

    Taken from https://github.com/eleurent/highway-env

    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Only files whose name starts with this prefix are shown
    """
    # HTML snippet for one video: first slot is the file path (alt text),
    # second slot is the base64-encoded file content.
    video_tag = """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>"""
    pattern = "{}*.mp4".format(prefix)
    tags = [
        video_tag.format(clip, base64.b64encode(clip.read_bytes()).decode("ascii"))
        for clip in Path(video_path).glob(pattern)
    ]
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))


def record_video(model, env_id="LunarLander-v3", video_folder="videos/", prefix="ppo_eval", video_length=1000, video_name_prefix=None):
    """
    Record a video of an RL agent acting in a freshly created environment.

    Args:
        model: Agent exposing ``predict(obs, deterministic=True)``
        env_id (str): Gym environment ID
        video_folder (str): Directory to save videos (created if missing)
        prefix (str): Prefix for video file names
        video_length (int): Number of environment steps to record
        video_name_prefix (str | None): Alias for ``prefix``, accepted because
            callers in this notebook use this keyword; when given it takes
            precedence over ``prefix``
    """
    # Accept either keyword for the file-name prefix
    if video_name_prefix is not None:
        prefix = video_name_prefix

    os.makedirs(video_folder, exist_ok=True)

    # Create a single-environment vectorised env that renders to RGB arrays
    env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])

    # Wrap with video recorder; recording is triggered once, at step 0
    env = VecVideoRecorder(
        env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix
    )

    try:
        obs = env.reset()

        # Roll the policy forward greedily for the requested number of steps
        for _ in range(video_length):
            action, _state = model.predict(obs, deterministic=True)
            obs, _reward, _done, _info = env.step(action)
    finally:
        # Always close the recorder so the environment is released even if
        # prediction or stepping raises
        env.close()


In [None]:
# Record video for untrained agent

# Environment ID shared by the model's environment and the recording call
# (previously referenced without ever being defined in the notebook).
env_id = "LunarLander-v3"

# A freshly initialised PPO model serves as the untrained baseline
untrained_env = DummyVecEnv([lambda: gym.make(env_id)])
untrained_model = PPO(policy="MlpPolicy", env=untrained_env, verbose=0)

record_video(
    model=untrained_model,
    env_id=env_id,
    # Same folder as the trained-agent video, so the display cell finds both
    video_folder="/content/drive/MyDrive/RL/videos",
    # `prefix` is the keyword that record_video defines for the file-name prefix
    prefix="ppo_untrained_agent21e6",
    video_length=5000 # longer episode length
)


In [None]:
# Record video for trained agent
record_video(
    model=model,
    env_id="LunarLander-v3",
    video_folder="/content/drive/MyDrive/RL/videos",
    # `prefix` is the keyword that record_video defines for the file-name prefix
    prefix="ppo_trained_agent_1e6",
    video_length=1000
)

In [None]:
# Show every recorded video whose file name starts with "ppo"; the relative
# folder "videos" is resolved against the current working directory
show_videos(video_path="videos", prefix="ppo")

In [None]:
# Create table to plot hyperparameter values and explanations
data = {
    "Parameter": [
        "Environment",
        "Algorithm",
        "Policy Network",
        "Number of Environments (n_envs)",
        "Total Timesteps (n_timesteps)",
        "Rollout Steps (n_steps)",
        "Batch Size (batch_size)",
        "Discount Factor (gamma)",
        "GAE Lambda (gae_lambda)",
        "Number of Epochs (n_epochs)",
        "Entropy Coefficient (ent_coef)"
    ],
    "Value": [
        "LunarLander-v3",
        "PPO",
        "MlpPolicy",
        16,
        1_000_000,
        1024,
        64,
        0.999,
        0.98,
        4,
        0.01
    ],
    "Description": [
        "OpenAI Gym environment for lunar landing control",
        "Proximal Policy Optimization",
        "Actor-Critic Policy",
        "Number of environment copies running in parallel",
        "Total number of environment interactions",
        "Steps collected per environment before update",
        "Minibatch size for PPO updates",
        "Discount factor",
        "Bias–variance trade-off parameter<br>for Generalised advantage function",
        "Number of epoch when optimising the surrogate loss",
        "Entropy coefficient for the loss calculation"
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Display LEFT-ALIGNED styled table (this must be the last line)
df.style.set_properties(**{"text-align": "left"})
