<a href="https://colab.research.google.com/github/Charish53/RL_lab/blob/main/LAB05/CS22B1095_LAB05_Inbuilt_PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pip
!pip install gymnasium
!pip install ale-py




In [None]:
from ale_py import ALEInterface
# Instantiate the ALE emulator interface. `ale` is not referenced again in the
# visible cells. NOTE(review): presumably an install sanity check — confirm.
ale = ALEInterface()

In [None]:
import gymnasium as gym
import ale_py

# Make the environments provided by ale_py available to gym.make.
gym.register_envs(ale_py)

# Build one Pong instance and record its observation shape and the `n` of its
# action space.
env = gym.make('ALE/Pong-v5')
state_size, action_size = env.observation_space.shape, env.action_space.n

In [None]:
!pip install stable-baselines3[extra]




In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
import os
import numpy as np

# Folder names for training logs and for saved models.
model_dir = "ppo_pong_models/"
log_dir = "ppo_pong_logs/"

# Ensure both folders exist; an already-existing folder is not an error.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

def make_env(env_id, seed=0, atari_preprocessing=True):
    """
    Create a factory for a wrapped, seeded Atari environment.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        seed: Seed applied to the action space and to the first ``reset``.
        atari_preprocessing: When True (default), apply Stable-Baselines3's
            ``AtariWrapper`` on top of the environment. Pass False to get the
            environment with only the ``Monitor`` wrapper.

    Returns:
        A zero-argument callable that builds and returns the environment,
        suitable for passing in the list given to ``DummyVecEnv``.
    """
    def _init():
        # Local imports keep the factory self-contained; both modules ship
        # with stable_baselines3, which this notebook already depends on.
        from stable_baselines3.common.atari_wrappers import AtariWrapper
        from stable_baselines3.common.monitor import Monitor

        env = gym.make(env_id)
        env.action_space.seed(seed)
        env.reset(seed=seed)

        # Monitor goes on first so episode statistics are recorded before any
        # wrapper further out changes rewards or episode boundaries.
        env = Monitor(env)

        if atari_preprocessing:
            # frame_skip=1 disables the wrapper's own frame skipping: the
            # "ALE/...-v5" ids used in this notebook already repeat each
            # action inside the emulator.
            # NOTE(review): for "*NoFrameskip-v4" ids the wrapper default
            # (frame_skip=4) is the usual choice — confirm if env_id changes.
            env = AtariWrapper(env, frame_skip=1)
        return env
    return _init

# Pong environment id, shared by the training and evaluation environments
env_id = "ALE/Pong-v5"

# Training environment: a single Pong instance with the last 4 frames stacked,
# so that movement is visible within one observation
env = VecFrameStack(DummyVecEnv([make_env(env_id)]), n_stack=4)

# Evaluation environment: built the same way, but with its own seed
eval_env = VecFrameStack(DummyVecEnv([make_env(env_id, seed=123)]), n_stack=4)

# Periodic evaluation on eval_env: results are logged under log_dir and the
# best model seen so far is saved under model_dir
eval_callback = EvalCallback(
    eval_env,
    eval_freq=10000,
    deterministic=True,
    render=False,
    log_path=log_dir,
    best_model_save_path=model_dir,
)

# Create the PPO agent
model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=log_dir,
    learning_rate=2.5e-4,
    n_steps=128,
    # One rollout holds n_steps * n_envs = 128 * 1 transitions, so a minibatch
    # cannot be larger than 128. The previous value (256) exceeded the rollout
    # size and triggered SB3's "batch_size ... factor of n_steps * n_envs"
    # warning.
    batch_size=128,
    n_epochs=4,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.1,
    ent_coef=0.01,
    seed=0
)

# Train the agent; eval_callback runs alongside training
total_timesteps = 500000
model.learn(
    total_timesteps=total_timesteps,
    callback=eval_callback,
    tb_log_name="ppo_pong"
)

# Save the final model. os.path.join avoids the doubled "/" that
# f"{model_dir}/..." produced, because model_dir already ends with a separator.
model.save(os.path.join(model_dir, "ppo_pong_final"))

# Evaluate the trained agent over 10 deterministic episodes on eval_env
mean_reward, std_reward = evaluate_policy(
    model, eval_env, n_eval_episodes=10, deterministic=True
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")



Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=128 and n_envs=1)


Logging to ppo_pong_logs/ppo_pong_1




----------------------------
| time/              |     |
|    fps             | 72  |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 128 |
----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 62            |
|    iterations           | 2             |
|    time_elapsed         | 4             |
|    total_timesteps      | 256           |
| train/                  |               |
|    approx_kl            | 0.00043728855 |
|    clip_fraction        | 0             |
|    clip_range           | 0.1           |
|    entropy_loss         | -1.79         |
|    explained_variance   | -0.0213       |
|    learning_rate        | 0.00025       |
|    loss                 | 1.52          |
|    n_updates            | 4             |
|    policy_gradient_loss | -0.00297      |
|    value_loss           | 4.78          |
-------------------------------------------
-----



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    total_timesteps      | 464896       |
| train/                  |              |
|    approx_kl            | 0.0039980453 |
|    clip_fraction        | 0.0879       |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.732        |
|    learning_rate        | 0.00025      |
|    loss                 | -0.0341      |
|    n_updates            | 14524        |
|    policy_gradient_loss | -0.0148      |
|    value_loss           | 0.0156       |
------------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 3633        |
|    time_elapsed         | 4979        |
|    total_timesteps      | 465024      |
| train/                  |             |
|    approx_kl            | 0.000985696 |
|    clip_fraction        | 0.00781     

In [None]:
# Function to test the agent by playing a few episodes
def play_episodes(model, env, n_episodes=20, render=True):
    """
    Play episodes with ``model`` on ``env`` and report each episode's return.

    ``env`` is driven through the vectorized interface used in this notebook:
    ``reset()`` returns the observations and ``step(action)`` returns
    ``(obs, rewards, dones, infos)`` with one entry per sub-environment.
    Only the first sub-environment (index 0) is tracked.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        env: Vectorized environment to play on.
        n_episodes: Number of episodes to play.
        render: When True, call ``env.render()`` on every third step.

    Returns:
        A list with the total reward (as ``float``) of each episode, in the
        order the episodes were played.
    """
    episode_rewards = []
    for episode in range(n_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.0
        step = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, _infos = env.step(action)
            # Index sub-environment 0 explicitly: truth-testing the whole
            # `dones` array raises ValueError as soon as the vectorized env
            # holds more than one sub-environment.
            total_reward += float(rewards[0])
            done = bool(dones[0])
            step += 1
            if render and step % 3 == 0:
                env.render()

        print(f"Episode {episode+1}: Total Reward: {total_reward}")
        episode_rewards.append(total_reward)

    return episode_rewards

# Play on the evaluation environment rather than the training one, so the
# environment attached to `model` is not stepped outside of training.
# Rendering is turned off: the environments in this notebook are created
# without a render_mode, so render() would have nothing to display.
play_episodes(model, eval_env, render=False)



Episode 1: Total Reward: -21.0
Episode 2: Total Reward: -21.0
Episode 3: Total Reward: -20.0
Episode 4: Total Reward: -21.0
Episode 5: Total Reward: -21.0
Episode 6: Total Reward: -21.0
Episode 7: Total Reward: -19.0
Episode 8: Total Reward: -21.0
Episode 9: Total Reward: -21.0
Episode 10: Total Reward: -21.0
Episode 11: Total Reward: -19.0
Episode 12: Total Reward: -21.0
Episode 13: Total Reward: -21.0
Episode 14: Total Reward: -19.0
Episode 15: Total Reward: -21.0
Episode 16: Total Reward: -19.0
Episode 17: Total Reward: -21.0
Episode 18: Total Reward: -21.0
Episode 19: Total Reward: -20.0
Episode 20: Total Reward: -21.0
