## 1. Install Dependencies

In [73]:
import subprocess
import sys

# Upgrade gymnasium to latest version
subprocess.check_call([sys.executable, "-m", "pip", "install", "gymnasium", "--upgrade"])
!pip install swig
!pip install "gymnasium[box2d]"
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
from collections import deque
from datetime import datetime
import csv
from typing import Tuple, Any
from torch.distributions import Normal
import matplotlib.pyplot as plt
from gymnasium.wrappers import FrameStackObservation, GrayscaleObservation
import cv2
import warnings
warnings.filterwarnings('ignore')

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## 3. Image Preprocessing Wrappers

In [82]:
import gymnasium as gym
import numpy as np
import cv2
from collections import deque


# ---- Action Repeat (Frame Skip) ----
class ActionRepeatWrapper(gym.Wrapper):
    """Repeat the chosen action for `repeat` frames and sum rewards.

    Args:
        env: Environment to wrap.
        repeat: Number of consecutive inner steps taken per outer ``step``
            call. Must be at least 1.

    Raises:
        ValueError: If ``repeat`` is smaller than 1.
    """

    def __init__(self, env, repeat=4):
        super().__init__(env)
        # With repeat < 1 the loop in step() would never run and there would be
        # no observation to return, so reject it up front.
        if repeat < 1:
            raise ValueError(f"repeat must be >= 1, got {repeat}")
        self.repeat = repeat

    def step(self, action):
        """Apply ``action`` up to ``repeat`` times.

        Returns:
            The last inner observation, the summed reward, and the
            ``terminated`` / ``truncated`` / ``info`` values of the last inner
            step. Stops early as soon as an inner step ends the episode.
        """
        total_reward = 0.0

        for _ in range(self.repeat):
            obs, reward, terminated, truncated, info = self.env.step(action)
            total_reward += reward
            if terminated or truncated:
                break

        return obs, total_reward, terminated, truncated, info


# ---- Grayscale + Resize ----
class GrayscaleResizeWrapper(gym.ObservationWrapper):
    """Convert RGB to grayscale and resize (84x84 by default). Output: (1, H, W).

    Args:
        env: Environment to wrap. Observations are passed to
            ``cv2.cvtColor(..., cv2.COLOR_RGB2GRAY)``, so they are assumed to
            be RGB images.
        shape: Target frame size as ``(height, width)``.
    """

    def __init__(self, env, shape=(84, 84)):
        super().__init__(env)
        self.shape = shape

        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(1, shape[0], shape[1]),
            dtype=np.uint8
        )

    def observation(self, obs):
        """Return ``obs`` as a single-channel frame of shape (1, H, W)."""
        # Convert to grayscale
        gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        # cv2.resize expects dsize as (width, height), whereas self.shape is
        # (height, width); swap so non-square targets match observation_space.
        target_height, target_width = self.shape[0], self.shape[1]
        resized = cv2.resize(gray, (target_width, target_height), interpolation=cv2.INTER_AREA)
        # Add channel dimension (C, H, W)
        return resized[np.newaxis, :, :]


# ---- Frame Stacking ----
class FrameStackingWrapper(gym.ObservationWrapper):
    """Stack the last N channel-first frames. Output: (N * C, H, W).

    Each incoming observation must have shape (C, H, W); with the grayscale
    wrapper C is 1, so the output is (N, H, W).

    Args:
        env: Environment whose ``observation_space.shape`` is (C, H, W).
        num_frames: Number of most recent frames kept in the stack.
    """

    def __init__(self, env, num_frames: int = 4):
        super().__init__(env)
        self.num_frames = num_frames

        c, h, w = env.observation_space.shape  # (1, 84, 84) after GrayscaleResizeWrapper

        # Frames are concatenated along axis 0, so the stacked channel count is
        # num_frames * c (equal to num_frames for single-channel frames).
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(num_frames * c, h, w),
            dtype=np.uint8
        )

        # maxlen makes the deque drop the oldest frame automatically on append.
        self.frames = deque(maxlen=num_frames)

    def reset(self, **kwargs):
        """Reset the env and fill the whole stack with the first observation."""
        obs, info = self.env.reset(**kwargs)
        self.frames.clear()
        for _ in range(self.num_frames):
            self.frames.append(obs)
        return self._get_stacked(), info

    def step(self, action):
        """Step the env, push the new frame and return the stacked observation."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.frames.append(obs)
        return self._get_stacked(), reward, terminated, truncated, info

    def _get_stacked(self):
        # Concatenate frames along the channel dimension, oldest first.
        return np.concatenate(list(self.frames), axis=0)


# ---- Full preprocessing pipeline ----
class CarRacingPreprocessor:
    """Builds the observation pipeline: ActionRepeat → Grayscale+Resize → FrameStack."""

    @staticmethod
    def apply(env: gym.Env, use_grayscale=True, num_frames=4, action_repeat=4) -> gym.Env:
        """Wrap ``env`` with every enabled preprocessing stage, in pipeline order.

        A stage is skipped when its switch is off: ``action_repeat <= 1``,
        ``use_grayscale`` falsy, or ``num_frames <= 1``.
        """
        # (enabled?, wrapper factory) pairs, listed innermost wrapper first.
        stages = (
            (action_repeat > 1, lambda inner: ActionRepeatWrapper(inner, repeat=action_repeat)),
            (use_grayscale, lambda inner: GrayscaleResizeWrapper(inner, shape=(84, 84))),
            (num_frames > 1, lambda inner: FrameStackingWrapper(inner, num_frames=num_frames)),
        )

        for enabled, wrap in stages:
            if enabled:
                env = wrap(env)

        return env


print("Preprocessing wrappers with action repeat, resize, grayscale, and frame stack initialized.")


Preprocessing wrappers with action repeat, resize, grayscale, and frame stack initialized.


## 4. Neural Network Models

In [87]:
def weights_init_(m):
    """Initialize network weights.

    Meant to be passed to ``nn.Module.apply``. ``nn.Linear`` layers get
    Xavier-uniform weights (gain 1) and, when they have one, a zero bias.
    All other module types are left untouched.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(m.weight, gain=1)
    # A Linear built with bias=False has m.bias set to None; skip it instead of crashing.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0)


class CarRacingCNNPPOEncoder(nn.Module):
    """CNN encoder for CarRacing with grayscale frame stacking.

    Three convolutions followed by one fully connected layer map a stack of
    frames to a ``feature_dim``-wide feature vector.

    Args:
        feature_dim: Size of the output feature vector.
        input_channels: Number of stacked frames (input channels).
        input_size: ``(height, width)`` of each frame. Defaults to the 84x84
            frames produced by the preprocessing wrappers.
    """

    def __init__(self, feature_dim: int = 256, input_channels: int = 4,
                 input_size: Tuple[int, int] = (84, 84)):
        super().__init__()
        self.feature_dim = feature_dim

        # Shape comments are for the default 84x84 input.
        self.conv = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),  # -> 32x20x20
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),              # -> 64x9x9
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),              # -> 64x7x7
            nn.ReLU(inplace=True),
        )

        # Infer the flattened size from a dummy pass rather than hard-coding it,
        # so the linear layer always matches the configured frame size.
        with torch.no_grad():
            probe = torch.zeros(1, input_channels, *input_size)
            conv_out_dim = self.conv(probe).numel()  # 3136 for 84x84
        self.fc = nn.Linear(conv_out_dim, feature_dim)

        self.apply(weights_init_)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a (B, C, H, W) batch, or a single (C, H, W) stack, into (B, feature_dim)."""
        if x.dim() == 3:  # (C, H, W) -> add batch axis
            x = x.unsqueeze(0)

        x = x.float() / 255.0  # scale pixel values by 1/255
        x = self.conv(x)
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc(x))

        return x


class PPOActorCNN(nn.Module):
    """Gaussian policy head on top of the CNN encoder.

    The mean is state-dependent; the log standard deviation is a learned
    parameter with one entry per action dimension, independent of the state.
    """

    def __init__(self, encoder: CarRacingCNNPPOEncoder, action_dim: int, hidden_dim: int = 256):
        super().__init__()
        self.encoder = encoder

        in_features = encoder.feature_dim
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.mean = nn.Linear(hidden_dim, action_dim)
        self.log_std = nn.Parameter(torch.zeros(action_dim))

        # Applies to every Linear reachable from this module, including the encoder's.
        self.apply(weights_init_)

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``."""
        hidden = torch.tanh(self.fc1(self.encoder(state)))
        hidden = torch.tanh(self.fc2(hidden))
        action_mean = self.mean(hidden)
        action_std = torch.exp(self.log_std).expand_as(action_mean)
        return action_mean, action_std

    def get_dist(self, state):
        """Return the ``Normal`` action distribution for ``state``."""
        return Normal(*self.forward(state))


class PPOCriticCNN(nn.Module):
    """Critic network for PPO with CNN encoder.

    Maps encoder features through two tanh hidden layers to one scalar value
    estimate per input state.
    """

    def __init__(self, encoder: CarRacingCNNPPOEncoder, hidden_dim: int = 256):
        super().__init__()
        self.encoder = encoder

        self.fc1 = nn.Linear(encoder.feature_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, 1)

        self.apply(weights_init_)

    def forward(self, state):
        """Return the value estimate for ``state`` with shape (B, 1)."""
        features = self.encoder(state)
        x = torch.tanh(self.fc1(features))
        # The original forward stopped after fc1 and implicitly returned None;
        # finish the head so the critic actually produces a value.
        x = torch.tanh(self.fc2(x))
        value = self.value(x)
        return value

def weights_init_(m):
    """Initialize network weights.

    Meant to be passed to ``nn.Module.apply``. ``nn.Linear`` layers get
    Xavier-uniform weights (gain 1) and, when they have one, a zero bias.
    All other module types are left untouched.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(m.weight, gain=1)
    # A Linear built with bias=False has m.bias set to None; skip it instead of crashing.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0)


class CarRacingCNNPPOEncoder(nn.Module):
    """CNN encoder for CarRacing with grayscale 84x84 frames and frame stacking.

    Three convolutions followed by one fully connected layer map a stack of
    frames to a ``feature_dim``-wide feature vector.

    Args:
        feature_dim: Size of the output feature vector.
        input_channels: Number of stacked frames (input channels).
        input_size: ``(height, width)`` of each frame. The default (84, 84)
            reproduces the previous fixed 64 * 7 * 7 = 3136 flatten size.
    """

    def __init__(self, feature_dim: int = 256, input_channels: int = 4,
                 input_size: Tuple[int, int] = (84, 84)):
        super().__init__()
        self.feature_dim = feature_dim

        # Shape comments are for the default 84x84 input.
        self.conv = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),  # 32 x 20 x 20
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),              # 64 x 9 x 9
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),              # 64 x 7 x 7
            nn.ReLU(inplace=True),
        )

        # Infer the flattened size from a dummy pass so a different frame size
        # cannot silently disagree with the linear layer.
        with torch.no_grad():
            probe = torch.zeros(1, input_channels, *input_size)
            conv_out_dim = self.conv(probe).numel()
        self.fc = nn.Linear(conv_out_dim, feature_dim)

        self.apply(weights_init_)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a (B, C, H, W) batch, or a single (C, H, W) stack, into (B, feature_dim)."""
        if x.dim() == 3:  # (C, H, W) -> add batch axis
            x = x.unsqueeze(0)

        x = x.float() / 255.0  # normalize
        x = self.conv(x)
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc(x))

        return x


class PPOActorCNN(nn.Module):
    """Gaussian policy head on top of the CNN encoder.

    The mean is state-dependent; the log standard deviation is a learned
    parameter with one entry per action dimension, independent of the state.
    """

    def __init__(self, encoder: CarRacingCNNPPOEncoder, action_dim: int, hidden_dim: int = 256):
        super().__init__()
        self.encoder = encoder

        in_features = encoder.feature_dim
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.mean = nn.Linear(hidden_dim, action_dim)
        self.log_std = nn.Parameter(torch.zeros(action_dim))

        # Applies to every Linear reachable from this module, including the encoder's.
        self.apply(weights_init_)

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``."""
        hidden = torch.tanh(self.fc1(self.encoder(state)))
        hidden = torch.tanh(self.fc2(hidden))
        action_mean = self.mean(hidden)
        action_std = torch.exp(self.log_std).expand_as(action_mean)
        return action_mean, action_std

    def get_dist(self, state):
        """Return the ``Normal`` action distribution for ``state``."""
        return Normal(*self.forward(state))


class PPOCriticCNN(nn.Module):
    """State-value head on top of the CNN encoder (two tanh hidden layers, one output)."""

    def __init__(self, encoder: CarRacingCNNPPOEncoder, hidden_dim: int = 256):
        super().__init__()
        self.encoder = encoder

        in_features = encoder.feature_dim
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, 1)

        # Applies to every Linear reachable from this module, including the encoder's.
        self.apply(weights_init_)

    def forward(self, state):
        """Return the value estimate for ``state`` with a trailing dimension of size 1."""
        hidden = torch.tanh(self.fc1(self.encoder(state)))
        return self.value(torch.tanh(self.fc2(hidden)))

print("Network models initialized for 84x84 grayscale stacked frames")


Network models initialized for 84x84 grayscale stacked frames


## 5. Rollout Buffer

In [76]:
class RolloutBuffer:
    """Parallel per-step lists holding one PPO rollout.

    Index ``t`` of every list describes the same transition.
    """

    # Attribute names, in the order used by add() and get().
    _FIELDS = ('states', 'actions', 'rewards', 'dones', 'log_probs', 'values')

    def __init__(self):
        self.clear()

    def add(self, state, action, reward, done, log_prob, value):
        """Append one transition to the end of every list."""
        record = (state, action, reward, done, log_prob, value)
        for field, item in zip(self._FIELDS, record):
            getattr(self, field).append(item)

    def get(self):
        """Return the live lists as a tuple ordered like ``_FIELDS`` (no copies are made)."""
        return tuple(getattr(self, field) for field in self._FIELDS)

    def clear(self):
        """Start over with fresh empty lists; lists handed out earlier are not modified."""
        for field in self._FIELDS:
            setattr(self, field, [])

print("Rollout buffer initialized")

Rollout buffer initialized


## 6. PPO Agent with CNN

In [88]:
class PPOAgentCNN:
    """PPO Agent with CNN encoder for image-based observations (84x84 grayscale + frame stacking).

    One ``CarRacingCNNPPOEncoder`` instance is shared by the actor and the
    critic, and a single Adam optimizer trains both on the clipped-surrogate
    policy loss, an MSE value loss and an entropy bonus.

    Args:
        action_dim: Size of the continuous action vector.
        hyperparameters: Dict of overrides; every key is optional and falls
            back to the default passed to ``.get()`` in ``__init__``.
        device: Torch device that holds the networks and training batches.
        feature_dim: Width of the encoder's output feature vector.
        num_frames: Number of stacked frames, used as the encoder's input
            channel count.
    """

    def __init__(self, action_dim, hyperparameters, device, feature_dim=256, num_frames=4):
        self.device = device
        self.action_dim = action_dim
        self.feature_dim = feature_dim
        self.input_channels = num_frames  # match frame stack

        # Hyperparameters
        self.gamma = hyperparameters.get('gamma', 0.99)
        self.gae_lambda = hyperparameters.get('gae_lambda', 0.95)
        self.lr = hyperparameters.get('lr', 2.5e-4)
        self.clip_epsilon = hyperparameters.get('clip_epsilon', 0.2)
        self.value_loss_coef = hyperparameters.get('value_loss_coef', 0.5)
        self.entropy_coef = hyperparameters.get('entropy_coef', 0.01)
        self.max_grad_norm = hyperparameters.get('max_grad_norm', 0.5)
        self.ppo_epochs = hyperparameters.get('ppo_epochs', 10)
        self.mini_batch_size = hyperparameters.get('mini_batch_size', 64)
        self.hidden_dim = hyperparameters.get('hidden_dim', 256)

        # Encoder expects input shape: (num_frames, 84, 84)
        self.encoder = CarRacingCNNPPOEncoder(
            feature_dim=feature_dim,
            input_channels=self.input_channels
        ).to(device)

        # Actor and Critic (both hold a reference to the same encoder)
        self.actor = PPOActorCNN(self.encoder, action_dim, self.hidden_dim).to(device)
        self.critic = PPOCriticCNN(self.encoder, self.hidden_dim).to(device)

        # Because the encoder is shared, actor.parameters() + critic.parameters()
        # lists every encoder tensor twice. Passing duplicates to the optimizer
        # or to clip_grad_norm_ makes those tensors get processed twice, so keep
        # one entry per tensor.
        self._trainable_params = self._unique_parameters(self.actor, self.critic)
        self.optimizer = optim.Adam(self._trainable_params, lr=self.lr)

        # Rollout buffer
        self.buffer = RolloutBuffer()

    @staticmethod
    def _unique_parameters(*modules):
        """Collect the parameters of ``modules`` once each, in first-seen order."""
        seen_ids = set()
        unique = []
        for module in modules:
            for param in module.parameters():
                if id(param) not in seen_ids:
                    seen_ids.add(id(param))
                    unique.append(param)
        return unique

    def _as_batch(self, state):
        """Return ``state`` on ``self.device`` with a leading batch axis of size 1.

        NumPy arrays are converted to float32 tensors; tensors are used as given.
        """
        if isinstance(state, np.ndarray):
            state = torch.FloatTensor(state)
        return state.unsqueeze(0).to(self.device)

    def select_action(self, state, eval_mode=False):
        """Choose an action for a single observation.

        Args:
            state: One observation (NumPy array or tensor) without batch axis.
            eval_mode: If True, act with the distribution mean.

        Returns:
            ``action`` as a NumPy array when ``eval_mode`` is True, otherwise
            ``(action, log_prob, value)`` where ``action`` is an unclipped
            sample, ``log_prob`` its summed log-probability and ``value`` the
            critic's estimate, the latter two as Python floats.
        """
        state = self._as_batch(state)

        with torch.no_grad():
            if eval_mode:
                mean, _ = self.actor(state)
                return mean.cpu().numpy()[0]

            dist = self.actor.get_dist(state)
            action = dist.sample()
            log_prob = dist.log_prob(action).sum(dim=-1)
            value = self.critic(state)
            return action.cpu().numpy()[0], log_prob.cpu().item(), value.cpu().item()

    def store_transition(self, state, action, reward, done, log_prob, value):
        """Append one transition to the rollout buffer."""
        self.buffer.add(state, action, reward, done, log_prob, value)

    def compute_gae(self, rewards, values, dones, next_value):
        """Compute Generalized Advantage Estimates for one rollout.

        Args:
            rewards: Per-step rewards.
            values: Per-step value estimates (a list, same length as ``rewards``).
            dones: Per-step flags; a truthy flag stops bootstrapping and
                resets the accumulated advantage at that step.
            next_value: Value estimate of the state following the last step.

        Returns:
            List of advantages, one per step, in rollout order.
        """
        n_steps = len(rewards)
        values = values + [next_value]
        # Filled back to front by index; avoids the quadratic cost of insert(0, ...).
        advantages = [0.0] * n_steps
        gae = 0
        for t in reversed(range(n_steps)):
            not_done = 1 - dones[t]
            delta = rewards[t] + self.gamma * values[t + 1] * not_done - values[t]
            gae = delta + self.gamma * self.gae_lambda * not_done * gae
            advantages[t] = gae
        return advantages

    def _compute_targets(self, rewards, values, dones, next_value):
        """Return ``(normalized_advantages, returns)`` as tensors on ``self.device``.

        Returns are built from the raw GAE advantages plus the rollout values.
        Normalization is applied afterwards and only affects the policy loss.
        """
        raw_advantages = torch.FloatTensor(
            self.compute_gae(rewards, values, dones, next_value)
        ).to(self.device)
        returns = raw_advantages + torch.FloatTensor(values).to(self.device)

        if raw_advantages.numel() > 1:
            advantages = (raw_advantages - raw_advantages.mean()) / (raw_advantages.std() + 1e-8)
        else:
            # std() of a single sample is NaN; leave the lone advantage unscaled.
            advantages = raw_advantages
        return advantages, returns

    def update(self, next_state):
        """Run the PPO update on the buffered rollout, then clear the buffer.

        Args:
            next_state: Observation following the last stored transition; its
                value estimate bootstraps the final advantage.

        Returns:
            Dict with the mean ``policy_loss``, ``value_loss`` and ``entropy``
            over all minibatch updates (zeros if no update was performed).
        """
        states, actions, rewards, dones, old_log_probs, values = self.buffer.get()
        if len(states) == 0:
            return {'policy_loss': 0, 'value_loss': 0, 'entropy': 0}

        # Compute next value
        with torch.no_grad():
            next_value = self.critic(self._as_batch(next_state)).cpu().item()

        advantages, returns = self._compute_targets(rewards, values, dones, next_value)

        # Convert to tensors
        states = torch.FloatTensor(np.array(states)).to(self.device)
        actions = torch.FloatTensor(np.array(actions)).to(self.device)
        old_log_probs = torch.FloatTensor(old_log_probs).to(self.device)

        dataset_size = states.size(0)
        total_policy_loss, total_value_loss, total_entropy = 0, 0, 0
        update_count = 0

        # PPO update loop
        for _ in range(self.ppo_epochs):
            indices = np.random.permutation(dataset_size)
            for start in range(0, dataset_size, self.mini_batch_size):
                idx = indices[start:start + self.mini_batch_size]
                mb_states = states[idx]
                mb_actions = actions[idx]
                mb_old_log_probs = old_log_probs[idx]
                mb_advantages = advantages[idx]
                mb_returns = returns[idx]

                dist = self.actor.get_dist(mb_states)
                new_log_probs = dist.log_prob(mb_actions).sum(dim=-1)
                entropy = dist.entropy().sum(dim=-1).mean()

                # squeeze(-1) keeps the batch axis even for a minibatch of one.
                values_pred = self.critic(mb_states).squeeze(-1)

                ratio = torch.exp(new_log_probs - mb_old_log_probs)
                surr1 = ratio * mb_advantages
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * mb_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = F.mse_loss(values_pred, mb_returns)

                loss = policy_loss + self.value_loss_coef * value_loss - self.entropy_coef * entropy

                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self._trainable_params, self.max_grad_norm)
                self.optimizer.step()

                total_policy_loss += policy_loss.item()
                total_value_loss += value_loss.item()
                total_entropy += entropy.item()
                update_count += 1

        self.buffer.clear()

        return {
            'policy_loss': total_policy_loss / update_count if update_count > 0 else 0,
            'value_loss': total_value_loss / update_count if update_count > 0 else 0,
            'entropy': total_entropy / update_count if update_count > 0 else 0
        }

    def save(self, filepath):
        """Write encoder, actor, critic and optimizer state dicts to ``filepath``."""
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
        torch.save({
            'encoder_state_dict': self.encoder.state_dict(),
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, filepath)

    def load(self, filepath):
        """Restore network weights (and optimizer state when compatible) from ``filepath``."""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.encoder.load_state_dict(checkpoint['encoder_state_dict'])
        self.actor.load_state_dict(checkpoint['actor_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        if 'optimizer_state_dict' in checkpoint:
            try:
                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            except ValueError as exc:
                # A checkpoint whose optimizer listed the shared encoder
                # parameters twice has a different group size; the weights
                # above are still valid, so continue with a fresh optimizer.
                print(f"Optimizer state not restored ({exc}); using a fresh optimizer state")

print("PPO Agent initialized (84x84 + frame stack)") 


PPO Agent initialized (84x84 + frame stack)


In [89]:
# PPO hyperparameters for CarRacing-v3; PPOAgentCNN reads each key with .get().
HYPERPARAMETERS = dict(
    lr=2.5e-4,             # Learning rate (lower for image-based tasks)
    gamma=0.99,            # Discount factor
    gae_lambda=0.95,       # GAE parameter
    clip_epsilon=0.2,      # PPO clipping parameter
    entropy_coef=0.01,     # Entropy coefficient (exploration)
    value_loss_coef=0.5,   # Value loss weight
    max_grad_norm=0.5,     # Gradient clipping
    ppo_epochs=10,         # PPO update epochs
    mini_batch_size=256,   # Mini-batch size
    hidden_dim=512,        # Network hidden dimension
)

# Training schedule, counted in agent steps (one agent step = one env.step call
# on the wrapped environment).
TRAINING_CONFIG = dict(
    total_timesteps=375_000,  # Agent steps (1.5M env frames / frameskip=4)
    eval_frequency=25_000,    # Evaluate every N agent steps
    eval_episodes=10,         # Number of eval episodes
    save_frequency=25_000,    # Save every N agent steps
    rollout_length=4096,      # PPO rollout buffer length
)


print("Configuration loaded")
print(f"Hyperparameters: {HYPERPARAMETERS}")
print(f"Training config: {TRAINING_CONFIG}")

Configuration loaded
Hyperparameters: {'lr': 0.00025, 'gamma': 0.99, 'gae_lambda': 0.95, 'clip_epsilon': 0.2, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'ppo_epochs': 10, 'mini_batch_size': 256, 'hidden_dim': 512}
Training config: {'total_timesteps': 375000, 'eval_frequency': 25000, 'eval_episodes': 10, 'save_frequency': 25000, 'rollout_length': 4096}


## 7. Hyperparameters Configuration

## 8. Setup Environment and Agent

In [90]:
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Number of frames to stack
NUM_FRAMES = 4

# Seed the RNGs this run draws from (network initialization, action sampling,
# minibatch shuffling, track generation) so a fresh Restart & Run All starts
# from the same state. GPU kernels may still introduce small nondeterminism.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


def make_carracing_env(seed=None):
    """Create a CarRacing-v3 env with the standard preprocessing applied.

    Preprocessing is ``CarRacingPreprocessor.apply`` with grayscale frames and
    ``NUM_FRAMES`` stacked frames (action repeat keeps its default).

    Args:
        seed: If given, the environment is reset once with this seed so that
            later ``reset()`` calls continue from a seeded RNG.
    """
    base_env = gym.make('CarRacing-v3', continuous=True, render_mode=None)
    wrapped_env = CarRacingPreprocessor.apply(base_env, use_grayscale=True, num_frames=NUM_FRAMES)
    if seed is not None:
        wrapped_env.reset(seed=seed)
    return wrapped_env


# Training and evaluation environments use different seeds so evaluation does
# not replay the training tracks.
env = make_carracing_env(seed=SEED)
eval_env = make_carracing_env(seed=SEED + 1)

# Get action dimension
action_dim = env.action_space.shape[0]

print(f"\nEnvironment: CarRacing-v3")
print(f"Action dimension: {action_dim}")
print(f"Observation shape: {env.observation_space.shape}")  # should be (num_frames, 84, 84)
print(f"Action space: {env.action_space}")

# Initialize PPO agent
agent = PPOAgentCNN(
    action_dim=action_dim,
    hyperparameters=HYPERPARAMETERS,
    device=device,
    feature_dim=256,
    num_frames=NUM_FRAMES  # matches stacked frames
)

print("\nAgent initialized successfully!")
print(f"Actor parameters: {sum(p.numel() for p in agent.actor.parameters())}")
print(f"Critic parameters: {sum(p.numel() for p in agent.critic.parameters())}")


Using device: cuda

Environment: CarRacing-v3
Action dimension: 3
Observation shape: (4, 84, 84)
Action space: Box([-1.  0.  0.], 1.0, (3,), float32)

Agent initialized successfully!
Actor parameters: 1276838
Critic parameters: 1275809


## 9. Training Loop

In [91]:
# Unpack the training schedule from TRAINING_CONFIG into module-level names.
total_timesteps, eval_frequency, eval_episodes, save_frequency, rollout_length = (
    TRAINING_CONFIG[key]
    for key in ('total_timesteps', 'eval_frequency', 'eval_episodes',
                'save_frequency', 'rollout_length')
)

# First observation of the first episode
state, _ = env.reset()  # reset() returns (obs, info)
state = np.array(state, dtype=np.uint8)  # store frames as uint8

# Running counters for the episode in progress and for the whole run
episode_reward = 0
episode_length = 0
episodes_trained = 0
timestep = 0
episode_rewards = []
best_eval_reward = -np.inf

# Training metrics: one independent list per series
training_metrics = {
    series: []
    for series in ('timesteps', 'episode_rewards', 'eval_rewards',
                   'policy_losses', 'value_losses', 'entropies')
}

banner = "="*60
print("\n" + banner)
print("Starting Training...")
print(banner)
print(f"Total timesteps: {total_timesteps:,}")
print(f"Rollout length: {rollout_length:,}")
print(f"Observation shape: {state.shape}")
print(f"Device: {device}\n")



Starting Training...
Total timesteps: 375,000
Rollout length: 4,096
Observation shape: (4, 84, 84)
Device: cuda



## 10. Main Training Loop (Run this cell for training)

In [None]:
# Main training loop
#
# Alternates between collecting `rollout_length` transitions with the current
# policy and running one PPO update on them, until `total_timesteps` agent
# steps have been taken. Evaluation runs every `eval_frequency` steps.

# Timestep at which the next periodic checkpoint is due. Checkpoints are only
# written right after a PPO update, and update timesteps (multiples of
# rollout_length) rarely coincide with multiples of save_frequency, so the
# schedule is tracked as a threshold instead of an exact-multiple test.
next_save_timestep = (timestep // save_frequency + 1) * save_frequency

while timestep < total_timesteps:
    # Rollout for rollout_length steps
    for step in range(rollout_length):
        # Sample an action; log_prob and value refer to this raw sample.
        raw_action, log_prob, value = agent.select_action(state, eval_mode=False)

        # The environment only accepts actions within its bounds, so step with
        # a clipped copy.
        action = np.clip(raw_action, env.action_space.low, env.action_space.high)

        # Step environment
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Keep frames as uint8
        next_state = np.array(next_state, dtype=np.uint8)

        # Store the raw sample, not the clipped action: the PPO ratio is
        # computed from log_prob(stored action), which must match log_prob.
        agent.store_transition(state, raw_action, reward, done, log_prob, value)

        # Update metrics
        episode_reward += reward
        episode_length += 1
        timestep += 1

        state = next_state

        # Episode reset
        if done:
            episode_rewards.append(episode_reward)
            episodes_trained += 1

            print(f"Episode {episodes_trained}: Reward={episode_reward:.2f}, "
                  f"Length={episode_length}, Timestep={timestep}")

            episode_reward = 0
            episode_length = 0
            state, _ = env.reset()
            state = np.array(state, dtype=np.uint8)

        # Evaluation (deterministic policy on the separate eval environment)
        if timestep % eval_frequency == 0 and timestep > 0:
            print(f"\n--- Evaluation at timestep {timestep:,} ---")
            eval_rewards = []

            for _ in range(eval_episodes):
                eval_state, _ = eval_env.reset()
                eval_state = np.array(eval_state, dtype=np.uint8)
                eval_episode_reward = 0
                eval_done = False

                while not eval_done:
                    eval_action = agent.select_action(eval_state, eval_mode=True)
                    eval_action = np.clip(eval_action, eval_env.action_space.low, eval_env.action_space.high)
                    eval_next_state, eval_reward, eval_terminated, eval_truncated, _ = eval_env.step(eval_action)
                    eval_done = eval_terminated or eval_truncated
                    eval_episode_reward += eval_reward
                    eval_state = np.array(eval_next_state, dtype=np.uint8)

                eval_rewards.append(eval_episode_reward)

            eval_mean = np.mean(eval_rewards)
            eval_std = np.std(eval_rewards)
            print(f"Eval Mean: {eval_mean:.2f} ± {eval_std:.2f}")
            print(f"Eval Rewards: {eval_rewards}\n")

            training_metrics['timesteps'].append(timestep)
            training_metrics['eval_rewards'].append(eval_mean)

            # Save best model
            if eval_mean > best_eval_reward:
                best_eval_reward = eval_mean
                os.makedirs('models', exist_ok=True)
                model_path = 'models/PPO_CarRacing_best.pth'
                agent.save(model_path)
                print(f"Saved best model to {model_path}\n")

        if timestep >= total_timesteps:
            break

    # PPO update
    if timestep % rollout_length == 0 or timestep >= total_timesteps:
        print(f"Timestep {timestep}: PPO Update...")
        update_info = agent.update(state)

        print(f"  Policy Loss: {update_info['policy_loss']:.4f}")
        print(f"  Value Loss: {update_info['value_loss']:.4f}")
        print(f"  Entropy: {update_info['entropy']:.4f}\n")

        training_metrics['policy_losses'].append(update_info['policy_loss'])
        training_metrics['value_losses'].append(update_info['value_loss'])
        training_metrics['entropies'].append(update_info['entropy'])

        # Periodic save: first update at or past each save_frequency boundary
        if timestep >= next_save_timestep:
            os.makedirs('models', exist_ok=True)
            checkpoint_path = f"models/PPO_CarRacing_checkpoint_{timestep}.pth"
            agent.save(checkpoint_path)
            print(f"Saved checkpoint to {checkpoint_path}\n")
            next_save_timestep = (timestep // save_frequency + 1) * save_frequency

# Final save
os.makedirs('models', exist_ok=True)
final_model_path = 'models/PPO_CarRacing_final.pth'
agent.save(final_model_path)
print(f"\nTraining complete! Final model saved to {final_model_path}")


Episode 1: Reward=-45.21, Length=250, Timestep=250
Episode 2: Reward=-51.61, Length=250, Timestep=500
Episode 3: Reward=-2.68, Length=250, Timestep=750
Episode 4: Reward=-48.72, Length=250, Timestep=1000
Episode 5: Reward=-7.53, Length=250, Timestep=1250
Episode 6: Reward=-56.67, Length=250, Timestep=1500
Episode 7: Reward=-34.58, Length=250, Timestep=1750
Episode 8: Reward=-65.03, Length=250, Timestep=2000
Episode 9: Reward=-68.31, Length=250, Timestep=2250
Episode 10: Reward=-25.17, Length=250, Timestep=2500
Episode 11: Reward=-73.24, Length=250, Timestep=2750
Episode 12: Reward=-32.10, Length=250, Timestep=3000
Episode 13: Reward=-49.79, Length=250, Timestep=3250
Episode 14: Reward=-34.55, Length=250, Timestep=3500
Episode 15: Reward=-50.76, Length=250, Timestep=3750
Episode 16: Reward=-55.39, Length=250, Timestep=4000
Timestep 4096: PPO Update...
  Policy Loss: 0.1748
  Value Loss: 0.5091
  Entropy: 4.2542

Episode 17: Reward=-9.42, Length=250, Timestep=4250
Episode 18: Reward=-36.

## 11. Save Training Statistics

In [None]:
# Write one CSV row per finished training episode (1-based episode number, reward).
os.makedirs('results', exist_ok=True)
results_file = 'results/PPO_CarRacing_training_rewards.csv'

with open(results_file, 'w', newline='') as csv_file:
    rewards_writer = csv.writer(csv_file)
    rewards_writer.writerow(['Episode', 'Reward'])
    rewards_writer.writerows(
        [episode_number, episode_total]
        for episode_number, episode_total in enumerate(episode_rewards, start=1)
    )

print(f"Training rewards saved to {results_file}")
print(f"Total episodes trained: {episodes_trained}")
print(f"Total timesteps: {timestep:,}")
print(f"\nTraining Statistics:")
# Summary statistics are only defined once at least one episode has finished.
if len(episode_rewards) > 0:
    print(f"Mean episode reward: {np.mean(episode_rewards):.2f}")
    print(f"Std episode reward: {np.std(episode_rewards):.2f}")
    print(f"Max episode reward: {np.max(episode_rewards):.2f}")
    print(f"Min episode reward: {np.min(episode_rewards):.2f}")

## 12. Visualization of Training Progress

In [None]:
import matplotlib.pyplot as plt

# 2x2 overview: raw episode rewards, their running average, PPO losses per
# update, and evaluation rewards over timesteps.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Episode rewards over time
axes[0, 0].plot(episode_rewards, label='Episode Reward')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Reward')
axes[0, 0].set_title('Episode Rewards During Training')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].legend()

# Running average of episode rewards
if episode_rewards:
    window = min(50, len(episode_rewards))
    running_avg = np.convolve(episode_rewards, np.ones(window)/window, mode='valid')
    axes[0, 1].plot(running_avg, label=f'Running Average (window={window})', color='orange')
    axes[0, 1].set_xlabel('Episode')
    axes[0, 1].set_ylabel('Reward')
    axes[0, 1].set_title('Running Average of Episode Rewards')
    axes[0, 1].grid(True, alpha=0.3)
    axes[0, 1].legend()

# Policy and Value losses
if training_metrics['policy_losses']:
    policy_losses = training_metrics['policy_losses']
    value_losses = training_metrics['value_losses']
    axes[1, 0].plot(policy_losses, label='Policy Loss', marker='o')
    axes[1, 0].plot(value_losses, label='Value Loss', marker='s')
    axes[1, 0].set_xlabel('Update')
    axes[1, 0].set_ylabel('Loss')
    axes[1, 0].set_title('Policy and Value Losses')
    axes[1, 0].grid(True, alpha=0.3)
    axes[1, 0].legend()
    # A log axis cannot show zero or negative values, and the clipped-surrogate
    # policy loss can be negative; only switch to log when every point is positive.
    if min(policy_losses + value_losses) > 0:
        axes[1, 0].set_yscale('log')

# Evaluation rewards
if training_metrics['eval_rewards']:
    axes[1, 1].plot(training_metrics['timesteps'], training_metrics['eval_rewards'],
                     label='Eval Mean Reward', marker='o', color='green', linewidth=2)
    axes[1, 1].set_xlabel('Timestep')
    axes[1, 1].set_ylabel('Reward')
    axes[1, 1].set_title('Evaluation Mean Rewards')
    axes[1, 1].grid(True, alpha=0.3)
    axes[1, 1].legend()

plt.tight_layout()
# Create the output directory here so this cell does not depend on another
# cell having created it.
os.makedirs('results', exist_ok=True)
plt.savefig('results/training_progress.png', dpi=150)
plt.show()

print("Training progress visualization saved to results/training_progress.png")

## 13. Test Trained Agent

In [None]:
# Build a fresh evaluation environment with the same preprocessing as training
test_env = gym.make('CarRacing-v3', continuous=True, render_mode=None)
test_env = CarRacingPreprocessor.apply(test_env, use_grayscale=True, num_frames=4)

# Create agent for testing. PPOAgentCNN takes the frame-stack depth as
# `num_frames`; it has no `input_channels` argument.
test_agent = PPOAgentCNN(
    action_dim=test_env.action_space.shape[0],
    hyperparameters=HYPERPARAMETERS,
    device=device,
    feature_dim=256,
    num_frames=4
)

# Load best model
model_path = 'models/PPO_CarRacing_best.pth'
if os.path.exists(model_path):
    test_agent.load(model_path)
    print(f"Loaded model from {model_path}")
else:
    print(f"Warning: Model file not found at {model_path}")
    print("Using current agent for testing")

# Number of test episodes to run
test_episodes = 100
test_rewards = []

print(f"\nTesting trained agent for {test_episodes} episodes...\n")

for ep in range(test_episodes):
    test_state, _ = test_env.reset()
    test_reward = 0
    test_done = False

    # Act with the distribution mean until the episode ends
    while not test_done:
        test_action = test_agent.select_action(test_state, eval_mode=True)
        test_action = np.clip(test_action, test_env.action_space.low, test_env.action_space.high)
        test_next_state, test_r, test_terminated, test_truncated, _ = test_env.step(test_action)
        test_done = test_terminated or test_truncated
        test_reward += test_r
        test_state = test_next_state

    test_rewards.append(test_reward)
    print(f"Test Episode {ep+1}: Reward = {test_reward:.2f}")

test_env.close()

print(f"\nTest Statistics:")
print(f"Mean test reward: {np.mean(test_rewards):.2f}")
print(f"Std test reward: {np.std(test_rewards):.2f}")
print(f"Max test reward: {np.max(test_rewards):.2f}")
print(f"Min test reward: {np.min(test_rewards):.2f}")

## 14. Summary and Notes

### Training Complete!

**Key Achievements:**
- Implemented PPO with CNN encoder for visual input processing
- Applied grayscale conversion and frame stacking for efficient observation handling
- Successfully trained agent on CarRacing-v3 environment
- Saved checkpoints and best models
- Evaluated agent performance

**Model Architecture:**
- **CNN Encoder**: 4 input channels (stacked grayscale frames) → 256-dim feature vector
- **Actor Network**: 256-dim input → 2 hidden layers (512 units each, set by `hidden_dim`) → continuous actions
- **Critic Network**: 256-dim input → 2 hidden layers (512 units each, set by `hidden_dim`) → scalar value

**Next Steps for Kaggle Training:**
1. Adjust `total_timesteps` for longer training (2-3M steps recommended)
2. Fine-tune learning rate and entropy coefficient
3. Consider curriculum learning or reward shaping
4. Monitor GPU memory usage
5. Save regular checkpoints for recovery

**Files Generated:**
- `models/PPO_CarRacing_best.pth` - Best model by evaluation reward
- `models/PPO_CarRacing_final.pth` - Final model after training
- `results/PPO_CarRacing_training_rewards.csv` - Episode rewards log
- `results/training_progress.png` - Training visualization