In [1]:
import os
import math
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from typing import List, Dict, Tuple
import matplotlib.pyplot as plt
from tqdm import tqdm
from waymo_open_dataset.protos import end_to_end_driving_data_pb2 as wod_e2ed_pb2
from waymo_open_dataset.protos import end_to_end_driving_submission_pb2 as wod_e2ed_submission_pb2

2025-04-08 20:20:26.038352: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-08 20:20:26.039777: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-08 20:20:26.067672: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-08 20:20:26.068173: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Constants
NUM_CAMERAS = 8  # 8 cameras for 360-degree view
IMAGE_HEIGHT = 512
IMAGE_WIDTH = 512
NUM_PAST_STEPS = 16  # 4 seconds of past data at 4Hz
NUM_FUTURE_STEPS = 20  # 5 seconds of future data at 4Hz
BATCH_SIZE = 8
LEARNING_RATE = 3e-4
NUM_EPOCHS = 30

In [None]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class WaymoE2EDataset(Dataset):
    """Dataset for Waymo End-to-End Driving Challenge."""
    
    def __init__(self, tfrecord_files, transform=None):
        """
        Args:
            tfrecord_files: List of paths to TFRecord files
            transform: Optional transform to be applied on images
        """
        self.tfrecord_files = tfrecord_files
        self.transform = transform
        self.examples = self._load_examples()
        
    def _load_examples(self):
        """Load and parse examples from TFRecord files."""
        examples = []
        
        # Create dataset from TFRecord files
        dataset = tf.data.TFRecordDataset(self.tfrecord_files, compression_type='')
        
        # Parse each example and store necessary information
        for raw_example in tqdm(dataset, desc="Loading dataset"):
            data = wod_e2ed_pb2.E2EDFrame()
            data.ParseFromString(raw_example.numpy())
            
            example = {
                'frame_name': data.frame.context.name,
                'images': [self._decode_image(img.image) for img in data.frame.images],
                'past_states': {
                    'pos_x': np.array(data.past_states.pos_x, dtype=np.float32),
                    'pos_y': np.array(data.past_states.pos_y, dtype=np.float32),
                    'vel_x': np.array(data.past_states.vel_x, dtype=np.float32) if data.past_states.vel_x else np.zeros(NUM_PAST_STEPS, dtype=np.float32),
                    'vel_y': np.array(data.past_states.vel_y, dtype=np.float32) if data.past_states.vel_y else np.zeros(NUM_PAST_STEPS, dtype=np.float32)
                },
                'future_states': {
                    'pos_x': np.array(data.future_states.pos_x, dtype=np.float32),
                    'pos_y': np.array(data.future_states.pos_y, dtype=np.float32)
                },
                'intent': data.intent
            }
            
            examples.append(example)
        
        return examples
    
    def _decode_image(self, image_bytes):
        """Decode image bytes to numpy array."""
        image = tf.io.decode_image(image_bytes).numpy()
        
        if self.transform:
            image = self.transform(image)
            
        return image
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples[idx]
        
        # Stack images for batch processing
        images = torch.stack([torch.from_numpy(img).permute(2, 0, 1).float() / 255.0 
                             for img in example['images']])
        
        # Create past trajectory tensor [past_steps, features]
        past_trajectory = np.stack([
            example['past_states']['pos_x'], 
            example['past_states']['pos_y'],
            example['past_states']['vel_x'],
            example['past_states']['vel_y']
        ], axis=1)
        
        # Create future trajectory tensor [future_steps, 2] (only positions x,y)
        future_trajectory = np.stack([
            example['future_states']['pos_x'],
            example['future_states']['pos_y']
        ], axis=1)
        
        # One-hot encode intent
        intent = np.zeros(4, dtype=np.float32)  # 4 for UNKNOWN, GO_STRAIGHT, GO_LEFT, GO_RIGHT
        intent[example['intent']] = 1.0
        
        return {
            'frame_name': example['frame_name'],
            'images': images,
            'past_trajectory': torch.from_numpy(past_trajectory).float(),
            'future_trajectory': torch.from_numpy(future_trajectory).float(),
            'intent': torch.from_numpy(intent).float()
        }


In [5]:
class ImageEncoder(nn.Module):
    """CNN encoder for processing camera images."""
    
    def __init__(self, output_dim=256):
        super(ImageEncoder, self).__init__()
        # Use ResNet18 as the backbone, removing the final classification layer
        resnet = models.resnet18(pretrained=True)
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])
        
        # Project features to the desired output dimension
        self.projection = nn.Linear(512, output_dim)
        
    def forward(self, x):
        """
        Args:
            x: Tensor of shape [batch_size, num_cameras, channels, height, width]
        
        Returns:
            Tensor of shape [batch_size, num_cameras, output_dim]
        """
        batch_size, num_cameras = x.shape[0], x.shape[1]
        
        # Reshape for processing each image independently
        x = x.view(batch_size * num_cameras, 3, IMAGE_HEIGHT, IMAGE_WIDTH)
        
        # Extract features
        features = self.backbone(x)
        features = features.view(batch_size * num_cameras, -1)
        
        # Project features
        features = self.projection(features)
        
        # Reshape back to [batch_size, num_cameras, output_dim]
        features = features.view(batch_size, num_cameras, -1)
        
        return features


In [6]:
class TrajectoryEncoder(nn.Module):
    """Encoder for processing past trajectory data."""
    
    def __init__(self, input_dim=4, hidden_dim=128, output_dim=256):
        super(TrajectoryEncoder, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim, 
            hidden_size=hidden_dim, 
            num_layers=2, 
            batch_first=True,
            bidirectional=True
        )
        self.projection = nn.Linear(hidden_dim * 2, output_dim)
        
    def forward(self, x):
        """
        Args:
            x: Tensor of shape [batch_size, past_steps, features]
        
        Returns:
            Tensor of shape [batch_size, output_dim]
        """
        # Process the trajectory sequence
        output, (hidden, _) = self.lstm(x)
        
        # Use the final hidden state from both directions
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        
        # Project to output dimension
        hidden = self.projection(hidden)
        
        return hidden

In [7]:
class TransformerFusion(nn.Module):
    """Transformer for fusing multi-modal features."""
    
    def __init__(self, input_dim=256, num_heads=8, num_layers=4):
        super(TransformerFusion, self).__init__()
        
        # Transformer encoder layer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=input_dim * 4,
            batch_first=True
        )
        
        # Stack multiple encoder layers
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
    def forward(self, image_features, trajectory_features, intent_features):
        """
        Args:
            image_features: Tensor of shape [batch_size, num_cameras, feature_dim]
            trajectory_features: Tensor of shape [batch_size, feature_dim]
            intent_features: Tensor of shape [batch_size, feature_dim]
            
        Returns:
            Tensor of shape [batch_size, (num_cameras + 2), feature_dim]
        """
        batch_size = image_features.shape[0]
        feature_dim = image_features.shape[2]
        
        # Reshape trajectory and intent features to match image_features dimensions
        trajectory_features = trajectory_features.view(batch_size, 1, feature_dim)
        intent_features = intent_features.view(batch_size, 1, feature_dim)
        
        # Concatenate all features along the sequence dimension
        # [batch_size, num_cameras + 2, feature_dim]
        features = torch.cat([image_features, trajectory_features, intent_features], dim=1)
        
        # Apply transformer to fuse features
        fused_features = self.transformer(features)
        
        return fused_features

In [8]:
class TrajectoryDecoder(nn.Module):
    """Decoder for predicting future trajectory."""
    
    def __init__(self, input_dim=256, hidden_dim=512, output_steps=20):
        super(TrajectoryDecoder, self).__init__()
        
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_steps * 2)  # 2 for (x, y) coordinates
        )
        
        self.output_steps = output_steps
        
    def forward(self, x):
        """
        Args:
            x: Tensor of shape [batch_size, feature_dim]
            
        Returns:
            Tensor of shape [batch_size, output_steps, 2]
        """
        # Apply MLP to get flattened trajectory
        trajectory = self.mlp(x)
        
        # Reshape to [batch_size, output_steps, 2]
        trajectory = trajectory.view(-1, self.output_steps, 2)
        
        return trajectory

In [9]:
class IntentEncoder(nn.Module):
    """Encoder for processing intent data."""
    
    def __init__(self, input_dim=4, output_dim=256):
        super(IntentEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )
        
    def forward(self, x):
        """
        Args:
            x: Tensor of shape [batch_size, 4] (one-hot encoded intent)
            
        Returns:
            Tensor of shape [batch_size, output_dim]
        """
        return self.encoder(x)

In [12]:
class E2EDrivingModel(nn.Module):
    """End-to-end driving model that predicts future trajectory."""
    
    def __init__(self, feature_dim=256):
        super(E2EDrivingModel, self).__init__()
        
        # Encoders
        self.image_encoder = ImageEncoder(output_dim=feature_dim)
        self.trajectory_encoder = TrajectoryEncoder(output_dim=feature_dim)
        self.intent_encoder = IntentEncoder(output_dim=feature_dim)
        
        # Feature fusion
        self.transformer = TransformerFusion(input_dim=feature_dim)
        
        # Trajectory prediction
        self.decoder = TrajectoryDecoder(input_dim=feature_dim)
        
        # Global feature aggregation
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        
    def forward(self, images, past_trajectory, intent):
        """
        Args:
            images: Tensor of shape [batch_size, num_cameras, channels, height, width]
            past_trajectory: Tensor of shape [batch_size, past_steps, features]
            intent: Tensor of shape [batch_size, 4] (one-hot encoded)
            
        Returns:
            Tensor of shape [batch_size, future_steps, 2]
        """
        # Encode inputs
        image_features = self.image_encoder(images)
        trajectory_features = self.trajectory_encoder(past_trajectory)
        intent_features = self.intent_encoder(intent)
        
        # Fuse features with transformer
        fused_features = self.transformer(image_features, trajectory_features, intent_features)
        
        # Global pooling to get a single feature vector per batch
        # [batch_size, num_features + 2, feature_dim] -> [batch_size, feature_dim]
        fused_features = fused_features.transpose(1, 2)
        pooled_features = self.global_pool(fused_features).squeeze(2)
        
        # Decode to get future trajectory
        future_trajectory = self.decoder(pooled_features)
        
        return future_trajectory

In [13]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    """Train the model."""
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            # Move data to device
            images = batch['images'].to(device)
            past_trajectory = batch['past_trajectory'].to(device)
            intent = batch['intent'].to(device)
            future_trajectory = batch['future_trajectory'].to(device)
            
            # Forward pass
            predictions = model(images, past_trajectory, intent)
            
            # Compute loss
            loss = criterion(predictions, future_trajectory)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Calculate average training loss
        train_loss /= len(train_loader)
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                # Move data to device
                images = batch['images'].to(device)
                past_trajectory = batch['past_trajectory'].to(device)
                intent = batch['intent'].to(device)
                future_trajectory = batch['future_trajectory'].to(device)
                
                # Forward pass
                predictions = model(images, past_trajectory, intent)
                
                # Compute loss
                loss = criterion(predictions, future_trajectory)
                
                val_loss += loss.item()
            
            # Calculate average validation loss
            val_loss /= len(val_loader)
        
        # Print progress
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_e2e_driving_model.pth')
            print("Best model saved!")
    
    return model

In [11]:
def evaluate_model(model, test_loader, device):
    """Evaluate the model and calculate metrics."""
    model.eval()
    total_ade = 0.0
    predictions_dict = {}
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            # Move data to device
            images = batch['images'].to(device)
            past_trajectory = batch['past_trajectory'].to(device)
            intent = batch['intent'].to(device)
            
            # Get predictions
            predictions = model(images, past_trajectory, intent)
            
            # Calculate ADE if ground truth is available
            if 'future_trajectory' in batch:
                future_trajectory = batch['future_trajectory'].to(device)
                batch_ade = torch.norm(predictions - future_trajectory, dim=2).mean().item()
                total_ade += batch_ade
            
            # Store predictions for submission
            for i, frame_name in enumerate(batch['frame_name']):
                pred_traj = predictions[i].cpu().numpy()
                predictions_dict[frame_name] = {
                    'pos_x': pred_traj[:, 0],
                    'pos_y': pred_traj[:, 1]
                }
    
    # Calculate average ADE if ground truth was available
    if 'future_trajectory' in next(iter(test_loader)):
        avg_ade = total_ade / len(test_loader)
        print(f"Average ADE: {avg_ade:.4f}")
    
    return predictions_dict


In [10]:
def create_submission(predictions_dict, submission_file_base, authors, affiliation, account_name, method_name):
    """Create submission file in the required format."""
    # Create output directory if it doesn't exist
    if not os.path.exists(submission_file_base):
        os.makedirs(submission_file_base)
    
    # Create frames with trajectory predictions
    frame_predictions = []
    for frame_name, pred in predictions_dict.items():
        traj_pred = wod_e2ed_submission_pb2.TrajectoryPrediction(
            pos_x=pred['pos_x'].tolist(),
            pos_y=pred['pos_y'].tolist()
        )
        
        frame_traj = wod_e2ed_submission_pb2.FrameTrajectoryPredictions(
            frame_name=frame_name,
            trajectory=traj_pred
        )
        
        frame_predictions.append(frame_traj)
    
    # Create submission proto
    submission = wod_e2ed_submission_pb2.E2EDChallengeSubmission(
        predictions=frame_predictions,
        submission_type=wod_e2ed_submission_pb2.E2EDChallengeSubmission.SubmissionType.E2ED_SUBMISSION,
        authors=authors,
        affiliation=affiliation,
        account_name=account_name,
        unique_method_name=method_name,
        method_link="https://github.com/yourusername/waymo-e2e-driving",
        description="End-to-end vision-based autonomous driving model using multi-camera perception",
        uses_public_model_pretraining=True,
        public_model_names=["ResNet18"],
        num_model_parameters="25M"
    )
    
    # Write submission to file
    submission_file = os.path.join(submission_file_base, 'submission.pb')
    with tf.io.gfile.GFile(submission_file, 'wb') as fp:
        fp.write(submission.SerializeToString())
    
    print(f"Submission file created at {submission_file}")
    
    # Create tar.gz (this would need additional shell commands in practice)
    print("To create tar.gz file, run the following commands:")
    print(f"cd {os.path.dirname(submission_file_base)}")
    print(f"tar cvf {os.path.basename(submission_file_base)}.tar {os.path.basename(submission_file_base)}")
    print(f"gzip {os.path.basename(submission_file_base)}.tar")


In [14]:
os.environ['CURL_CA_BUNDLE'] = '/home/aaylen/Documents/Waymo-Challenge/cacert.pem'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/aaylen/Documents/Waymo-Challenge/token.json'

In [None]:

def main():
    # Data transforms
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((IMAGE_HEIGHT, IMAGE_WIDTH)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Dataset paths - replace with actual paths
    DATASET_FOLDER = 'gs://waymo_open_dataset_end_to_end_camera_v_1_0_0'
    TRAIN_FILES = tf.io.matching_files(os.path.join(DATASET_FOLDER, '*.tfrecord-*'))
    VALIDATION_FILES = tf.io.matching_files(os.path.join(DATASET_FOLDER, '*.tfrecord-*'))
    TEST_FILES = tf.io.matching_files(os.path.join(DATASET_FOLDER, '*.tfrecord-*'))

    print(f"Training files: {TRAIN_FILES}")
    # Create datasets
    train_dataset = WaymoE2EDataset(TRAIN_FILES.numpy(), transform=transform)
    val_dataset = WaymoE2EDataset(VALIDATION_FILES.numpy(), transform=transform)
    test_dataset = WaymoE2EDataset(TEST_FILES.numpy(), transform=transform)

    print(train_dataset[0])
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    
    # Initialize model
    model = E2EDrivingModel().to(device)
    
    # Loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    # Train model
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=NUM_EPOCHS,
        device=device
    )
    
    # Load best model for evaluation
    model.load_state_dict(torch.load('best_e2e_driving_model.pth'))
    
    # Evaluate model and get predictions
    predictions_dict = evaluate_model(model, test_loader, device)
    
    # Create submission
    create_submission(
        predictions_dict=predictions_dict,
        submission_file_base='/tmp/WaymoE2ESubmission',
        authors=['Your Name'],
        affiliation='Your Organization',
        account_name='your.email@example.com',
        method_name='VisionE2EDriving'
    )

In [22]:
if __name__ == "__main__":
    main()

gs://waymo_open_dataset_end_to_end_camera_v_1_0_0/*.tfrecord-*
