In [1]:
# =============================================================================
# Block 0: GPU and Environment Setup
# =============================================================================
# This block automatically selects the GPU with the most free memory to ensure
# efficient training without manual configuration. It also sets up helper
# functions for creating a deterministic (reproducible) training environment.
# =============================================================================

import os
import subprocess
import random
import numpy as np
import torch

def get_freest_gpu():
    """
    Return the index of the GPU that currently reports the most free memory.

    The free memory of every GPU is read from `nvidia-smi` (one integer per
    line, in MiB). When several GPUs tie, the lowest index wins. If the query
    or the parsing fails for any reason, a warning is printed and 0 is returned.
    """
    query_cmd = ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader']
    try:
        raw_output = subprocess.check_output(query_cmd, encoding='utf-8')
        # One line per GPU -> list of free MiB values, indexed by GPU id
        free_mib = list(map(int, raw_output.strip().split('\n')))
        # max() keeps the first maximal element, so ties resolve to the lowest index
        best_gpu_id = max(range(len(free_mib)), key=free_mib.__getitem__)
        print(f"✅ Found {len(free_mib)} GPUs. Auto-selecting GPU {best_gpu_id} with the most free memory.")
        return best_gpu_id
    except Exception as e:
        # Best-effort helper: never propagate, just fall back to the first GPU
        print(f"⚠️ Could not run `nvidia-smi`. Defaulting to GPU 0. Error: {e}")
        return 0

# Set the visible CUDA device for this script.
# `nvidia-smi` enumerates GPUs in PCI bus order, while the CUDA runtime defaults to
# a "fastest first" ordering. Pin the runtime to PCI bus order so that the index
# chosen from the nvidia-smi output refers to the same physical card.
# Both variables must be set before CUDA is initialised in this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
try:
    gpu_id = get_freest_gpu()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
except Exception:
    # Defensive fallback: hide all GPUs so downstream code runs on CPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "" # Fallback for non-GPU environments
    print("⚠️ GPU not found. Defaulting to CPU.")

✅ Found 4 GPUs. Auto-selecting GPU 0 with the most free memory.


In [3]:
# =============================================================================
# Block 1: Determinism Utilities for Reproducibility
# =============================================================================
# These functions are critical for ensuring that model training is reproducible.
# Using the same seed should give the same results on every run.
# =============================================================================

def set_global_seed(seed: int):
    """
    Seed every random number generator used in this notebook.

    Covers Python's `random`, NumPy's legacy global RNG, and PyTorch (CPU and
    all CUDA devices), exports PYTHONHASHSEED, and switches cuDNN to its
    deterministic, non-benchmarking mode.

    Args:
        seed (int): The seed value applied to all generators.
    """
    # Same seed for each library-level generator
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Exported for any child process started after this point
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Trade cuDNN autotuning speed for run-to-run repeatability
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def make_loader_kwargs(seed: int, num_workers: int = 4):
    """
    Build keyword arguments that make a PyTorch DataLoader reproducible.

    Args:
        seed (int): Seed for the sampling generator and base for the worker seeds.
        num_workers (int): Number of DataLoader worker processes.

    Returns:
        dict: `num_workers`, `worker_init_fn`, `generator` and `pin_memory`,
        ready to be splatted into `DataLoader(...)`.
    """
    def _seed_worker(worker_id):
        # Offset the base seed by the worker id so workers do not share a stream
        per_worker_seed = seed + worker_id
        np.random.seed(per_worker_seed)
        random.seed(per_worker_seed)

    # Dedicated generator that drives the loader's sampling order
    shuffle_rng = torch.Generator()
    shuffle_rng.manual_seed(seed)

    return {
        "num_workers": num_workers,
        "worker_init_fn": _seed_worker,
        "generator": shuffle_rng,
        "pin_memory": True,
    }

In [5]:
# =============================================================================
# Block 2: Configuration and Dataset Definition
# =============================================================================
# This block defines all configurable parameters, sets up the image processor,
# and defines the custom PyTorch Dataset for loading our multimodal data.
# =============================================================================

from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
from transformers import AutoImageProcessor

# --- Main Configuration ---
# Modify these paths and settings for your environment.
# The project root can be overridden without editing the notebook by exporting
# BIKELANE_BASE_DIR; when it is unset, the original default location is used.
BASE_DIR = os.environ.get(
    "BIKELANE_BASE_DIR", "/home/slieu3/CS_bikelane/for_GIT_test/train"
)  # Base project directory
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
IMAGE_DIR = os.path.join(BASE_DIR, "data/image_set")

# Paths to the CSV files defining training and validation splits
TRAIN_CSV_PATH = os.path.join(BASE_DIR, "data/TRAIN.csv")
VAL_CSV_PATH = os.path.join(BASE_DIR, "data/VAL.csv")

# --- Dataset and Model Setup ---
# Each CSV is expected to provide the columns read by MultiImageDataset:
# "GSV1", "GSV2", "SAT" (image file names) and "is_bike" (integer label).
train_df_balanced = pd.read_csv(TRAIN_CSV_PATH)
val_df = pd.read_csv(VAL_CSV_PATH)

# Load the image processor from Hugging Face for the Swin Transformer model.
# This handles resizing, normalization, and other preprocessing steps.
image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-large-patch4-window12-384")

# Map class names to integer labels for the model.
# NOTE: kept for reference; the dataset reads integer labels directly from "is_bike".
label_map = {"no_bike_lane": 0, "designated": 1, "protected": 2}

class MultiImageDataset(Dataset):
    """
    PyTorch Dataset yielding three processed images plus a label per sample.

    Every row of the dataframe names three image files (columns "GSV1",
    "GSV2" and "SAT") and carries an integer class label in "is_bike".
    """

    # Dataframe columns holding image file names, in the order they are returned
    _IMAGE_COLUMNS = ("GSV1", "GSV2", "SAT")

    def __init__(self, dataframe, image_dir, image_processor):
        # Re-index to 0..n-1 so that label-based lookup by `idx` works
        self.df = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.processor = image_processor

    def __len__(self):
        """Number of samples, i.e. rows in the dataframe."""
        return len(self.df)

    def _load_and_process_image(self, image_filename: str):
        """Read one image from `image_dir`, force RGB, and run the processor on it."""
        full_path = os.path.join(self.image_dir, image_filename)
        rgb_image = Image.open(full_path).convert("RGB")
        encoded = self.processor(images=rgb_image, return_tensors="pt")
        # Drop the leading batch dimension the processor adds for a single image
        return encoded["pixel_values"].squeeze(0)

    def __getitem__(self, idx: int):
        """
        Build the sample stored at row `idx`.

        Returns:
            tuple: (gsv1_tensor, gsv2_tensor, sat_tensor, label_tensor), where
            the label is a scalar `torch.long` tensor taken from "is_bike".
        """
        row = self.df.loc[idx]

        # Images are loaded in column order: GSV1, GSV2, SAT
        gsv1_tensor, gsv2_tensor, sat_tensor = (
            self._load_and_process_image(row[column]) for column in self._IMAGE_COLUMNS
        )

        # "is_bike" already stores the integer class id
        label_tensor = torch.tensor(int(row["is_bike"]), dtype=torch.long)

        return gsv1_tensor, gsv2_tensor, sat_tensor, label_tensor

preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
# =============================================================================
# Block 3: Model Architecture Definition
# =============================================================================
# This block defines the neural network architectures for the winning model:
# 1. SwinSingleViewHier: A Swin Transformer for a single image modality with
#    two heads for hierarchical classification.
# 2. WeightedDecisionFusionHier: A module that performs a learnable, weighted
#    average of the predictions from the three single-view models.
# =============================================================================
import torch.nn as nn
from transformers import SwinModel


class SwinSingleViewHier(nn.Module):
    """
    Single-view classifier: a Swin Transformer backbone feeding two heads.

    - head_presence: 2-way logits, bike lane absent vs. present.
    - head_type: 2-way logits, designated vs. protected.
    Both heads read the same pooled backbone feature.
    """
    def __init__(self, model_name="microsoft/swin-large-patch4-window12-384"):
        super().__init__()
        self.backbone = SwinModel.from_pretrained(model_name)

        # Only parameters whose name contains "layers.2" or "layers.3" stay trainable
        # (presumably the last two Swin stages -- confirm against the checkpoint's
        # parameter names); everything else in the backbone is frozen.
        for param_name, param in self.backbone.named_parameters():
            param.requires_grad = ("layers.2" in param_name) or ("layers.3" in param_name)

        hidden_size = self.backbone.config.hidden_size

        def _make_head():
            # Two-layer MLP producing 2 logits
            return nn.Sequential(
                nn.Linear(hidden_size, 512), nn.ReLU(), nn.Linear(512, 2)
            )

        # Head 1: Is a bike lane present? (No Lane vs. Lane)
        self.head_presence = _make_head()
        # Head 2: What type of bike lane is it? (Designated vs. Protected)
        self.head_type = _make_head()

    def forward(self, pixel_values):
        """Return `(logits_presence, logits_type)` for a batch of images."""
        hidden_states = self.backbone(pixel_values).last_hidden_state
        # Average over dim 1 of the backbone output to get one vector per sample
        pooled = hidden_states.mean(dim=1)
        return self.head_presence(pooled), self.head_type(pooled)

class WeightedDecisionFusionHier(nn.Module):
    """
    Decision-level fusion of three single-view models with learnable weights.

    A 3-element parameter is softmax-normalised into mixing weights that are
    shared by the presence logits and the type logits; dropout (p=0.3) is then
    applied to each fused logit tensor.
    """
    def __init__(self):
        super().__init__()
        # Raw (pre-softmax) weights, one per view: GSV1, GSV2, SAT
        self.raw_weights = nn.Parameter(torch.tensor([0.33, 0.33, 0.34], dtype=torch.float32))
        self.softmax = nn.Softmax(dim=0)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, p1, p2, p3, t1, t2, t3):
        """
        Fuse per-view logits into a single pair of logit tensors.

        Args:
            p1, p2, p3: Presence logits from GSV1, GSV2, and SAT models.
            t1, t2, t3: Type logits from GSV1, GSV2, and SAT models.

        Returns:
            tuple: (fused_presence, fused_type).
        """
        w_gsv1, w_gsv2, w_sat = self.softmax(self.raw_weights)

        # Same convex combination for both heads; presence is fused first
        presence_mix = w_gsv1 * p1 + w_gsv2 * p2 + w_sat * p3
        fused_presence = self.dropout(presence_mix)

        type_mix = w_gsv1 * t1 + w_gsv2 * t2 + w_sat * t3
        fused_type = self.dropout(type_mix)

        return fused_presence, fused_type

    @torch.no_grad()
    def get_weights(self):
        """Return the softmax-normalised fusion weights as a NumPy array."""
        normalised = self.softmax(self.raw_weights)
        return normalised.detach().cpu().numpy()

In [11]:
# =============================================================================
# Block 4: Loss, Prediction, and Training Helper Functions
# =============================================================================
# This block contains the core logic for training and evaluation:
# - hierarchical_loss: A custom loss function for the two-stage task.
# - hierarchical_predict: A function to derive final labels from the two heads.
# - train_epoch / eval_epoch: Standard loops for one epoch of training/validation.
# - (Checkpoint saving, e.g. a `save_combo_best` utility, is not defined in this cell.)
# =============================================================================
from sklearn.metrics import f1_score
from torch import optim

def hierarchical_loss(out_presence, out_type, y_true):
    """
    Total loss for the two-head hierarchical classifier.

    The result is the sum of:
    1. cross-entropy of the presence head over ALL samples, and
    2. cross-entropy of the type head over ONLY the samples with a lane
       (zero when the batch contains no such sample).

    Args:
        out_presence: Presence logits, one row per sample.
        out_type: Type logits, one row per sample.
        y_true: Integer labels {0: no-lane, 1: designated, 2: protected}.
    """
    criterion = nn.CrossEntropyLoss()

    # Any label above 0 means "a lane is present"
    has_lane = y_true > 0
    presence_loss = criterion(out_presence, has_lane.long())

    if not has_lane.any():
        # No lane samples in this batch: the type head contributes nothing
        return presence_loss + torch.tensor(0.0, device=out_presence.device)

    # Shift {1, 2} -> {0, 1} so the labels index the two type logits
    type_targets = (y_true[has_lane] - 1).long()
    type_loss = criterion(out_type[has_lane], type_targets)
    return presence_loss + type_loss

@torch.no_grad()
def hierarchical_predict(out_presence, out_type):
    """
    Turn the two heads' logits into final class ids (0, 1, or 2).

    A sample predicted as "no lane" gets 0; otherwise it gets the type
    prediction shifted by one (designated -> 1, protected -> 2).
    """
    # Presence head: 0 = no-lane, 1 = lane
    lane_present = out_presence.argmax(dim=1)
    # Type head: 0 = designated, 1 = protected
    lane_kind = out_type.argmax(dim=1)

    # Where a lane is predicted use type + 1, elsewhere keep the presence value (0)
    return torch.where(lane_present == 1, lane_kind + 1, lane_present)

# --- Train/Eval Epoch Functions ---
def train_epoch(models, fusion_model, loader, optimizer, device):
    """
    Run one optimisation pass over `loader`.

    Args:
        models: Tuple of the three single-view models (GSV1, GSV2, SAT).
        fusion_model: Module that fuses the three models' logits.
        loader: Iterable of (gsv1, gsv2, sat, label) batches.
        optimizer: Optimizer updating the models and the fusion module.
        device: Device the batches are moved to.

    Returns:
        tuple: (mean of the per-batch losses, accuracy over all seen samples).
    """
    gsv1_net, gsv2_net, sat_net = models
    for module in (gsv1_net, gsv2_net, sat_net, fusion_model):
        module.train()

    running_loss = 0.0
    n_correct = 0
    n_seen = 0

    for batch in loader:
        gsv1_x, gsv2_x, sat_x, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        # One forward pass per view, then fuse the resulting logits
        pres_1, type_1 = gsv1_net(gsv1_x)
        pres_2, type_2 = gsv2_net(gsv2_x)
        pres_3, type_3 = sat_net(sat_x)
        fused_presence, fused_type = fusion_model(
            pres_1, pres_2, pres_3, type_1, type_2, type_3
        )

        batch_loss = hierarchical_loss(fused_presence, fused_type, targets)
        batch_loss.backward()
        optimizer.step()

        # Bookkeeping uses the logits computed before this step's update
        running_loss += batch_loss.item()
        batch_preds = hierarchical_predict(fused_presence, fused_type)
        n_correct += (batch_preds == targets).sum().item()
        n_seen += targets.size(0)

    return running_loss / len(loader), n_correct / n_seen

@torch.no_grad()
def eval_epoch(models, fusion_model, loader, device):
    """
    Evaluate the fused model on every batch of `loader` without gradients.

    Args:
        models: Tuple of the three single-view models (GSV1, GSV2, SAT).
        fusion_model: Module that fuses the three models' logits.
        loader: Iterable of (gsv1, gsv2, sat, label) batches.
        device: Device the batches are moved to.

    Returns:
        tuple: (predictions array, true-label array, mean of per-batch losses).
    """
    gsv1_net, gsv2_net, sat_net = models
    for module in (gsv1_net, gsv2_net, sat_net, fusion_model):
        module.eval()

    collected_preds = []
    collected_true = []
    running_loss = 0.0

    for batch in loader:
        gsv1_x, gsv2_x, sat_x, targets = (tensor.to(device) for tensor in batch)

        # Per-view forward passes followed by logit fusion
        pres_1, type_1 = gsv1_net(gsv1_x)
        pres_2, type_2 = gsv2_net(gsv2_x)
        pres_3, type_3 = sat_net(sat_x)
        fused_presence, fused_type = fusion_model(
            pres_1, pres_2, pres_3, type_1, type_2, type_3
        )

        running_loss += hierarchical_loss(fused_presence, fused_type, targets).item()
        batch_preds = hierarchical_predict(fused_presence, fused_type)

        collected_preds.extend(batch_preds.cpu().numpy())
        collected_true.extend(targets.cpu().numpy())

    mean_loss = running_loss / len(loader)
    return np.array(collected_preds), np.array(collected_true), mean_loss

In [12]:
# =============================================================================
# Block 5: Full Training Pipeline for a Single Seed
# =============================================================================
# This is the main function that orchestrates the entire training and
# evaluation process for a single random seed. It includes data loading,
# model building, the training loop, early stopping, and saving results.
# =============================================================================
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import json
from datetime import datetime

CLASS_NAMES = ["No Bike Lane", "Designated", "Protected"]

def train_one_run(seed: int, config: dict):
    """
    Executes a complete training and evaluation pipeline for a single seed.

    Trains three single-view models plus the fusion module, tracks the epoch
    with the lowest validation loss, stops early after `config["patience"]`
    epochs without improvement, writes the per-epoch history to
    `<save_dir>/history_seed_<seed>.csv`, and reports the metrics of the
    BEST epoch (not of the last epoch that happened to run).

    Args:
        seed (int): The random seed for this run.
        config (dict): Hyperparameters and paths. Keys read here: "image_dir",
            "num_workers", "batch_size", "device", "lr", "num_epochs",
            "patience", "save_dir".

    Returns:
        dict: "seed", "val_acc", "macro_f1", "best_val_loss" and "best_epoch".
        The accuracy and macro-F1 are computed from the validation predictions
        of the best epoch.

    Raises:
        ValueError: If `config["num_epochs"]` is below 1, so no epoch was run.
    """
    print(f"\n{'='*30} Starting Run for Seed: {seed} {'='*30}")

    # 1. Set seed for reproducibility
    set_global_seed(seed)

    # 2. Build datasets and dataloaders
    train_dataset = MultiImageDataset(train_df_balanced, config["image_dir"], image_processor)
    val_dataset = MultiImageDataset(val_df, config["image_dir"], image_processor)

    loader_kwargs = make_loader_kwargs(seed=seed, num_workers=config["num_workers"])
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False, **loader_kwargs)

    # 3. Build models, optimizer, and scheduler
    device = config["device"]
    m1 = SwinSingleViewHier().to(device)
    m2 = SwinSingleViewHier().to(device)
    m3 = SwinSingleViewHier().to(device)
    models = (m1, m2, m3)
    fusion_model = WeightedDecisionFusionHier().to(device)

    all_params = list(m1.parameters()) + list(m2.parameters()) + list(m3.parameters()) + list(fusion_model.parameters())
    optimizer = optim.AdamW(all_params, lr=config["lr"], eps=1e-6)
    # `verbose` is deprecated on PyTorch LR schedulers; learning-rate reductions
    # are reported explicitly inside the loop instead.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3
    )

    # 4. Training loop with early stopping
    best_val_loss = float("inf")
    best_epoch = None
    best_y_true, best_y_pred = None, None  # validation outputs of the best epoch
    y_true, y_pred = None, None            # validation outputs of the latest epoch
    patience_counter = 0
    history = []

    for epoch in range(1, config["num_epochs"] + 1):
        train_loss, train_acc = train_epoch(models, fusion_model, train_loader, optimizer, device)
        y_pred, y_true, val_loss = eval_epoch(models, fusion_model, val_loader, device)

        lr_before = optimizer.param_groups[0]["lr"]
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]["lr"]
        if lr_after < lr_before:
            print(f"Reducing learning rate: {lr_before:.2e} -> {lr_after:.2e}")

        # Calculate metrics for this epoch
        val_acc = accuracy_score(y_true, y_pred)
        val_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        weights = fusion_model.get_weights()

        print(f"[Seed {seed} | Epoch {epoch:03d}] "
              f"TrainLoss={train_loss:.4f} TrainAcc={train_acc:.4f} | "
              f"ValLoss={val_loss:.6f} ValAcc={val_acc:.4f} ValF1={val_f1:.4f} | "
              f"Weights GSV1={weights[0]:.3f} GSV2={weights[1]:.3f} SAT={weights[2]:.3f}")

        # Save epoch results
        history.append({
            "epoch": epoch, "train_loss": train_loss, "train_acc": train_acc,
            "val_loss": val_loss, "val_acc": val_acc, "val_f1": val_f1,
            "w_gsv1": weights[0], "w_gsv2": weights[1], "w_sat": weights[2]
        })

        # Check for improvement and remember the best epoch's outputs
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            best_y_true, best_y_pred = y_true, y_pred
            patience_counter = 0
            # Save model checkpoint
            # (Implementation for saving/loading state_dict omitted for brevity, but would go here)
            # NOTE(review): no checkpoint file is written yet, despite the message below.
            print(f"✅ New best model found with validation loss: {best_val_loss:.6f}. Saving checkpoint.")

        else:
            patience_counter += 1
            print(f"⚠️ No improvement. Patience {patience_counter}/{config['patience']}")
            if patience_counter >= config['patience']:
                print(f"⏹️ Early stopping triggered at epoch {epoch}.")
                break

    # 5. Save results and artifacts for this run
    os.makedirs(config["save_dir"], exist_ok=True)

    # Save per-epoch history
    history_df = pd.DataFrame(history)
    history_df.to_csv(os.path.join(config["save_dir"], f"history_seed_{seed}.csv"), index=False)

    if best_y_true is None:
        if y_true is None:
            raise ValueError("config['num_epochs'] must be at least 1; no epoch was run.")
        # The validation loss never improved on +inf (e.g. it was NaN every epoch):
        # fall back to the outputs of the last epoch that ran.
        best_y_true, best_y_pred = y_true, y_pred

    # Final metrics come from the best epoch's validation predictions so that they
    # describe the same model state as `best_val_loss`.
    # Passing `labels` keeps the report valid when a class is absent from the
    # validation predictions and targets.
    final_metrics = classification_report(
        best_y_true, best_y_pred,
        labels=list(range(len(CLASS_NAMES))),
        target_names=CLASS_NAMES, output_dict=True, zero_division=0,
    )

    return {
        "seed": seed,
        "val_acc": accuracy_score(best_y_true, best_y_pred),
        "macro_f1": final_metrics["macro avg"]["f1-score"],
        "best_val_loss": best_val_loss,
        "best_epoch": best_epoch,
    }

In [13]:
# =============================================================================
# Block 6: Main Execution Block
# =============================================================================
# This is the entry point of the script. It sets up the experiment
# configuration, iterates through multiple random seeds, and aggregates
# the final results to report mean and standard deviation, ensuring a robust
# evaluation of the model's performance.
# =============================================================================

# Entry point: runs one full training per seed, then aggregates the per-seed
# metrics and writes them to CSV files in the configured save directory.
if __name__ == "__main__":
    
    # --- Experiment Configuration ---
    # Passed unchanged to train_one_run for every seed; "seeds" is only read here.
    training_config = {
        "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        "seeds": [2023, 2024, 2025, 2026, 2027], # Run with multiple seeds for robustness
        "batch_size": 16,
        "lr": 5e-5,
        "num_epochs": 80,
        "patience": 15, # For early stopping
        "num_workers": 4,
        "image_dir": IMAGE_DIR,
        "save_dir": OUTPUT_DIR,
    }
    
    print(f"Starting training on device: {training_config['device']}")

    # --- Run Training for Each Seed ---
    # Each call returns a dict of metrics for that seed; runs are sequential.
    all_run_results = []
    for seed in training_config["seeds"]:
        result = train_one_run(seed=seed, config=training_config)
        all_run_results.append(result)

    # --- Aggregate and Summarize Results ---
    # One row per seed, one column per key of the returned dicts
    results_df = pd.DataFrame(all_run_results)
    
    print("\n\n" + "="*30 + " FINAL RESULTS " + "="*30)
    print("\nPer-seed performance:\n")
    print(results_df)
    
    # Calculate mean and standard deviation of key metrics
    summary = results_df.agg({
        "val_acc": ["mean", "std"],
        "macro_f1": ["mean", "std"]
    })
    
    print("\nSummary (mean ± std):\n")
    print(summary)
    
    # Save the aggregated and summary results to CSV files
    # (the save directory is created by train_one_run, not here)
    results_df.to_csv(os.path.join(training_config["save_dir"], "multiseed_results.csv"), index=False)
    summary.to_csv(os.path.join(training_config["save_dir"], "multiseed_summary.csv"))
    
    print(f"\n✅ Training complete. All results saved to: {training_config['save_dir']}")

Starting training on device: cuda



config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/791M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/791M [00:00<?, ?B/s]



[Seed 2023 | Epoch 001] TrainLoss=0.6503 TrainAcc=0.7453 | ValLoss=0.550380 ValAcc=0.8611 ValF1=0.8299 | Weights GSV1=0.332 GSV2=0.332 SAT=0.335
✅ New best model found with validation loss: 0.550380. Saving checkpoint.


KeyboardInterrupt: 