In [None]:
# (Greedy forward selection + DE-based weight tuning on val + Test set AUROC eval only)
import os
import numpy as np
import pandas as pd
from PIL import Image

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from scipy.optimize import differential_evolution
import timm

# ------------------------
# 1. Define Disease Classes
# ------------------------
# Label vocabulary. The position of each name fixes its column in the
# binarized label matrix (MultiLabelBinarizer(classes=CLASSES)) and its row in
# the per-class AUROC printouts.
# NOTE(review): presumably this is also the output order the checkpoints were
# trained with -- confirm before reordering.
CLASSES = [
    "No Finding",
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
    "Consolidation",
    "Edema",
    "Emphysema",
    "Fibrosis",
    "Pleural_Thickening",
    "Hernia",
]

# ------------------------
# 2. Load and Preprocess Data
# ------------------------
# Read the label sheet; "Finding Labels" holds pipe-separated finding names.
df = pd.read_csv("/student/csc490_project/shared/labels.csv")
df["label_list"] = df["Finding Labels"].map(lambda findings: findings.split("|"))

# Multi-hot encode the findings; columns follow the order of CLASSES.
mlb = MultiLabelBinarizer(classes=CLASSES)
labels_array = mlb.fit_transform(df["label_list"])
df["labels"] = list(labels_array)

# Patient-level split: shuffle the unique patient IDs with a fixed seed so
# that every image of a patient lands in exactly one partition.
# NOTE(review): the seed/shuffle sequence presumably has to match the one used
# when the checkpoints were trained -- do not change it without confirming.
unique_patients = df["Patient ID"].unique()
np.random.seed(42)
np.random.shuffle(unique_patients)

# 70% train / 10% validation / 20% test, measured in patients.
train_end = int(0.7 * len(unique_patients))
val_end = int(0.8 * len(unique_patients))

train_patients = unique_patients[:train_end]
val_patients = unique_patients[train_end:val_end]
test_patients = unique_patients[val_end:]


def _rows_for_patients(patient_ids):
    """Return the rows of ``df`` belonging to ``patient_ids``, re-indexed from 0."""
    belongs = df["Patient ID"].isin(patient_ids)
    return df[belongs].reset_index(drop=True)


val_df = _rows_for_patients(val_patients)
test_df = _rows_for_patients(test_patients)

# ------------------------
# 3. Dataset Class
# ------------------------
class ChestXrayDataset(Dataset):
    """
    Map-style dataset that pairs each chest X-ray image with its label vector.

    Each item is built from one row of ``df``: the file named in the
    "Image Index" column is opened from ``root_dir`` and converted to
    single-channel ("L") mode, and the "labels" entry becomes a float tensor.

    Attributes:
        df (pd.DataFrame): Rows with "Image Index" and "labels" columns.
        root_dir (str): Directory containing image files.
        transform (callable): Optional transform applied to the PIL image.
    """

    def __init__(self, df, root_dir, transform=None):
        self.df, self.root_dir, self.transform = df, root_dir, transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        record = self.df.iloc[idx]
        image = Image.open(os.path.join(self.root_dir, record["Image Index"])).convert("L")
        labels = torch.tensor(record["labels"], dtype=torch.float)
        if not self.transform:
            return image, labels
        return self.transform(image), labels

# ------------------------
# 4. Transforms and DataLoaders
# ------------------------
# Evaluation-time preprocessing: replicate the grayscale image to three
# channels, resize to 224x224, convert to a tensor and normalize with the
# standard ImageNet channel mean/std values.
_eval_steps = [
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_eval_steps)

img_dir = "/student/csc490_project/shared/preprocessed_images/preprocessed_images"


def _make_eval_loader(frame):
    """Build an unshuffled loader over ``frame`` so predictions keep row order."""
    dataset = ChestXrayDataset(frame, img_dir, transform)
    return DataLoader(dataset, batch_size=16, shuffle=False, num_workers=4)


val_loader = _make_eval_loader(val_df)
test_loader = _make_eval_loader(test_df)

# ------------------------
# 5. Load Pretrained Models
# ------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# (ensemble key, timm architecture id, checkpoint path) for every ensemble
# member. Dict insertion order below follows this list.
_MODEL_SPECS = [
    (
        'maxvit',
        'maxvit_rmlp_base_rw_224.sw_in12k_ft_in1k',
        '/student/csc490_project/shared/new_split_models/no_augment_maxvit_rmlp_base_rw_224.sw_in12k_ft_in1k_model.pth',
    ),
    (
        'densenet',
        'densenet121',
        '/student/csc490_project/shared/new_split_models/no_augment_densenet121_model.pth',
    ),
    (
        'coatnet',
        'coatnet_2_rw_224.sw_in12k_ft_in1k',
        '/student/csc490_project/shared/new_split_models/no_augment_coatnet_2_rw_224.sw_in12k_ft_in1k_model.pth',
    ),
    (
        'swin',
        'swin_large_patch4_window7_224',
        '/student/csc490_project/shared/new_split_models/no_augment_swin_large_patch4_window7_224_model.pth',
    ),
    (
        'convnext',
        'convnext_large.fb_in22k',
        '/student/csc490_project/shared/new_split_models/no_augment_convnext_large.fb_in22k_model.pth',
    ),
    (
        'vgg19',
        'vgg19.tv_in1k',
        '/student/csc490_project/shared/training/vgg19.tv_in1k.pt',
    ),
]

models = {}
for _name, _arch, _checkpoint in _MODEL_SPECS:
    # pretrained=False: the weights come from the project checkpoint, not timm.
    _net = timm.create_model(_arch, pretrained=False, num_classes=len(CLASSES))
    # map_location="cpu" lets checkpoints saved from a GPU load on CPU-only
    # hosts too; the model is moved to `device` afterwards.
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    _state_dict = torch.load(_checkpoint, map_location="cpu")
    _net.load_state_dict(_state_dict)
    models[_name] = _net

# Inference only: move every model to the device and switch to eval mode.
for model in models.values():
    model.to(device)
    model.eval()

# ------------------------
# 6. Collect Predictions
# ------------------------
def collect_predictions(loader, models, device):
    """
    Runs inference using multiple models on the input DataLoader.

    Every batch is pushed through every model once; the sigmoid of each
    model's output is stored so predictions are per-class probabilities.

    Args:
        loader (DataLoader): Iterable yielding (images, labels) tensor batches.
        models (dict): Dictionary of model name to model instance.
        device (torch.device): Device to run inference on.

    Returns:
        tuple: Dictionary of model name to stacked prediction array, and the
        array of ground-truth labels in the same row order.

    Raises:
        ValueError: If the loader yields no batches.
    """
    all_preds = {name: [] for name in models}
    all_labels = []
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            for name, model in models.items():
                all_preds[name].append(torch.sigmoid(model(images)).cpu().numpy())
            # Labels are only needed as NumPy arrays, so they are never sent
            # to the device.
            all_labels.append(labels.cpu().numpy())

    if not all_labels:
        raise ValueError("collect_predictions received an empty loader; nothing to evaluate.")

    all_preds = {k: np.concatenate(v) for k, v in all_preds.items()}
    all_labels = np.concatenate(all_labels)
    return all_preds, all_labels

# ------------------------
# 7. Weighted AUROC Objective
# ------------------------
def weighted_ensemble_auroc(weights, preds_list, labels):
    """
    Objective function for differential evolution: negative average AUROC.

    Negative weights are clipped to zero and the result is rescaled to sum to
    (approximately) one before blending. The caller's ``weights`` object is
    never modified.

    Args:
        weights (array-like): Ensemble weights to apply to predictions; any
            numeric sequence is accepted (floats, ints, lists).
        preds_list (list): List of prediction arrays from different models.
        labels (np.ndarray): Ground-truth labels.

    Returns:
        float: Negative of mean AUROC score across all classes.
    """
    # Cast to float first: in-place true division on an integer array raises.
    clipped = np.maximum(np.asarray(weights, dtype=float), 0)
    # The epsilon keeps an all-zero weight vector from dividing by zero.
    normalized = clipped / (np.sum(clipped) + 1e-8)
    ensemble_preds = sum(w * p for w, p in zip(normalized, preds_list))
    per_class = [
        roc_auc_score(labels[:, i], ensemble_preds[:, i]) for i in range(labels.shape[1])
    ]
    return -np.mean(per_class)

# ------------------------
# 8. DE Weight Optimization
# ------------------------
def optimize_weights_subset(preds_dict, labels, model_subset, maxiter=50, tol=1e-5):
    """
    Optimizes ensemble weights for a subset of models using Differential Evolution.

    Args:
        preds_dict (dict): Dictionary of model name to prediction arrays.
        labels (np.ndarray): Ground-truth labels.
        model_subset (list): List of model names to include in ensemble.
        maxiter (int, optional): Maximum number of DE generations. Defaults to 50.
        tol (float, optional): Relative convergence tolerance for DE. Defaults to 1e-5.

    Returns:
        tuple: Optimized weights, mean AUROC, final predictions, and per-class AUROC list.

    Raises:
        ValueError: If ``model_subset`` is empty.
        KeyError: If a name in ``model_subset`` is missing from ``preds_dict``.
    """
    if len(model_subset) == 0:
        raise ValueError("model_subset must contain at least one model name.")

    preds_list = [preds_dict[name] for name in model_subset]
    bounds = [(0, 1)] * len(preds_list)
    result = differential_evolution(
        weighted_ensemble_auroc,
        bounds,
        args=(preds_list, labels),
        maxiter=maxiter,
        tol=tol,
    )

    # Normalize the same way the objective does (clip, then rescale). If the
    # optimizer returns an all-zero vector, fall back to uniform weights rather
    # than dividing by zero and producing NaNs.
    raw_weights = np.maximum(result.x, 0)
    total = np.sum(raw_weights)
    if total > 0:
        best_weights = raw_weights / total
    else:
        best_weights = np.full(len(raw_weights), 1.0 / len(raw_weights))

    final_preds = sum(w * p for w, p in zip(best_weights, preds_list))
    per_class_aurocs = [roc_auc_score(labels[:, i], final_preds[:, i]) for i in range(labels.shape[1])]
    final_score = np.mean(per_class_aurocs)
    return best_weights, final_score, final_preds, per_class_aurocs

# ------------------------
# 9. Greedy Forward Model Selection
# ------------------------
def greedy_forward_selection(preds_dict, labels, min_improvement=0.0):
    """
    Performs greedy forward selection of models based on validation AUROC.

    Starting from an empty ensemble, each round tries adding every remaining
    model, re-tunes the weights for that candidate ensemble, and keeps the
    model giving the highest mean AUROC. Selection stops once the best
    candidate no longer beats the current score by more than
    ``min_improvement``.

    Args:
        preds_dict (dict): Dictionary of model predictions.
        labels (np.ndarray): Ground-truth labels.
        min_improvement (float, optional): Gain in mean AUROC a candidate must
            exceed to be added. Defaults to 0.0 (any strict improvement).

    Returns:
        tuple: List of selected model names and best AUROC score.
    """
    # A list (dict insertion order) rather than a set: candidates are then
    # evaluated in the same order on every run, which matters because the
    # weight optimizer is stochastic and ties are broken by evaluation order.
    remaining_models = list(preds_dict)
    selected_models = []
    best_score = -np.inf

    while remaining_models:
        best_model = None
        best_score_candidate = -np.inf

        for model in remaining_models:
            current_combo = selected_models + [model]
            _, score, _, _ = optimize_weights_subset(preds_dict, labels, current_combo)
            if score > best_score_candidate:
                best_score_candidate = score
                best_model = model

        # Stop when no candidate improves the ensemble enough.
        if best_model is None or not best_score_candidate > best_score + min_improvement:
            break

        selected_models.append(best_model)
        remaining_models.remove(best_model)
        best_score = best_score_candidate
        print(f"Added: {best_model} | Val AUROC: {best_score:.4f}")

    return selected_models, best_score

# ------------------------
# 10. Ensemble Optimization and Final Evaluation
# ------------------------
# Model selection and weight tuning use validation predictions only.
val_preds, val_labels = collect_predictions(val_loader, models, device)
selected_models, _ = greedy_forward_selection(val_preds, val_labels)
# Re-run the optimizer on the chosen subset to obtain its blend weights
# (the selection step returns only the names and the score).
final_weights, _, _, _ = optimize_weights_subset(val_preds, val_labels, selected_models)

# Apply the frozen weights to the held-out test predictions.
test_preds, test_labels = collect_predictions(test_loader, models, device)
ensemble_test_preds = sum(
    w * test_preds[name]
    for w, name in zip(final_weights, selected_models)
)

per_class_aurocs = [
    roc_auc_score(test_labels[:, col], ensemble_test_preds[:, col])
    for col in range(test_labels.shape[1])
]
mean_test_auroc = np.mean(per_class_aurocs)

# ------------------------
# 11. Display Results
# ------------------------
print("\nFinal Test Ensemble Evaluation:")
print("Selected Models: {}".format(selected_models))
print("Optimized Weights:")
for name, weight in zip(selected_models, final_weights):
    print("{}: {:.4f}".format(name, weight))

print("\nMean Test AUROC: {:.4f}".format(mean_test_auroc))
print("\nPer-Class Test AUROC:")
for cls, auc in zip(CLASSES, per_class_aurocs):
    print("{}: {:.4f}".format(cls, auc))

Added: coatnet | Val AUROC: 0.8278
Added: convnext | Val AUROC: 0.8368
Added: densenet | Val AUROC: 0.8410
Added: maxvit | Val AUROC: 0.8429
Added: vgg19 | Val AUROC: 0.8438
Added: swin | Val AUROC: 0.8444

Final Test Ensemble Evaluation:
Selected Models: ['coatnet', 'convnext', 'densenet', 'maxvit', 'vgg19', 'swin']
Optimized Weights:
coatnet: 0.1524
convnext: 0.1877
densenet: 0.2052
maxvit: 0.1663
vgg19: 0.1138
swin: 0.1747

Mean Test AUROC: 0.8563

Per-Class Test AUROC:
No Finding: 0.8017
Atelectasis: 0.8355
Cardiomegaly: 0.9153
Effusion: 0.8950
Infiltration: 0.7376
Mass: 0.8820
Nodule: 0.8097
Pneumonia: 0.7871
Pneumothorax: 0.8963
Consolidation: 0.8187
Edema: 0.9156
Emphysema: 0.9449
Fibrosis: 0.8479
Pleural_Thickening: 0.8424
Hernia: 0.9146


In [None]:
# ------------------------
# 12. Test-Time Augmentation Evaluation
# ------------------------

import torch
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
from sklearn.metrics import roc_auc_score

def _tta_pipeline(augmentation):
    """Wrap one PIL-level augmentation in the shared preprocessing steps."""
    return transforms.Compose([
        transforms.Grayscale(num_output_channels=3),
        transforms.Resize((224, 224)),
        augmentation,
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])


# Test-time augmentation views: three augmented pipelines plus the plain
# evaluation transform.
# NOTE(review): per the torchvision docs, ColorJitter and RandomAffine sample
# their parameters randomly on each call, so those two views are not
# reproducible across runs unless the RNG is seeded -- confirm this is
# acceptable.
tta_transforms = [
    _tta_pipeline(transforms.RandomHorizontalFlip(p=1.0)),  # Horizontal flip (always applied)
    _tta_pipeline(transforms.ColorJitter(brightness=0.1, contrast=0.1)),  # Brightness and contrast jitter
    _tta_pipeline(transforms.RandomAffine(degrees=0, translate=(0.02, 0.02))),  # Slight translation
    transform,  # Original (no augmentation)
]

def collect_tta_predictions(df, root_dir, models, device, tta_transforms, batch_size=16, num_workers=4):
    """
    Collects model predictions under various test-time augmentations and averages them.

    For every transform a fresh, unshuffled DataLoader over ``df`` is built,
    all models are run on it, and each model's sigmoid outputs are stored.
    The final prediction of a model is the mean over all transforms.

    Args:
        df (pd.DataFrame): DataFrame containing test image metadata.
        root_dir (str): Directory where images are stored.
        models (dict): Dictionary mapping model names to PyTorch model instances.
        device (torch.device): Device to perform inference on (e.g., "cuda" or "cpu").
        tta_transforms (list): List of torchvision.transforms.Compose instances for TTA.
        batch_size (int, optional): Batch size for DataLoader. Defaults to 16.
        num_workers (int, optional): DataLoader worker processes. Defaults to 4.

    Returns:
        dict: Averaged predictions per model across all TTAs.
        np.ndarray: Ground-truth labels (verified identical across TTAs).

    Raises:
        ValueError: If ``tta_transforms`` is empty, or if the labels produced
            under different transforms do not match.
    """
    if len(tta_transforms) == 0:
        raise ValueError("tta_transforms must contain at least one transform.")

    # Per model: one prediction array for each TTA pass.
    all_preds = {name: [] for name in models}
    # Labels from the first pass; later passes are checked against them.
    all_labels = None

    for tform in tta_transforms:
        loader = DataLoader(
            ChestXrayDataset(df, root_dir, tform),
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers
        )

        preds_this_tta = {name: [] for name in models}
        labels_this_tta = []

        # Collect predictions for this TTA setting
        with torch.no_grad():
            for images, labels in loader:
                images = images.to(device)

                for name, model in models.items():
                    outputs = model(images)
                    preds = torch.sigmoid(outputs)  # Apply sigmoid for multi-label probabilities
                    preds_this_tta[name].append(preds.cpu().numpy())

                # Labels are only used as NumPy arrays; no device transfer needed.
                labels_this_tta.append(labels.cpu().numpy())

        # Concatenate and store predictions
        for name in models:
            all_preds[name].append(np.concatenate(preds_this_tta[name], axis=0))

        # Sanity check as soon as the pass finishes, so a mismatch fails fast
        # and only one copy of the labels is kept.
        concatenated_labels = np.concatenate(labels_this_tta, axis=0)
        if all_labels is None:
            all_labels = concatenated_labels
        elif concatenated_labels.shape != all_labels.shape or not np.array_equal(concatenated_labels, all_labels):
            raise ValueError("Label mismatch across TTAs — check consistency of test DataLoader.")

    # Average predictions across TTAs for each model
    all_preds_avg = {}
    for name in models:
        stacked_preds = np.stack(all_preds[name], axis=0)  # Shape: (num_tta, num_samples, num_classes)
        all_preds_avg[name] = np.mean(stacked_preds, axis=0)  # Shape: (num_samples, num_classes)

    return all_preds_avg, all_labels

# Run every model on the test set under each TTA view and average per model.
test_preds_tta, test_labels_tta = collect_tta_predictions(test_df, img_dir, models, device, tta_transforms)

# Blend with the DE-tuned weights. These weights were fitted on validation
# predictions made with the plain (non-TTA) transform and are reused as-is.
ensemble_test_preds_tta = sum(
    w * test_preds_tta[name]
    for w, name in zip(final_weights, selected_models)
)

# Per-class AUROC of the TTA ensemble, then the unweighted mean over classes.
per_class_aurocs_tta = [
    roc_auc_score(test_labels_tta[:, col], ensemble_test_preds_tta[:, col])
    for col in range(test_labels_tta.shape[1])
]
mean_test_auroc_tta = np.mean(per_class_aurocs_tta)

# Report
print("\nFinal Test Ensemble Evaluation with TTA:")
print("Selected Models: {}".format(selected_models))
print("Optimized Weights:")
for name, weight in zip(selected_models, final_weights):
    print("{}: {:.4f}".format(name, weight))
print("\nMean Test AUROC (TTA): {:.4f}".format(mean_test_auroc_tta))
print("\nPer-Class Test AUROC (TTA):")
for cls, auc in zip(CLASSES, per_class_aurocs_tta):
    print("{}: {:.4f}".format(cls, auc))




Final Test Ensemble Evaluation with TTA:
Selected Models: ['coatnet', 'convnext', 'densenet', 'maxvit', 'vgg19', 'swin']
Optimized Weights:
coatnet: 0.1524
convnext: 0.1877
densenet: 0.2052
maxvit: 0.1663
vgg19: 0.1138
swin: 0.1747

Mean Test AUROC (TTA): 0.8577

Per-Class Test AUROC (TTA):
No Finding: 0.8021
Atelectasis: 0.8372
Cardiomegaly: 0.9170
Effusion: 0.8953
Infiltration: 0.7376
Mass: 0.8829
Nodule: 0.8108
Pneumonia: 0.7873
Pneumothorax: 0.8971
Consolidation: 0.8193
Edema: 0.9158
Emphysema: 0.9454
Fibrosis: 0.8473
Pleural_Thickening: 0.8443
Hernia: 0.9257
