import os
os.environ['OPENCV_LOG_LEVEL'] = 'SILENT'
# Option 1 — Masked Contrastive (direct classification)

Direct classification using MobileNetV2 (no SSL pretraining). Keeps dataset, splits, and augmentations from the original option1 notebook.

In [None]:
# Imports & config
import os
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from sklearn.metrics import roc_auc_score
import cv2
import random

# Basic hyperparameters (tune as needed)
class CFG:
    """Training configuration read by the cells below.

    Every setting is a plain class attribute, so it can be read from the
    class itself or from an instance such as ``cfg``.
    """

    # Optimisation settings.
    lr = 1e-4
    epochs = 8
    batch_size = 32

    # Data settings.
    img_size = 224  # presumably the input side length in pixels; confirm against the dataset cell
    subset_size = None  # set to an int for faster testing; a falsy value keeps the full splits

    # Prefer the GPU when torch can see one, otherwise run on the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'


cfg = CFG()
print(cfg.device)

# Folder used for locally stored data; change it to your desired location.
CUSTOM_DATA_PATH = "datasets"

# Make sure the folder exists (including missing parents); an already
# existing folder is not an error.
Path(CUSTOM_DATA_PATH).mkdir(parents=True, exist_ok=True)

In [None]:
# Load labels and image paths (same dataset used previously).
# The dataset is fetched through kagglehub when possible; if that fails for any
# reason the current directory is used as the dataset root instead. Replace
# BASE_PATH with your dataset path if it is already downloaded elsewhere.
try:
    import kagglehub
    path = kagglehub.dataset_download("khanfashee/nih-chest-x-ray-14-224x224-resized")
    BASE_PATH = Path(path)
except Exception as exc:
    # Deliberate best-effort fallback (kagglehub missing, download failed, ...),
    # but say why it happened so a later "file not found" is easy to diagnose.
    print(f'kagglehub download unavailable ({exc!r}); falling back to the current directory')
    BASE_PATH = Path('.')  # change to dataset root if needed

print(f'Using dataset base: {BASE_PATH}')
df = pd.read_csv(BASE_PATH / 'Data_Entry_2017.csv')
images_dir = BASE_PATH / 'images-224' / 'images-224'
# Absolute/relative file location of every image listed in the CSV.
df['Image Path'] = [str(images_dir / p) for p in df['Image Index'].values]

DISEASE_CATEGORIES = [
    'Atelectasis','Cardiomegaly','Effusion','Infiltration','Mass',
    'Nodule','Pneumonia','Pneumothorax','Consolidation','Edema',
    'Emphysema','Fibrosis','Pleural_Thickening','Hernia'
]
# One 0/1 column per disease: 1 when the disease name occurs in the row's
# 'Finding Labels' string. regex=False keeps this a literal substring test,
# i.e. the same check as Python's ``in`` operator, done in one vectorised pass.
for disease in DISEASE_CATEGORIES:
    df[disease] = df['Finding Labels'].str.contains(disease, regex=False).astype('int64')

print(f'Loaded {len(df):,} images')
print(f'Diseases: {len(DISEASE_CATEGORIES)}')

In [None]:
# Patient-level split (same logic as originals): the patient IDs are what gets
# partitioned, and each image follows its patient into exactly one subset.
from sklearn.model_selection import train_test_split


def _rows_for(patients):
    """Return a copy of the rows of ``df`` whose 'Patient ID' is in *patients*."""
    belongs = df['Patient ID'].isin(patients)
    return df[belongs].copy()


unique_patients = df['Patient ID'].unique()
# First hold out 2% of the patients for testing, then 5.2% of the remaining
# patients for validation; both splits use the same fixed seed.
train_val_patients, test_patients = train_test_split(
    unique_patients, test_size=0.02, random_state=42
)
train_patients, val_patients = train_test_split(
    train_val_patients, test_size=0.052, random_state=42
)
train_df, val_df, test_df = (
    _rows_for(ids) for ids in (train_patients, val_patients, test_patients)
)

# Optional down-sampling for quick experiments: keep only the leading rows,
# with validation and test capped at 1/4 and 1/8 of the training cap.
if cfg.subset_size:
    train_df = train_df.head(cfg.subset_size)
    val_df = val_df.head(cfg.subset_size // 4)
    test_df = test_df.head(cfg.subset_size // 8)

print('Train/Val/Test sizes:', len(train_df), len(val_df), len(test_df))

In [None]:
# ============================================
# Step 4: Lung Segmentation (Pre-Computed)
# ============================================
# Lung segmentation has moved to precompute_lung_masks.ipynb.
# This notebook does not use lung masks for classification; this cell only
# prints a pointer to that notebook.

print("Lung segmentation handled by precompute_lung_masks.ipynb")


In [None]:
# MobileNetV2 classifier (adapted for multi-label output)
def get_mobilenet_v2(num_classes, pretrained=True):
    """Build a torchvision MobileNetV2 with a new ``num_classes``-way head.

    The replacement head outputs raw logits (there is no sigmoid layer), which
    is the input ``nn.BCEWithLogitsLoss`` expects; apply ``torch.sigmoid`` to
    the outputs to obtain per-class probabilities.

    Args:
        num_classes: Number of output units of the new linear layer.
        pretrained: When true, start from torchvision's ImageNet weights;
            otherwise the backbone is randomly initialised.

    Returns:
        The MobileNetV2 module with its classifier replaced by
        ``Dropout(0.2)`` followed by ``Linear(in_features, num_classes)``.
    """
    weights_enum = getattr(models, 'MobileNet_V2_Weights', None)
    if weights_enum is None:
        # Older torchvision releases only know the boolean flag.
        model = models.mobilenet_v2(pretrained=pretrained)
    else:
        # ``pretrained=`` is deprecated in favour of ``weights=``.
        # IMAGENET1K_V1 is the checkpoint ``pretrained=True`` used to load,
        # so the starting weights stay the same.
        model = models.mobilenet_v2(
            weights=weights_enum.IMAGENET1K_V1 if pretrained else None
        )
    # classifier[1] is the stock final Linear layer; reuse its input width.
    in_features = model.classifier[1].in_features
    model.classifier = nn.Sequential(
        nn.Dropout(0.2),
        nn.Linear(in_features, num_classes),
    )
    return model

# Build the network (one output per disease) and move it to the configured
# device before the optimiser captures its parameters.
model = get_mobilenet_v2(num_classes=len(DISEASE_CATEGORIES), pretrained=True)
model = model.to(cfg.device)

# Multi-label loss that operates on raw logits, plus the Adam optimiser.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=cfg.lr)

print(model)

In [None]:
# Training & validation loops
def train_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    Args:
        model: Module to optimise; it is switched to training mode.
        loader: Iterable yielding ``(imgs, targets)`` batches of tensors.
        criterion: Loss called as ``criterion(outputs, targets)``. The
            per-sample average below assumes it returns a batch mean.
        optimizer: Optimiser stepped once per batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The loss averaged over the samples actually processed in this pass.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_seen = 0
    for imgs, targets in loader:
        imgs = imgs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(imgs), targets)
        loss.backward()
        optimizer.step()
        batch_size = imgs.size(0)
        # Weight each batch loss by its size so a short last batch counts fairly.
        total_loss += loss.item() * batch_size
        num_seen += batch_size
    # Divide by the samples seen rather than len(loader.dataset): the two
    # differ when the loader skips samples (drop_last=True, samplers), and an
    # iterable dataset may have no length at all.
    return total_loss / num_seen

@torch.no_grad()
def validate(model, loader, device):
    """Score ``model`` on ``loader`` with per-class ROC AUC.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        loader: Iterable yielding ``(imgs, targets)`` batches; targets are
            stacked row-wise, one column per class.
        device: Device the images are moved to for the forward pass.

    Returns:
        ``(mean_auc, aucs)``: ``aucs`` holds one ROC AUC per target column,
        with ``np.nan`` where the score could not be computed, and
        ``mean_auc`` is their mean ignoring the NaN entries.
    """
    model.eval()
    pred_batches = []
    target_batches = []
    for imgs, targets in loader:
        logits = model(imgs.to(device))
        # The model emits logits; sigmoid turns them into probabilities.
        pred_batches.append(torch.sigmoid(logits).cpu().numpy())
        # .cpu() is a no-op for CPU targets and keeps the conversion working
        # if a loader ever hands back targets on another device.
        target_batches.append(targets.cpu().numpy())
    all_preds = np.vstack(pred_batches)
    all_targets = np.vstack(target_batches)
    aucs = []
    for class_targets, class_preds in zip(all_targets.T, all_preds.T):
        try:
            aucs.append(roc_auc_score(class_targets, class_preds))
        except ValueError:
            # roc_auc_score rejects columns it cannot score (e.g. only one
            # class present); record NaN and let any other error surface.
            aucs.append(np.nan)
    return np.nanmean(aucs), aucs

# Run training (small number of epochs by default), keeping the weights of the
# epoch with the highest mean validation AUC.
# NOTE(review): train_loader / val_loader are not defined in the cells shown
# here; they must exist before this cell runs. Confirm where they are built.
best_auc = 0.0
for epoch in range(cfg.epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, cfg.device)
    val_auc, per_class = validate(model, val_loader, cfg.device)
    print(f'Epoch {epoch+1}/{cfg.epochs} - Train loss: {train_loss:.4f} - Val AUC: {val_auc:.4f}')

    # A NaN val_auc compares as False here, so it never replaces the best.
    is_new_best = val_auc > best_auc
    if is_new_best:
        best_auc = val_auc
        torch.save(model.state_dict(), 'option1_mobilenetv2_best.pth')

print('Training finished. Best val AUC:', best_auc)