# Cancer Detection - High Accuracy Training (DenseNet121 + ResNet50 + EfficientNet-B0 Ensemble)

This notebook trains an ensemble of DenseNet121, ResNet50, and EfficientNet-B0 models on the Histopathologic Cancer Detection dataset.
Run this in Google Colab with a GPU runtime for best results.

In [1]:
# 1. Setup Environment
# Installs the third-party packages this notebook relies on into the current environment.
# NOTE(review): opencv-python is installed here but cv2 is not imported in any visible
# cell, while PIL (Pillow) is imported later without being installed here - confirm.
!pip install pandas numpy opencv-python matplotlib tqdm scikit-learn



In [None]:
# Install PyTorch, torchvision and torchaudio from the cu118 wheel index.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 

In [None]:
# 2. Data Setup (Google Drive)
import os
import sys
import zipfile
import shutil

# DETECT ENVIRONMENT
# Colab is recognised either by its pre-populated sample_data folder or by the
# google.colab package already being loaded in this kernel.
is_colab = os.path.exists('/content/sample_data') or 'google.colab' in sys.modules

if not is_colab:
    print("✅ Running Locally. Checking for files...")

    # Local runs only report which image folder is present; nothing is copied.
    if os.path.exists('train'):
        print("Found 'train' folder!")
    elif os.path.exists('train_images'):
        print("Found 'train_images' folder!")
    else:
        print("Looking for 'train' or 'train_images'...")
        print(f"Files in {os.getcwd()}: {os.listdir()}")
else:
    print("⚠️ Running in Google Colab.")
    from google.colab import drive

    # Mount Drive
    print("Mounting Google Drive...")
    drive.mount('/content/drive')

    # DEFINE YOUR DRIVE PATH HERE
    # 'train.zip' and 'train_labels.csv' are expected directly inside this folder.
    DRIVE_PATH = '/content/drive/MyDrive'
    print(f"Looking for files in: {DRIVE_PATH}")

    drive_zip = os.path.join(DRIVE_PATH, 'train.zip')
    drive_csv = os.path.join(DRIVE_PATH, 'train_labels.csv')

    # Images: unpack the archive into the working directory, unless a 'train'
    # folder is already there (e.g. from a previous run of this cell).
    if not os.path.exists(drive_zip):
        print(f"❌ Error: 'train.zip' not found at {drive_zip}")
        print("Please check the path and filename.")
    elif os.path.exists('train'):
        print("Train folder already exists. Skipping extraction.")
    else:
        print(f"Found {drive_zip}. Extracting to local Colab runtime...")
        with zipfile.ZipFile(drive_zip, 'r') as zip_ref:
            zip_ref.extractall('.')
        print("Extraction complete!")

    # Labels: copy the CSV next to the notebook.
    if not os.path.exists(drive_csv):
        print(f"❌ Error: 'train_labels.csv' not found at {drive_csv}")
    else:
        print(f"Found {drive_csv}. Copying...")
        shutil.copy(drive_csv, '.')
        print("CSV copied.")


In [None]:
# NOTE(review): this repeats the torch install command from an earlier cell of this
# notebook; running it again is redundant, so this cell can probably be removed.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# 3. Imports
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# 4. Data Loader Class (STRICT loading)
class CancerDetectionDataset(Dataset):
    """Map-style dataset that loads one image per id from a directory.

    Each id is resolved to ``<image_dir>/<id>.tif`` first and then
    ``<image_dir>/<id>.png``; the first file that exists and can be decoded
    is used.

    Args:
        image_ids: Indexable sequence of image ids (file names without extension).
        labels: Indexable sequence of labels aligned position-by-position
            with ``image_ids``.
        image_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded RGB PIL image.

    Note:
        Loading is best-effort. When no candidate file exists or decodes,
        ``__getitem__`` returns ``(None, label)`` instead of raising, so
        consumers must be prepared to skip ``None`` images.
    """

    # File extensions tried, in order, when resolving an image id to a path.
    EXTENSIONS = ('.tif', '.png')

    def __init__(self, image_ids, labels, image_dir, transform=None):
        self.image_ids = image_ids
        self.labels = labels
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of image ids in the dataset."""
        return len(self.image_ids)

    def _load_image(self, img_id):
        """Return the image for ``img_id`` converted to RGB, or ``None`` if unavailable."""
        for ext in self.EXTENSIONS:
            path = os.path.join(self.image_dir, f"{img_id}{ext}")
            if not os.path.exists(path):
                continue
            try:
                # convert() returns a new, fully loaded image, so the file
                # handle can be released as soon as the with-block exits.
                with Image.open(path) as img:
                    return img.convert('RGB')
            except Exception:
                # Unreadable or corrupt file: try the next extension. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                continue
        return None

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``.

        ``image`` is the transformed image, or ``None`` when loading failed
        (the transform is not applied in that case).
        """
        image = self._load_image(self.image_ids[idx])

        if image is not None and self.transform:
            image = self.transform(image)

        return image, self.labels[idx]

In [None]:
# 5. Model Architecture (ENSEMBLE: DenseNet + ResNet + EfficientNet)
class DenseNetClassifier(nn.Module):
    """DenseNet121 with its classifier replaced by a small single-logit head.

    The backbone is created with the default torchvision weights. The new head
    is Dropout(0.3) -> Linear(in, 512) -> ReLU -> Dropout(0.4) -> Linear(512, 1),
    so ``forward`` returns one raw logit per sample (no sigmoid is applied).

    Args:
        freeze_backbone: When True, gradients are disabled for every parameter
            under ``backbone.features``; the new head stays trainable.
    """

    def __init__(self, freeze_backbone=True):
        super().__init__()
        self.backbone = models.densenet121(weights=models.DenseNet121_Weights.DEFAULT)

        if freeze_backbone:
            self.backbone.features.requires_grad_(False)

        in_dim = self.backbone.classifier.in_features
        head_layers = [
            nn.Dropout(p=0.3),
            nn.Linear(in_dim, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.4),
            nn.Linear(512, 1),
        ]
        self.backbone.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run ``x`` through the backbone and head and return the logits."""
        logits = self.backbone(x)
        return logits

    def unfreeze_backbone(self):
        """Re-enable gradients for every parameter under ``backbone.features``."""
        self.backbone.features.requires_grad_(True)

class ResNetClassifier(nn.Module):
    """ResNet50 with its ``fc`` layer replaced by a small single-logit head.

    The backbone is created with the default torchvision weights. The new head
    is Dropout(0.3) -> Linear(in, 512) -> ReLU -> Dropout(0.4) -> Linear(512, 1),
    so ``forward`` returns one raw logit per sample (no sigmoid is applied).

    Args:
        freeze_backbone: When True, gradients are disabled for all backbone
            parameters before the head is swapped in, so only the freshly
            created head remains trainable.
    """

    def __init__(self, freeze_backbone=True):
        super().__init__()
        self.backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

        # Freeze first, replace ``fc`` afterwards: the new layers are created
        # with gradients enabled and are therefore not affected by the freeze.
        if freeze_backbone:
            self.backbone.requires_grad_(False)

        in_dim = self.backbone.fc.in_features
        head_layers = [
            nn.Dropout(p=0.3),
            nn.Linear(in_dim, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.4),
            nn.Linear(512, 1),
        ]
        self.backbone.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run ``x`` through the backbone and head and return the logits."""
        logits = self.backbone(x)
        return logits

    def unfreeze_backbone(self):
        """Re-enable gradients for every backbone parameter (head included)."""
        self.backbone.requires_grad_(True)

class EfficientNetClassifier(nn.Module):
    """EfficientNet-B0 with its classifier replaced by a small single-logit head.

    The backbone is created with the default torchvision weights. The new head
    is Dropout(0.3) -> Linear(in, 512) -> ReLU -> Dropout(0.4) -> Linear(512, 1),
    so ``forward`` returns one raw logit per sample (no sigmoid is applied).

    Args:
        freeze_backbone: When True, gradients are disabled for every parameter
            under ``backbone.features``; the new head stays trainable.
    """

    def __init__(self, freeze_backbone=True):
        super().__init__()
        self.backbone = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)

        if freeze_backbone:
            self.backbone.features.requires_grad_(False)

        # The input width is read from element [1] of the stock classifier.
        in_dim = self.backbone.classifier[1].in_features
        head_layers = [
            nn.Dropout(p=0.3),
            nn.Linear(in_dim, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.4),
            nn.Linear(512, 1),
        ]
        self.backbone.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run ``x`` through the backbone and head and return the logits."""
        logits = self.backbone(x)
        return logits

    def unfreeze_backbone(self):
        """Re-enable gradients for every parameter under ``backbone.features``."""
        self.backbone.features.requires_grad_(True)


In [None]:
# 6. Training Logic (Updated for Logits)
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and report loss and accuracy.

    Args:
        model: Network whose output is compared against labels reshaped to
            ``(batch, 1)``, i.e. one logit per sample.
        dataloader: Iterable of ``(images, labels)`` batches. A batch whose
            ``images`` is ``None`` is skipped.
        criterion: Loss called as ``criterion(logits, float_labels)``; assumed
            to return a batch mean (as BCEWithLogitsLoss does by default).
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and
        the accuracy at a 0.5 probability threshold, both computed over the
        samples that were actually processed. ``epoch_loss`` is ``0.0`` when
        no sample was processed.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc="Training")
    for images, labels in progress_bar:
        if images is None:
            continue  # Skip failed loads

        images = images.to(device)
        labels = labels.float().unsqueeze(1).to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = images.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

        # Store flat Python scalars so predictions and labels share one shape.
        preds = (torch.sigmoid(outputs) > 0.5).float()
        all_preds.extend(preds.reshape(-1).tolist())
        all_labels.extend(labels.reshape(-1).tolist())
        progress_bar.set_postfix({'loss': loss.item()})

    # Normalise by the samples really seen: the dataset length over-counts
    # whenever samples are dropped during collation or a batch is skipped.
    epoch_loss = running_loss / samples_seen if samples_seen else 0.0
    epoch_acc = accuracy_score(all_labels, all_preds)
    return epoch_loss, epoch_acc

def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Network whose output is compared against labels reshaped to
            ``(batch, 1)``, i.e. one logit per sample.
        dataloader: Iterable of ``(images, labels)`` batches. A batch whose
            ``images`` is ``None`` is skipped.
        criterion: Loss called as ``criterion(logits, float_labels)``; assumed
            to return a batch mean (as BCEWithLogitsLoss does by default).
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and
        the accuracy at a 0.5 probability threshold, both computed over the
        samples that were actually processed. ``epoch_loss`` is ``0.0`` when
        no sample was processed.
    """
    model.eval()
    running_loss = 0.0
    samples_seen = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Validating"):
            if images is None:
                continue
            images = images.to(device)
            labels = labels.float().unsqueeze(1).to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            batch_size = images.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size

            # Store flat Python scalars so predictions and labels share one shape.
            preds = (torch.sigmoid(outputs) > 0.5).float()
            all_preds.extend(preds.reshape(-1).tolist())
            all_labels.extend(labels.reshape(-1).tolist())

    # Normalise by the samples really seen: the dataset length over-counts
    # whenever samples are dropped during collation or a batch is skipped.
    epoch_loss = running_loss / samples_seen if samples_seen else 0.0
    epoch_acc = accuracy_score(all_labels, all_preds)
    return epoch_loss, epoch_acc

In [None]:
# 7. Configuration & Setup
import sys
import os

# DETECT ENVIRONMENT AGAIN
is_colab = os.path.exists('/content/sample_data') or 'google.colab' in sys.modules

# Both environments read the labels from the working directory, and 'train' is
# the image folder unless a local run only has 'train_images'.
CSV_PATH = 'train_labels.csv'
IMAGE_DIR = 'train'

if is_colab:
    print("Running in Google Colab.")
elif not os.path.exists('train') and os.path.exists('train_images'):
    # Standard Local Config: fall back to the alternative folder name.
    IMAGE_DIR = 'train_images'

print(f"Using Image Directory: {IMAGE_DIR}")
print(f"Using CSV Path: {CSV_PATH}")

# Training hyper-parameters
BATCH_SIZE = 64
NUM_EPOCHS = 15
# Upper bound on the number of labelled rows used; the data-loading cell takes
# a stratified subset of this size to shorten training.
MAX_SAMPLES = 60_000

# Per-channel statistics handed to Normalize in both pipelines.
# NOTE(review): these look like the usual ImageNet statistics - confirm.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: random flips, rotation and colour jitter, then tensor
# conversion and normalisation.
_train_steps = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(90),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
transform_train = transforms.Compose(_train_steps)

# Validation pipeline: no augmentation, only tensor conversion and normalisation.
_val_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
transform_val = transforms.Compose(_val_steps)

def collate_fn(batch):
    """Collate ``(image, label)`` samples, dropping those whose image is ``None``.

    Args:
        batch: List of ``(image, label)`` pairs, where ``image`` is ``None``
            for a sample that could not be loaded.

    Returns:
        The default-collated batch of the remaining samples, or
        ``(None, None)`` when every sample in the batch was dropped, so that
        loops which skip ``images is None`` can move on instead of crashing.
    """
    valid = [sample for sample in batch if sample[0] is not None]
    if not valid:
        # default_collate cannot handle an empty list, so signal "nothing
        # usable in this batch" explicitly.
        return None, None
    return torch.utils.data.dataloader.default_collate(valid)


In [None]:
# 8. Load Data & VERIFY (Stratified Sampling)
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Report up front whether the labels file and the image folder configured above exist.
print("=== DEBUGGING DATA LOADING ===")
print(f"Looking for CSV at: {CSV_PATH}")
print(f"Looking for Images at: {IMAGE_DIR}")

csv_found = os.path.exists(CSV_PATH)
print(f"OK: {CSV_PATH} found" if csv_found else f"CRITICAL ERROR: {CSV_PATH} missing.")

images_found = os.path.exists(IMAGE_DIR)
print(f"OK: {IMAGE_DIR} found" if images_found else f"CRITICAL ERROR: {IMAGE_DIR} missing.")

# Everything below runs only when the labels CSV exists; otherwise none of the
# names defined here (df, the id/label splits, the datasets and loaders) are created.
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
    
    # STRATIFIED SAMPLING to reduce size
    # A falsy MAX_SAMPLES (None or 0) disables the cap.
    if MAX_SAMPLES and MAX_SAMPLES < len(df):
        print(f"Reducing dataset from {len(df)} to {MAX_SAMPLES} (Stratified)...")
        # Use simple train_test_split to get a stratified subset:
        # the "train" part is kept as the subset and the remainder is discarded.
        df, _ = train_test_split(df, train_size=MAX_SAMPLES, stratify=df['label'], random_state=42)
        print(f"New dataset size: {len(df)}")

    ids = df['id'].tolist()
    labels = df['label'].tolist()

    # Split Train/Val/Test: 15% is held out as a test split first, then 15% of the
    # remainder becomes the validation split. Both splits are stratified by label.
    # NOTE(review): test_ids / test_labels are not used by any later cell visible
    # in this notebook - confirm whether a final test evaluation is intended.
    train_ids, test_ids, train_labels, test_labels = train_test_split(ids, labels, test_size=0.15, stratify=labels, random_state=42)
    train_ids, val_ids, train_labels, val_labels = train_test_split(train_ids, train_labels, test_size=0.15, stratify=train_labels, random_state=42)

    print("\n=== TESTING IMAGE LOADING ===")
    label_map = {0: 'No Cancer', 1: 'Cancer'}
    
    # Try to find one example of each class to verify labels are correct
    try:
        pos_idx = next(i for i, x in enumerate(train_labels) if x == 1)
        neg_idx = next(i for i, x in enumerate(train_labels) if x == 0)
        indices_to_check = [pos_idx, neg_idx]
        
        sample_ids = [train_ids[i] for i in indices_to_check]
        sample_labels = [train_labels[i] for i in indices_to_check]
    except StopIteration:
        # next() found no sample for one of the classes: fall back to the first three samples.
        print("Warning: Could not find examples of both classes in current split.")
        sample_ids = train_ids[:3]
        sample_labels = train_labels[:3]

    # Smoke test without a transform: the dataset returns None as the image when a
    # file could not be loaded, which is reported below instead of raising.
    test_ds = CancerDetectionDataset(sample_ids, sample_labels, IMAGE_DIR, transform=None)
    
    try:
        for i in range(len(test_ds)):
            img, lbl = test_ds[i]
            if img is not None:
                lbl_name = label_map.get(lbl, 'Unknown')
                print(f"✓ Loaded image {sample_ids[i]} (Label: {lbl} - {lbl_name})")
            else:
                print(f"x Failed to load image {sample_ids[i]} (File not found)")
    except Exception as e:
        # Any unexpected error is printed; the cell then continues to build the loaders.
        print(f"\n!!! FAILURE !!!")
        print(f"Error: {e}")

    # Create Loaders
    # Training data gets the augmenting transform, validation data the plain one.
    train_dataset = CancerDetectionDataset(train_ids, train_labels, IMAGE_DIR, transform=transform_train)
    val_dataset = CancerDetectionDataset(val_ids, val_labels, IMAGE_DIR, transform=transform_val)

    # Both loaders use BATCH_SIZE, two worker processes and the custom collate_fn;
    # only the training loader shuffles.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, collate_fn=collate_fn)

    print(f"\nSUCCESS: Ready to train on {len(train_dataset)} samples")


In [None]:
# 9. Run Training (ENSEMBLE TRAINING LOOP)
# All three classifiers are constructed up front (each with a frozen backbone)
# and then trained one after another with the same recipe.
models_to_train = [
    ('DenseNet', DenseNetClassifier(freeze_backbone=True)),
    ('ResNet', ResNetClassifier(freeze_backbone=True)),
    ('EfficientNet', EfficientNetClassifier(freeze_backbone=True))
]

# name -> model with its best checkpoint reloaded; consumed by the ensemble evaluation.
trained_models = {}

for name, model in models_to_train:
    print(f"\n{'='*20}\nTraining {name}...\n{'='*20}")
    model = model.to(device)
    # The models output raw logits, hence the logits variant of BCE.
    criterion = nn.BCEWithLogitsLoss()
    # The optimizer is given every parameter, including the currently frozen ones,
    # so the backbone is already registered when it is unfrozen later.
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Halve the learning rate when the validation loss has not improved for 2 epochs.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    
    best_acc = 0.0
    best_model_path = f'best_model_{name.lower()}.pth'
    
    # Train Loop
    for epoch in range(NUM_EPOCHS):
        print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
        
        loss, acc = train_epoch(model, train_loader, criterion, optimizer, device)
        print(f"Train Loss: {loss:.4f} | Acc: {acc:.4f}")
        
        val_loss, val_acc = validate(model, val_loader, criterion, device)
        print(f"Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f}")
        
        scheduler.step(val_loss)
        
        # Checkpoint on strictly better validation accuracy.
        # NOTE(review): if val_acc never exceeds 0.0 no file is written and the
        # reload after the loop would fail - confirm this cannot happen.
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"Saved best {name} model (Acc: {best_acc:.4f})")
            
        # Fine-tuning: at the end of the 6th epoch (index 5) unfreeze the backbone
        # and force the learning rate of every param group to 1e-4.
        # NOTE(review): this never triggers when NUM_EPOCHS <= 5, and 5 is not the
        # midpoint of the 15 configured epochs - confirm the intended schedule.
        if epoch == 5:
            print("Unfreezing backbone for fine-tuning...")
            model.unfreeze_backbone()
            for param_group in optimizer.param_groups:
                param_group['lr'] = 0.0001

    # Reload best weights
    model.load_state_dict(torch.load(best_model_path))
    trained_models[name] = model

print("\nALL MODELS TRAINED!")


In [None]:
# 10. Ensemble Evaluation
print("Evaluating Ensemble Performance...")

def evaluate_ensemble(models_dict, dataloader, device, threshold=0.5):
    """Score a probability-averaging ensemble on ``dataloader``.

    Every model is switched to eval mode. For each batch the sigmoid of each
    model's logits is averaged across models, and the average is compared
    against ``threshold`` to obtain the predicted class.

    Args:
        models_dict: Mapping of name -> model; only the values are used.
        dataloader: Iterable of ``(images, labels)`` batches. A batch whose
            ``images`` is ``None`` is skipped.
        device: Device the images are moved to; the models are expected to
            already live there.
        threshold: Probability above which a sample is predicted as class 1.

    Returns:
        Accuracy of the ensemble predictions as returned by ``accuracy_score``.

    Raises:
        ValueError: If ``models_dict`` is empty.
    """
    if not models_dict:
        raise ValueError("models_dict must contain at least one model")

    members = list(models_dict.values())
    for model in members:
        model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Ensemble Validating"):
            if images is None:
                continue
            images = images.to(device)

            # Stack the per-model probabilities along a new leading axis and
            # average over it.
            member_probs = torch.stack([torch.sigmoid(model(images)) for model in members])
            avg_probs = member_probs.mean(dim=0)

            # Flatten both sides to plain Python ints so predictions and labels
            # reach accuracy_score with the same shape and type. Labels are
            # only compared on the host, so they never need to visit the device.
            preds = (avg_probs > threshold).long()
            all_preds.extend(preds.reshape(-1).tolist())
            all_labels.extend(torch.as_tensor(labels).reshape(-1).tolist())

    return accuracy_score(all_labels, all_preds)

# Accuracy on the validation split. This split also drove checkpoint selection
# during training, so the figure is an optimistic estimate.
ensemble_acc = evaluate_ensemble(trained_models, val_loader, device)
print(f"\n>>> ENSEMBLE ACCURACY: {ensemble_acc:.4f} <<<")

# Accuracy on the held-out test split, which was carved out during data loading
# and never seen during training or checkpoint selection. The loader mirrors the
# validation loader (no augmentation, no shuffling).
test_dataset = CancerDetectionDataset(test_ids, test_labels, IMAGE_DIR, transform=transform_val)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, collate_fn=collate_fn)
ensemble_test_acc = evaluate_ensemble(trained_models, test_loader, device)
print(f"\n>>> ENSEMBLE TEST ACCURACY: {ensemble_test_acc:.4f} <<<")
