##1.0 Install Required Packages

In [1]:
# Install required packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
!pip install numpy pillow tqdm opencv-python

print("‚úÖ All packages installed successfully!")

Looking in indexes: https://download.pytorch.org/whl/cu126
‚úÖ All packages installed successfully!


##2.0 Mount Drive

In [2]:
# Mount Google Drive at /content/drive. The data directories configured later in
# this notebook (BASE_DIR and its sub-folders) live under this mount point, so
# this cell must run before any cell that touches them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##3.0 Imports and Configurations

In [3]:
# Environment setup for Colab
import os
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from pathlib import Path
from tqdm import tqdm

# Check environment
print("=== Environment Check ===")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
print(f"NumPy: {np.__version__}")

# Device setup: resolve the compute device exactly once. `DEVICE` is the name the
# rest of the notebook uses; the lowercase `device` is kept as an alias of the
# same object so the two can never disagree.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = DEVICE
print(f"Using device: {device}")

# Test basic functionality: allocate a small tensor on the selected device.
print("\n=== Basic Tests ===")
x = torch.randn(3, 3).to(device)
print(f"Tensor on {device}: {x.shape}")

# Directories (all located on the mounted Google Drive)
BASE_DIR = Path("/content/drive/MyDrive/Colab Notebooks/WSI-MIL-Pipeline")
PATCHES_DIR = BASE_DIR / "data/patches"    # input: one sub-folder of patch images per slide
FEATURES_DIR = BASE_DIR / "data/features"  # output: one .npy feature file per slide

# Ensure directories exist
PATCHES_DIR.mkdir(parents=True, exist_ok=True)
FEATURES_DIR.mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Using device: {DEVICE}")
print(f"‚úÖ Patches directory: {PATCHES_DIR}")
print(f"‚úÖ Features directory: {FEATURES_DIR}")

=== Environment Check ===
PyTorch: 2.8.0+cu126
CUDA: False
GPU: None
NumPy: 2.0.2
Using device: cpu

=== Basic Tests ===
Tensor on cpu: torch.Size([3, 3])
‚úÖ Using device: cpu
‚úÖ Patches directory: /content/drive/MyDrive/Colab Notebooks/WSI-MIL-Pipeline/data/patches
‚úÖ Features directory: /content/drive/MyDrive/Colab Notebooks/WSI-MIL-Pipeline/data/features


##4.0 Define Feature Extractor Model (Node-Level Feature Extraction)

In [4]:
def get_feature_extractor(model_name="resnet50", pretrained=True):
    """Build a CNN backbone with its classification head removed.

    Args:
        model_name: "resnet50" or "efficientnet_b0".
        pretrained: If True, load the torchvision IMAGENET1K_V1 weights;
            otherwise the network is created without pretrained weights.

    Returns:
        A ``(model, feature_dim)`` tuple. The model has been moved to the
        global ``DEVICE`` and put in eval mode. ``feature_dim`` is 2048 for
        "resnet50" and 1280 for "efficientnet_b0". The output is not
        flattened here; callers flatten it themselves.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    wants_resnet = model_name == "resnet50"
    wants_efficientnet = model_name == "efficientnet_b0"
    if not (wants_resnet or wants_efficientnet):
        raise ValueError(f"Unsupported model: {model_name}")

    if wants_resnet:
        chosen_weights = None
        if pretrained:
            chosen_weights = models.ResNet50_Weights.IMAGENET1K_V1
        backbone = models.resnet50(weights=chosen_weights)
        # Keep every child module except the last one (the final FC layer).
        extractor = nn.Sequential(*list(backbone.children())[:-1])
        embedding_size = 2048
    else:
        chosen_weights = None
        if pretrained:
            chosen_weights = models.EfficientNet_B0_Weights.IMAGENET1K_V1
        extractor = models.efficientnet_b0(weights=chosen_weights)
        # Replace the classifier with a pass-through so the model emits features.
        extractor.classifier = nn.Identity()
        embedding_size = 1280

    extractor.to(DEVICE)
    extractor.eval()
    return extractor, embedding_size

# Initialize feature extractor
# Built once at module level: `feature_extractor` is used as a global by the
# extraction functions defined in later cells, and FEATURE_DIM records the
# embedding size reported by get_feature_extractor for the chosen backbone.
feature_extractor, FEATURE_DIM = get_feature_extractor("resnet50")
print(f"‚úÖ Loaded feature extractor (dim={FEATURE_DIM})")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97.8M/97.8M [00:00<00:00, 192MB/s]


‚úÖ Loaded feature extractor (dim=2048)


##5.0 Preprocessing Transform (for Feature Extraction)

In [5]:
# Define transform globally so all functions can access it
# (it is read as a global by test_transform and extract_features_from_patch).
transform = transforms.Compose([
    # Force every patch to 224x224, whatever its original size.
    transforms.Resize((224, 224)),
    # PIL image -> torch tensor with channels first.
    transforms.ToTensor(),
    # Per-channel normalization. NOTE(review): these look like the usual ImageNet
    # mean/std that go with ImageNet-pretrained torchvision weights - confirm
    # against the torchvision documentation for the weights in use.
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    ),
])

print("‚úÖ Image preprocessing transform defined")

# Test the transform
def test_transform():
    """Smoke-test the global ``transform`` on a synthetic image.

    Returns:
        True if the transform ran (the resulting tensor shape is printed),
        False if any exception was raised (the error is printed instead).
    """
    try:
        # A solid red 100x100 RGB image is enough to exercise the pipeline.
        sample_image = Image.new('RGB', (100, 100), color='red')
        result = transform(sample_image)
        print(f"‚úÖ Transform test passed: {result.shape}")
    except Exception as err:
        print(f"‚ùå Transform test failed: {err}")
        return False
    else:
        return True

# Run the smoke test now. As the last expression of the cell, its boolean
# result is also shown as the cell output.
test_transform()

‚úÖ Image preprocessing transform defined
‚úÖ Transform test passed: torch.Size([3, 224, 224])


True

##6.0 Feature Extraction Function

In [6]:
def extract_features_from_patch(patch_path, model):
    """Extract a flat feature vector from a single patch image.

    The image is converted to RGB, passed through the global ``transform``,
    moved to the global ``DEVICE`` as a batch of one, and run through
    ``model`` with gradients disabled.

    Args:
        patch_path: Location of the patch image, as a ``str`` or
            ``pathlib.Path``.
        model: Callable feature extractor; it is called with a single-image
            batch that lives on ``DEVICE``.

    Returns:
        A 1-D NumPy array holding the flattened model output, or ``None`` if
        anything went wrong. Failures are reported with ``print`` and are
        deliberately not raised, so one bad patch does not abort a whole slide.
    """
    try:
        # The context manager releases the underlying file handle promptly;
        # convert() returns an independent, fully loaded image.
        with Image.open(patch_path) as img:
            img_t = transform(img.convert("RGB")).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            features = model(img_t)
            # reshape(-1) flattens like view(-1) but also accepts
            # non-contiguous model outputs.
            features = features.reshape(-1).cpu().numpy()
        return features

    except Exception as e:
        # Path(...) makes the report work for plain string paths too; the bare
        # `patch_path.name` used to raise AttributeError for a str and turned a
        # handled failure into a crash.
        print(f"‚ùå Failed to process {Path(patch_path).name}: {str(e)[:100]}...")
        return None

# Test the feature extraction
def test_feature_extraction():
    """Smoke-test ``extract_features_from_patch`` on a random synthetic patch.

    A random 224x224 RGB image is written to a temporary directory, run
    through the global ``feature_extractor``, and the temporary directory is
    removed again on every path (success, failure, or exception).

    Returns:
        True if a feature vector was produced (its shape is printed),
        False otherwise (the reason is printed).
    """
    import tempfile  # local import: only this smoke test needs it

    try:
        # Create a test image; randint's upper bound is exclusive, so 256
        # covers the full uint8 range.
        test_img = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
        test_img_pil = Image.fromarray(test_img)

        # A private temporary directory avoids clashing with an existing
        # /tmp/test_patch.png and is cleaned up even when extraction fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            test_path = Path(tmp_dir) / "test_patch.png"
            test_img_pil.save(test_path)

            # Extract features while the file still exists.
            features = extract_features_from_patch(test_path, feature_extractor)

        if features is not None:
            print(f"‚úÖ Feature extraction test passed: {features.shape}")
            return True

        print("‚ùå Feature extraction test failed")
        return False
    except Exception as e:
        print(f"‚ùå Feature extraction test error: {e}")
        return False

# Run the end-to-end smoke test now. As the last expression of the cell, its
# boolean result is also shown as the cell output.
test_feature_extraction()

‚úÖ Feature extraction test passed: (2048,)


True

##7.0 Slide Level Loop

In [7]:
def extract_slide_features(slide_dir, output_dir):
    """Extract patch-level (node) features for a single slide.

    Every ``*.png`` file directly inside ``slide_dir`` is passed through the
    global ``feature_extractor`` and the resulting vectors are stacked into
    one array saved as ``<slide_name>_features.npy`` in ``output_dir``.

    Args:
        slide_dir: Folder containing the patch images of one slide; its name
            is used as the slide name.
        output_dir: Folder receiving the feature file (created if missing).

    Returns:
        None. Progress and problems are reported with ``print``.

    Notes:
        * A slide whose output file already exists is skipped.
        * Rows follow the lexicographically sorted patch file names. Patches
          that fail to process are omitted, so row indices only line up with
          the sorted file list when every patch succeeded.
        * The array is written to a temporary file and then renamed, so an
          interrupted run cannot leave a truncated ``.npy`` that later runs
          would skip as "already processed".
    """
    slide_dir = Path(slide_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    slide_name = slide_dir.name
    out_path = output_dir / f"{slide_name}_features.npy"

    if out_path.exists():
        print(f"‚è≠Ô∏è  Skipping {slide_name} (already processed)")
        return

    patches = sorted(slide_dir.glob("*.png"))
    if not patches:
        print(f"‚ö†Ô∏è  No patches found in {slide_dir}")
        return

    print(f"üîç Extracting features for {slide_name} ({len(patches)} patches)")

    slide_features = []
    for patch_path in tqdm(patches, desc=f"{slide_name}", leave=False):
        feat = extract_features_from_patch(patch_path, feature_extractor)
        if feat is not None:
            slide_features.append(feat)

    if not slide_features:
        print(f"‚ùå Failed to extract any features from {slide_name}")
        return

    # Save node features
    slide_features = np.array(slide_features)

    # Write to a sibling temp file first. The ".tmp" suffix keeps it out of
    # "*.npy" globs, and passing a file object stops np.save appending ".npy".
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        with open(tmp_path, "wb") as fh:
            np.save(fh, slide_features)
        tmp_path.replace(out_path)
    except BaseException:
        # Includes KeyboardInterrupt: never leave a partial file behind.
        tmp_path.unlink(missing_ok=True)
        raise

    print(f"‚úÖ Saved {slide_features.shape[0]}/{len(patches)} features ‚Üí {out_path.name}")

##8.0 Batch Processing All Slides

In [8]:
def extract_all_slide_features(patches_root, features_root):
    """Extract features for every slide folder found under ``patches_root``.

    Each immediate sub-directory of ``patches_root`` is treated as one slide
    and handed to ``extract_slide_features``. An exception raised for one
    slide is printed and does not stop the remaining slides.

    Args:
        patches_root: Folder containing one sub-folder of patches per slide.
        features_root: Folder receiving the per-slide feature files.

    Returns:
        None. A summary is printed at the end.
    """
    patches_path = Path(patches_root)
    features_path = Path(features_root)

    # Check if patches directory exists
    if not patches_path.exists():
        print(f"‚ùå Patches directory doesn't exist: {patches_path}")
        print("üí° Please ensure the patches directory is created and contains slide folders")
        return

    # Get all slide directories, sorted so the processing order is deterministic.
    slide_dirs = sorted(d for d in patches_path.iterdir() if d.is_dir())

    if not slide_dirs:
        print(f"‚ö†Ô∏è  No slide directories found in {patches_path}")
        print("   The directory exists but is empty.")
        return

    print(f"üöÄ Found {len(slide_dirs)} slides to process")
    print("=" * 50)

    successful_slides = 0
    for slide_dir in slide_dirs:
        try:
            extract_slide_features(slide_dir, features_root)
        except Exception as e:
            print(f"‚ùå Error processing {slide_dir.name}: {e}")
            continue

        # extract_slide_features reports "no patches" / "nothing extracted" by
        # printing and returning, not by raising, so the absence of an exception
        # does not prove that features were written. Count a slide only when its
        # output file exists (the name must match the "<slide>_features.npy"
        # pattern used by extract_slide_features). Slides skipped because they
        # were already processed still count.
        if (features_path / f"{slide_dir.name}_features.npy").exists():
            successful_slides += 1

    print("=" * 50)
    print(f"üéâ Feature extraction completed!")
    print(f"   ‚úÖ Successful: {successful_slides}/{len(slide_dirs)} slides")
    print(f"   üìÅ Features saved to: {features_root}")

# Run full extraction
# Processes every slide folder under PATCHES_DIR and writes one feature file per
# slide into FEATURES_DIR (both are configured in the imports/config cell).
extract_all_slide_features(PATCHES_DIR, FEATURES_DIR)

üöÄ Found 3 slides to process
üîç Extracting features for test_001 (1015 patches)




‚úÖ Saved 1015/1015 features ‚Üí test_001_features.npy
üîç Extracting features for test_002 (1000 patches)




‚úÖ Saved 1000/1000 features ‚Üí test_002_features.npy
üîç Extracting features for test_003 (1000 patches)


                                                             

‚úÖ Saved 1000/1000 features ‚Üí test_003_features.npy
üéâ Feature extraction completed!
   ‚úÖ Successful: 3/3 slides
   üìÅ Features saved to: /content/drive/MyDrive/Colab Notebooks/WSI-MIL-Pipeline/data/features




##9.0 Final Verification

In [9]:
# Final verification: list every saved feature file together with its array shape.
print("\n" + "=" * 60)
print("FINAL VERIFICATION")
print("=" * 60)

# Collect the per-slide feature arrays written to FEATURES_DIR.
features_files = [*Path(FEATURES_DIR).glob("*.npy")]
print(f"üìä Features extracted: {len(features_files)} files")

if not features_files:
    print("‚ùå No features extracted yet. Check the pipeline above.")
else:
    # Each file is loaded only to report its shape.
    for feat_file in features_files:
        features = np.load(feat_file)
        print(f"   üìÅ {feat_file.name}: {features.shape} (patches x features)")

    print(f"\nüéâ Pipeline is working correctly!")
    print(f"   Next: You can proceed with graph construction and GNN training.")

# Release cached GPU memory and report what is still allocated.
torch.cuda.empty_cache()
print(f"üíæ GPU memory cleared: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")


FINAL VERIFICATION
üìä Features extracted: 3 files
   üìÅ test_001_features.npy: (1015, 2048) (patches x features)
   üìÅ test_002_features.npy: (1000, 2048) (patches x features)
   üìÅ test_003_features.npy: (1000, 2048) (patches x features)

üéâ Pipeline is working correctly!
   Next: You can proceed with graph construction and GNN training.
üíæ GPU memory cleared: 0.00 GB
