### Extract and Load Data

In [1]:
import h5py

import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf
import cv2

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from skimage.exposure import match_histograms
from sklearn.decomposition import PCA

import os

import pandas as pd

2024-12-10 11:03:00.335586: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-10 11:03:00.571149: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733857380.656871    9763 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733857380.675120    9763 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-10 11:03:00.807926: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
data_path = '../Data/metastatic-tissue-classification-patchcamelyon'

print(os.listdir(data_path))

pcam_path = data_path + '/pcam'
pcam_training_file = os.path.join(pcam_path, "training_split.h5")
pcam_validation_file = os.path.join(pcam_path, "validation_split.h5")
pcam_test_file = os.path.join(pcam_path, "test_split.h5")

with h5py.File(pcam_training_file, 'r') as f:
  training_data = f['x'][:]

with h5py.File(pcam_validation_file, 'r') as f:
  val_data = f['x'][:]

with h5py.File(pcam_test_file, 'r') as f:
  test_data = f['x'][:]

# Labels
labels_path = data_path + '/Labels/Labels'
label_training_file = os.path.join(labels_path, "camelyonpatch_level_2_split_train_y.h5")
label_validation_file = os.path.join(labels_path, "camelyonpatch_level_2_split_valid_y.h5")
label_test_file = os.path.join(labels_path, "camelyonpatch_level_2_split_test_y.h5")

with h5py.File(label_training_file, 'r') as f:
  training_labels = f['y'][:]

with h5py.File(label_validation_file, 'r') as f:
  val_labels = f['y'][:]

with h5py.File(label_test_file, 'r') as f:
  test_labels = f['y'][:]

# Metadata
metadata_path = data_path + '/Metadata/Metadata/'

training_metadata = pd.read_csv(metadata_path + 'train_metadata.csv')
val_metadata = pd.read_csv(metadata_path + 'valid_metadata.csv')
test_metadata = pd.read_csv(metadata_path + 'test_metadata.csv')

reference_image = training_data[176298]

['Labels', 'Metadata', 'camelyonpatch_level_2_split_train_mask', 'pcam']


### Use Random Subset of Data

In [7]:
# Set a random seed for reproducibility
random_seed = 1

# Define the sampling fraction 
sampling_fraction = 0.01

# Randomly sample indices
num_samples = int(training_data.shape[0] * sampling_fraction)
random_indices = np.random.choice(training_data.shape[0], num_samples, replace=False)

# Sample the data and labels
training_data_sampled = training_data[random_indices]
training_labels_sampled = training_labels[random_indices]

# Print shapes to verify
print(f"Sampled Training Data Shape: {training_data_sampled.shape}")
print(f"Sampled Training Labels Shape: {training_labels_sampled.shape}")

Sampled Training Data Shape: (2621, 96, 96, 3)
Sampled Training Labels Shape: (2621, 1, 1, 1)


In [20]:
import numpy as np
import cv2
import torch
from torchvision import models, transforms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Preprocessing Functions
def normalize_pixel_values(image):
    """Normalize pixel values to [0, 1]."""
    return image / 255.0

def preprocess_image(image):
    """Apply preprocessing to a single image."""
    image = normalize_pixel_values(image)  # Normalize pixel values
    return image

# Preprocess Images
print("Preprocessing training data...")
training_data_preprocessed = np.array([preprocess_image(img) for img in training_data_sampled])

Preprocessing training data...


In [21]:
def extract_features(images):
    """Extract features from images using ResNet-50."""
    model = models.resnet50(pretrained=True)
    model.fc = torch.nn.Identity()  # Remove the classification head
    model.eval()

    features = []
    preprocess_pipeline = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    with torch.no_grad():
        for img in images:
            img = preprocess_pipeline(img).unsqueeze(0).float()  # Add batch dimension and convert to float32
            feature = model(img)
            features.append(feature.squeeze().numpy())  # Convert to numpy array
    return np.array(features)

print("Extracting features from training data...")
training_features = extract_features(training_data_preprocessed)

print("Preprocessing validation data...")
validation_data_preprocessed = np.array([preprocess_image(img) for img in val_data])

print("Extracting features from validation data...")
validation_features = extract_features(validation_data_preprocessed)


Extracting features from training data...




Preprocessing validation data...
Extracting features from validation data...


In [22]:
# Reshape features to 2D for compatibility with GradientBoostingClassifier
print("Reshaping features...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten labels to 1D
training_labels = training_labels.flatten()
validation_labels = val_labels.flatten()

# Train Gradient Boosting Machine
print("Training Gradient Boosting Machine...")
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

Reshaping features...
Training Gradient Boosting Machine...


In [23]:
# Ensure proper reshaping for training features and labels
print("Reshaping features and labels...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten the labels to 1D
training_labels_sampled = training_labels_sampled.flatten()
validation_labels = validation_labels.flatten()

gbm.fit(training_features, training_labels_sampled)

# Predict on Validation Data
print("Evaluating on validation data...")
validation_preds = gbm.predict(validation_features)
accuracy = accuracy_score(validation_labels, validation_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(validation_labels, validation_preds))


Reshaping features and labels...
Evaluating on validation data...
Validation Accuracy: 0.8143
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.82     16399
           1       0.82      0.80      0.81     16369

    accuracy                           0.81     32768
   macro avg       0.81      0.81      0.81     32768
weighted avg       0.81      0.81      0.81     32768



### Gradient Boosting Machine w/ Reinhard Normalization

In [8]:
import numpy as np
import torch
from torchvision import transforms
import torchstain
import matplotlib.pyplot as plt 

def apply_reinhard_normalization(image, reference_image):
    """
    Apply Reinhard normalization to a single image with error handling.

    Parameters:
        image (np.ndarray): The image to normalize, shape (H, W, C) in RGB format.
        reference_image (np.ndarray): The reference image for normalization, shape (H, W, C) in RGB format.

    Returns:
        np.ndarray: The normalized image, shape (H, W, C) in normalized format.
        None: If normalization fails for any reason.
    """
    try:
        # Initialize the ReinhardNormalizer
        normalizer = torchstain.normalizers.ReinhardNormalizer()

        # Fit the normalizer with the reference image
        normalizer.fit(reference_image)

        # Normalize the image
        normalized_image = normalizer.normalize(image)

        # Return the normalized image
        return normalized_image

    except Exception as e:
        print(f"Unexpected error during Reinhard normalization: {e}")

    # Return None if normalization fails
    return None

In [9]:
import numpy as np
import cv2
import torch
from torchvision import models, transforms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Preprocessing Functions
def normalize_pixel_values(image):
    """Normalize pixel values to [0, 1]."""
    return image / 255.0

def preprocess_image(image):
    """Apply preprocessing to a single image."""
    image = apply_reinhard_normalization(image, reference_image) # Stain normalization
    image = normalize_pixel_values(image)  # Normalize pixel values
    return image

# Preprocess Images
print("Preprocessing training data...")
training_data_preprocessed = np.array([preprocess_image(img) for img in training_data_sampled])


Preprocessing training data...


In [10]:
def extract_features(images):
    """Extract features from images using ResNet-50."""
    model = models.resnet50(pretrained=True)
    model.fc = torch.nn.Identity()  # Remove the classification head
    model.eval()

    features = []
    preprocess_pipeline = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    with torch.no_grad():
        for img in images:
            img = preprocess_pipeline(img).unsqueeze(0).float()  # Add batch dimension and convert to float32
            feature = model(img)
            features.append(feature.squeeze().numpy())  # Convert to numpy array
    return np.array(features)

print("Extracting features from training data...")
training_features = extract_features(training_data_preprocessed)

print("Preprocessing validation data...")
validation_data_preprocessed = np.array([preprocess_image(img) for img in val_data])

print("Extracting features from validation data...")
validation_features = extract_features(validation_data_preprocessed)


Extracting features from training data...
Preprocessing validation data...
Extracting features from validation data...


In [11]:
# Reshape features to 2D for compatibility with GradientBoostingClassifier
print("Reshaping features...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten labels to 1D
training_labels = training_labels.flatten()
validation_labels = val_labels.flatten()

# Train Gradient Boosting Machine
print("Training Gradient Boosting Machine...")
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

Reshaping features...
Training Gradient Boosting Machine...


In [12]:
# Ensure proper reshaping for training features and labels
print("Reshaping features and labels...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten the labels to 1D
training_labels_sampled = training_labels_sampled.flatten()
validation_labels = validation_labels.flatten()

gbm.fit(training_features, training_labels_sampled)

# Predict on Validation Data
print("Evaluating on validation data...")
validation_preds = gbm.predict(validation_features)
accuracy = accuracy_score(validation_labels, validation_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(validation_labels, validation_preds))


Reshaping features and labels...
Evaluating on validation data...
Validation Accuracy: 0.7938
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.80     16399
           1       0.81      0.77      0.79     16369

    accuracy                           0.79     32768
   macro avg       0.79      0.79      0.79     32768
weighted avg       0.79      0.79      0.79     32768



### Gradient Boosting Machine w/ Macenko Normalization

In [13]:
def apply_macenko_normalization(image, reference_image):
    """
    Apply Macenko normalization to a single image with error handling.
    
    Parameters:
        image (np.ndarray): The image to normalize, shape (H, W, C) in RGB format.
        reference_image (np.ndarray): The reference image for normalization, shape (H, W, C) in RGB format.
    
    Returns:
        np.ndarray: The normalized image, shape (C, H, W) in normalized format.
        None: If normalization fails for any reason.
    """
    try:
        # Set up the transformation
        T = transforms.Compose([
            transforms.ToTensor(),
            transforms.Lambda(lambda x: x * 255)
        ])

        # Initialize the MacenkoNormalizer
        normalizer = torchstain.normalizers.MacenkoNormalizer(backend='torch')

        # Fit the normalizer with the reference image
        normalizer.fit(T(reference_image))

        # Transform the image and apply normalization
        t_to_transform = T(image)
        norm_img, _, _ = normalizer.normalize(I=t_to_transform, stains=True)

        # Return the normalized image
        return norm_img.numpy()

    except torch.linalg.LinAlgError as e:
        print(f"LinAlgError during normalization: {e}")
    except Exception as e:
        print(f"Unexpected error during normalization: {e}")

    # Return None if normalization fails
    return None

In [14]:
# Preprocessing Functions
def normalize_pixel_values(image):
    """Normalize pixel values to [0, 1]."""
    return image / 255.0

def preprocess_image(image):
    """Apply preprocessing to a single image."""
    image = apply_macenko_normalization(image, reference_image) # Stain normalization
    if image is None:
        return None
    image = normalize_pixel_values(image)  # Normalize pixel values
    return image

# Preprocess Images
print("Preprocessing training data...")
training_data_preprocessed = []
for img in training_data_sampled:
    preprocessed_img = preprocess_image(img)
    if preprocessed_img is not None:
        training_data_preprocessed.append(preprocessed_img)

# Convert the list of preprocessed images to a NumPy array
training_data_preprocessed = np.array(training_data_preprocessed)

print(f"Number of successfully preprocessed images: {len(training_data_preprocessed)}")


Preprocessing training data...
Number of successfully preprocessed images: 2621


In [16]:
print("Extracting features from training data...")
training_features = extract_features(training_data_preprocessed)

print("Preprocessing validation data...")
# Preprocess each image and filter out None
validation_data_preprocessed = []
failed_indices = []  # To log indices of failed images
for idx, img in enumerate(val_data):
    preprocessed_img = preprocess_image(img)
    if preprocessed_img is not None:
        validation_data_preprocessed.append(preprocessed_img)
    else:
        failed_indices.append(idx)

# Convert the list to a NumPy array
validation_data_preprocessed = np.array(validation_data_preprocessed)

print(f"Number of successfully preprocessed validation images: {len(validation_data_preprocessed)}")
print(f"Number of failed validation images: {len(failed_indices)}")

print("Extracting features from validation data...")
validation_features = extract_features(validation_data_preprocessed)



Extracting features from training data...
Preprocessing validation data...
LinAlgError during normalization: linalg.eigh: The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: 2).
Unexpected error during normalization: kthvalue(): Expected reduction dim 0 to have non-zero size.
Unexpected error during normalization: kthvalue(): Expected reduction dim 0 to have non-zero size.
Unexpected error during normalization: kthvalue(): Expected reduction dim 0 to have non-zero size.
LinAlgError during normalization: linalg.eigh: The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: 2).
Unexpected error during normalization: kthvalue(): Expected reduction dim 0 to have non-zero size.
Unexpected error during normalization: kthvalue(): Expected reduction dim 0 to have non-zero size.
Unexpected error during normalization: kthvalue(): Expected reduction dim 

In [17]:
# Reshape features to 2D for compatibility with GradientBoostingClassifier
print("Reshaping features...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten labels to 1D
training_labels = training_labels.flatten()
validation_labels = val_labels.flatten()

# Train Gradient Boosting Machine
print("Training Gradient Boosting Machine...")
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Ensure proper reshaping for training features and labels
print("Reshaping features and labels...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten the labels to 1D
training_labels_sampled = training_labels_sampled.flatten()
validation_labels = validation_labels.flatten()

gbm.fit(training_features, training_labels_sampled)

# Predict on Validation Data
print("Evaluating on validation data...")
validation_preds = gbm.predict(validation_features)
accuracy = accuracy_score(validation_labels, validation_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(validation_labels, validation_preds))

Reshaping features...
Training Gradient Boosting Machine...
Reshaping features and labels...
Evaluating on validation data...


ValueError: Found input variables with inconsistent numbers of samples: [32768, 32715]

In [19]:
filtered_val_labels = np.delete(validation_labels, failed_indices)
accuracy = accuracy_score(filtered_val_labels, validation_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(filtered_val_labels, validation_preds))

Validation Accuracy: 0.7913
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.82      0.80     16346
           1       0.81      0.76      0.78     16369

    accuracy                           0.79     32715
   macro avg       0.79      0.79      0.79     32715
weighted avg       0.79      0.79      0.79     32715



### CLAHE + Reinhard

In [26]:
def apply_clahe(image):
    """Apply CLAHE (Adaptive Histogram Equalization) to enhance contrast."""
    # Convert to LAB color space
    lab_image = cv2.cvtColor(image, cv2.COLOR_RGB2LAB)
    l_channel, a, b = cv2.split(lab_image)

    # Apply CLAHE to the L channel
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l_channel = clahe.apply(l_channel)

    # Merge and convert back to RGB
    lab_image = cv2.merge((l_channel, a, b))
    return cv2.cvtColor(lab_image, cv2.COLOR_LAB2RGB)

In [27]:
import numpy as np
import cv2
import torch
from torchvision import models, transforms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Preprocessing Functions
def normalize_pixel_values(image):
    """Normalize pixel values to [0, 1]."""
    return image / 255.0

def preprocess_image(image):
    """Apply preprocessing to a single image."""
    image = apply_reinhard_normalization(image, reference_image) # Stain normalization
    image = apply_clahe(image)
    image = normalize_pixel_values(image)  # Normalize pixel values
    return image

# Preprocess Images
print("Preprocessing training data...")
training_data_preprocessed = np.array([preprocess_image(img) for img in training_data_sampled])

Preprocessing training data...


In [28]:
print("Extracting features from training data...")
training_features = extract_features(training_data_preprocessed)

print("Preprocessing validation data...")
validation_data_preprocessed = np.array([preprocess_image(img) for img in val_data])

print("Extracting features from validation data...")
validation_features = extract_features(validation_data_preprocessed)

# Reshape features to 2D for compatibility with GradientBoostingClassifier
print("Reshaping features...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten labels to 1D
training_labels = training_labels.flatten()
validation_labels = val_labels.flatten()

# Train Gradient Boosting Machine
print("Training Gradient Boosting Machine...")
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# Ensure proper reshaping for training features and labels
print("Reshaping features and labels...")
training_features = training_features.reshape(training_features.shape[0], -1)
validation_features = validation_features.reshape(validation_features.shape[0], -1)

# Flatten the labels to 1D
training_labels_sampled = training_labels_sampled.flatten()
validation_labels = validation_labels.flatten()

gbm.fit(training_features, training_labels_sampled)

# Predict on Validation Data
print("Evaluating on validation data...")
validation_preds = gbm.predict(validation_features)
accuracy = accuracy_score(validation_labels, validation_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(validation_labels, validation_preds))

Extracting features from training data...




Preprocessing validation data...
Extracting features from validation data...
Reshaping features...
Training Gradient Boosting Machine...
Reshaping features and labels...
Evaluating on validation data...
Validation Accuracy: 0.7993
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.81      0.80     16399
           1       0.80      0.79      0.80     16369

    accuracy                           0.80     32768
   macro avg       0.80      0.80      0.80     32768
weighted avg       0.80      0.80      0.80     32768



#### Tune Parameters

In [29]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.6, 0.8, 1.0],
}

gbm = GradientBoostingClassifier()
random_search = RandomizedSearchCV(estimator=gbm, param_distributions=param_distributions, n_iter=20, cv=3, scoring='accuracy', random_state=42)
random_search.fit(training_features, training_labels_sampled)

# Predict on Validation Data
print("Evaluating on validation data...")
validation_preds = random_search.predict(validation_features)
accuracy = accuracy_score(validation_labels, validation_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(validation_labels, validation_preds))

Evaluating on validation data...
Validation Accuracy: 0.8112
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.82      0.81     16399
           1       0.82      0.80      0.81     16369

    accuracy                           0.81     32768
   macro avg       0.81      0.81      0.81     32768
weighted avg       0.81      0.81      0.81     32768



In [31]:
print(random_search.best_params_)

{'subsample': 0.8, 'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 7, 'learning_rate': 0.1}
