# **Classification of Blood Clot Origins in Ischemic Strokes 🩸**

The purpose of the present project is to classify the etiology of blood clots in whole-slide digital pathology images, specifically identifying whether they are of Cardioembolic (CE) or Large Artery Atherosclerosis (LAA) origin. Previously, through an extensive exploratory data analysis (EDA), we described the dataset, analyzed missing and duplicate values, examined the distribution of image sizes, classified variables, and reviewed the label distribution for the training set, among other analyses. 

In the current notebook, we preprocess the images to standardize them for model input. The preprocessing involves resizing, converting images to RGB, normalizing pixel values, and discarding images that are mainly background. These steps ensure that the images are suitable for the models by preparing them with a consistent size, format, and reduced noise, enabling more efficient training and improved classification accuracy.

The image preprocessing detailed here serves primarily as a refined, second iteration, tailored specifically for the model in `model_notebooks/Modelo_EfficientNetwork.ipynb`. For a simpler, preliminary approach, refer to the initial EDA and preprocessing outlined in `model_notebooks/mayo.clinic.strip.ai.ipynb`.

**Authors:**
- [Daniel Valdez](https://github.com/Danval-003)
- [Emilio Solano](https://github.com/emiliosolanoo21)
- [Adrian Flores](https://github.com/adrianRFlores)
- [Andrea Ramírez](https://github.com/Andrea-gt)

***

## **(1) Import Libraries** ⬇️

In [None]:
# ===== Standard Libraries =====
import os  # OS utilities
import numpy as np  # Numerical computations
import pandas as pd  # Data manipulation

# ===== Image Processing =====
import cv2  # OpenCV for image processing
import openslide  # For handling digital pathology slide images

# ===== Machine Learning =====
from sklearn.model_selection import StratifiedKFold  # Stratified K-Folds cross-validator

# ===== Multiprocessing =====
from multiprocessing import Pool  # Parallel processing for improved performance

# ===== Data Augmentation =====
import albumentations as A  # Data augmentation library
from albumentations.pytorch import ToTensorV2  # Converts images to PyTorch tensors

## **(2) Second Iteration of Image Preprocessing 📷**

### **(1) Data Augmentation Techniques**

In [None]:
# Heavier augmentation pipeline, used for the minority class to add variability.
# Transforms run in the order listed; the last two fix the output size/format.
minority_augmentations = A.Compose(
    transforms=[
        # Geometric flips / quarter-turn rotations, each applied with 80% probability
        A.HorizontalFlip(p=0.8),
        A.VerticalFlip(p=0.8),
        A.RandomRotate90(p=0.8),
        # Random brightness, contrast, saturation and hue shifts (80% probability)
        A.ColorJitter(
            brightness=0.4,
            contrast=0.4,
            saturation=0.4,
            hue=0.2,
            p=0.8,
        ),
        # Perspective warp (50% probability)
        A.Perspective(p=0.5),
        # Additional brightness/contrast perturbation (60% probability)
        A.RandomBrightnessContrast(p=0.6),
        # Blank out between 1 and 10 rectangles of at most 20x20 pixels (50% probability)
        A.CoarseDropout(
            max_holes=10,
            max_height=20,
            max_width=20,
            min_holes=1,
            p=0.5,
        ),
        # Bring every patch to 256x256 pixels
        A.Resize(height=256, width=256),
        # Convert the image to a PyTorch tensor
        ToTensorV2(),
    ],
)

In [None]:
# Lighter augmentation pipeline, used for the majority class: some variability
# without excessive transformation. Transforms run in the order listed.
majority_augmentations = A.Compose(
    transforms=[
        # Geometric flips / quarter-turn rotations, each applied with 50% probability
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        # Random brightness, contrast, saturation and hue shifts (50% probability)
        A.ColorJitter(
            brightness=0.3,
            contrast=0.3,
            saturation=0.3,
            hue=0.2,
            p=0.5,
        ),
        # Bring every patch to 256x256 pixels
        A.Resize(height=256, width=256),
        # Convert the image to a PyTorch tensor
        ToTensorV2(),
    ],
)

### **(2) Handling Patches With Mostly Background**

In [None]:
# Decide whether a patch carries enough intensity variation to be worth keeping
def is_valid_patch(img, threshold=15):
    """Return whether the grayscale standard deviation of `img` reaches `threshold`.

    Args:
        img: Image array accepted by `cv2.cvtColor` with `cv2.COLOR_RGB2GRAY`.
        threshold: Minimum grayscale standard deviation for a valid patch.

    Returns:
        Truthy when the standard deviation is greater than or equal to
        `threshold`, falsy otherwise.
    """
    # Collapse the colour channels so variation is measured on intensity only
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    intensity_spread = np.std(gray)
    return intensity_spread >= threshold

In [None]:
# Flag patches that look like slide background (flat intensity or near-neutral colour)
def is_background_patch(img, std_threshold=15, mean_diff_threshold=10):
    """Return whether `img` is likely to be background.

    A patch is treated as background when either:
      * its grayscale standard deviation is below `std_threshold`, or
      * the largest gap between the mean values of its first three channels
        is below `mean_diff_threshold` (i.e. little colour variation).

    Args:
        img: Image array with the channels on the last axis, accepted by
            `cv2.cvtColor` with `cv2.COLOR_RGB2GRAY`.
        std_threshold: Grayscale standard deviation below which the patch is
            classified as background.
        mean_diff_threshold: Channel-mean gap below which the patch is
            classified as background.

    Returns:
        Truthy if the patch is classified as background, falsy otherwise.
    """
    gray_spread = np.std(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY))
    if gray_spread < std_threshold:
        return True

    # The largest pairwise gap between the three channel means equals
    # (largest mean - smallest mean).
    channel_means = [img[..., channel].mean() for channel in range(3)]
    return max(channel_means) - min(channel_means) < mean_diff_threshold

### **(3) Obtaining Image Patches**

In [None]:
# Enumerate candidate patch positions on a slide
def _iter_patch_origins(width, height, patch_size, step_size):
    """Yield the (x, y) top-left corner of every candidate patch, row by row."""
    for y in range(0, height - patch_size + 1, step_size):
        for x in range(0, width - patch_size + 1, step_size):
            yield x, y


# Process image patches and apply balanced augmentations
def preprocess_image(args):
    """Extract, filter and augment a fixed number of patches from one slide.

    The slide is scanned row by row with 512x512 patches at a 256-pixel stride,
    read at level 0. Patches flagged by `is_background_patch` are skipped;
    patches accepted by `is_valid_patch` are augmented and collected until
    `num_patches` are available.

    Args:
        args: Tuple `(image_path, label, idx, num_patches)`, packed into a
            single argument so the function can be used with `Pool.imap`.
            `label == 'LAA'` selects `minority_augmentations` and contributes
            two independently augmented copies per patch; any other label
            (including `None`) selects `majority_augmentations` and
            contributes one.

    Returns:
        `(patches, label)` where `patches` is the result of `np.stack` over
        exactly `num_patches` augmented patches (presumably shaped
        `(num_patches, 3, 256, 256)` given the Resize/ToTensorV2 steps --
        TODO confirm), or `(None, None)` when the slide did not yield enough
        patches.
    """
    image_path, label, idx, num_patches = args
    print(f"Processing image at index {idx}")

    patch_size = 512  # Side length of each extracted patch
    step_size = patch_size // 2  # Stride between patches (50% overlap)
    max_attempts = num_patches * 30  # Budget of consecutive rejected patches

    patches = []  # Augmented patches collected so far
    attempts = 0  # Consecutive non-background patches rejected by is_valid_patch

    slide = openslide.OpenSlide(image_path)  # Open the image slide
    try:
        width, height = slide.dimensions  # Get slide dimensions

        # A single flat loop, so that `break` really ends the scan instead of
        # only leaving the current row.
        for x, y in _iter_patch_origins(width, height, patch_size, step_size):
            if len(patches) >= num_patches:  # Stop if enough patches are collected
                break

            region = slide.read_region((x, y), 0, (patch_size, patch_size))
            img = np.array(region.convert("RGB"))  # Drop alpha, convert to numpy

            # NOTE: background patches are skipped before the attempt counter
            # is touched, so they never consume the attempt budget.
            if is_background_patch(img):
                continue

            if is_valid_patch(img):
                if label == 'LAA':  # Minority class
                    # Augment twice independently: appending the same augmented
                    # tensor twice would only add an exact duplicate.
                    patches.extend(
                        minority_augmentations(image=img)['image'] for _ in range(2)
                    )
                else:  # Majority class
                    patches.append(majority_augmentations(image=img)['image'])

                attempts = 0  # Reset attempts after successful patch extraction
            else:
                attempts += 1  # Increment attempts if patch is not valid

            if attempts >= max_attempts:  # Check if max attempts exceeded
                print(f"Maximum attempts exceeded for image {image_path}")
                break
    finally:
        # Release the slide handle even if reading or augmenting fails
        slide.close()

    if len(patches) < num_patches:  # Check if sufficient valid patches were found
        print(f"Not enough valid patches found for the image at {image_path}")
        return None, None  # Return None if not enough patches were found

    # Minority slides may overshoot by one patch; trim to the requested count
    patches_tensor = np.stack(patches[:num_patches], axis=0)
    return patches_tensor, label  # Return the tensor of patches and the label

### **(4) Saving Image Patches in Directory**

In [None]:
# Process and save dataset in chunks
def create_and_save_dataset_in_chunks(df, dataset_type, num_patches_per_image=15, chunk_size=5000):
    """Preprocess every image listed in `df` in parallel and save the results in chunks.

    Args:
        df: DataFrame describing the images. When `dataset_type == 'test'` the
            `image_id` column is read; otherwise the `image_path` and `label`
            columns are read.
        dataset_type: `'test'` for the unlabeled test set; any other value is
            treated as a labeled dataset and is embedded in the output file
            names by `save_chunk`.
        num_patches_per_image: Number of patches requested from each image.
        chunk_size: Number of processed images accumulated before a chunk is
            written to disk.

    Images for which `preprocess_image` returns no patches are skipped.
    """
    is_test = dataset_type == 'test'
    if is_test:
        images_dir = '/kaggle/input/mayo-clinic-strip-ai/test'
    else:
        images_dir = '/kaggle/input/mayo-clinic-strip-ai/train'

    def build_task(idx, row):
        # Test rows only carry an image id; labeled rows already carry a full path
        if is_test:
            path = os.path.join(images_dir, f"{row['image_id']}.tif")
            return (path, None, idx, num_patches_per_image)
        return (row['image_path'], row['label'], idx, num_patches_per_image)

    def flush(images, labels, index):
        # Write the accumulated samples with the saver matching the dataset type
        if is_test:
            save_test_chunk(np.array(images), index)
        else:
            save_chunk(np.array(images), np.array(labels), dataset_type, index)

    tasks = [build_task(idx, row) for idx, row in df.iterrows()]

    pending_images = []  # Patch stacks waiting to be written
    pending_labels = []  # Matching labels (stays empty for the test dataset)
    chunk_index = 0  # Index of the next chunk to write

    # Use multiprocessing to process images in parallel; imap keeps input order
    with Pool(processes=os.cpu_count()) as pool:
        for patches, label in pool.imap(preprocess_image, tasks):
            if patches is None:  # Image did not yield enough patches
                continue

            pending_images.append(patches)
            if not is_test:
                pending_labels.append(label)

            # Save the chunk once the limit is reached, then start a new one
            if len(pending_images) >= chunk_size:
                flush(pending_images, pending_labels, chunk_index)
                chunk_index += 1
                pending_images = []
                pending_labels = []

    # Save any remaining samples after exiting the loop
    if pending_images:
        flush(pending_images, pending_labels, chunk_index)

In [None]:
# Save training and validation chunks
def save_chunk(images, labels, dataset_type, chunk_index):
    """Write one chunk of images and labels to `.npy` files in the working directory.

    Args:
        images: Array of samples, saved as `X_{dataset_type}_chunk_{chunk_index}.npy`.
        labels: Array of labels, saved as `y_{dataset_type}_chunk_{chunk_index}.npy`.
        dataset_type: Dataset name embedded in both file names.
        chunk_index: Chunk number embedded in both file names.
    """
    suffix = f'{dataset_type}_chunk_{chunk_index}.npy'
    images_file = f'X_{suffix}'
    labels_file = f'y_{suffix}'

    # Images first, then labels
    for target, payload in ((images_file, images), (labels_file, labels)):
        np.save(target, payload)

    print(f'Saved {len(images)} samples in {images_file} and {labels_file}')

In [None]:
# Save test dataset chunks
def save_test_chunk(images, chunk_index):
    """Write one chunk of test images to `X_test_chunk_{chunk_index}.npy`.

    Args:
        images: Array of samples to save.
        chunk_index: Chunk number embedded in the file name.
    """
    target = f'X_test_chunk_{chunk_index}.npy'
    np.save(target, images)
    print(f'Saved {len(images)} samples in {target}')

### **(5) Implementing Functions and Generating Data**

In [None]:
# Locations of the training metadata and of the training slide images
train_csv_path = '/kaggle/input/mayo-clinic-strip-ai/train.csv'
train_images_dir = '/kaggle/input/mayo-clinic-strip-ai/train'

# Read the image metadata into a DataFrame
df_train = pd.read_csv(train_csv_path)

# Derive each slide's full path from its image ID (<train_images_dir>/<image_id>.tif)
df_train['image_path'] = df_train['image_id'].map(
    lambda image_id: os.path.join(train_images_dir, f"{image_id}.tif")
)

# Expose the target column under the name `label` for clarity
df_train = df_train.rename(columns={'target': 'label'})

In [None]:
# Stratified 5-fold split so every fold keeps the overall label distribution
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = kf.split(df_train, df_train['label'])

for fold, (train_idx, val_idx) in enumerate(fold_splits):
    print(f'Processing fold {fold + 1}')  # Progress message (1-based fold number)

    # Positional row selection for the current fold
    df_train_fold = df_train.iloc[train_idx]
    df_val_fold = df_train.iloc[val_idx]

    # Generate and save the training chunks first, then the validation chunks;
    # the saved file names use the 0-based fold number.
    for split_name, split_df in (('train', df_train_fold), ('val', df_val_fold)):
        create_and_save_dataset_in_chunks(
            split_df,
            dataset_type=f'{split_name}_fold{fold}',
            num_patches_per_image=15,
            chunk_size=5000,
        )

In [None]:
# Read the test metadata into a DataFrame
test_csv_path = '/kaggle/input/mayo-clinic-strip-ai/test.csv'
df_test = pd.read_csv(test_csv_path)

# Generate and save the patch chunks for the test dataset
create_and_save_dataset_in_chunks(
    df=df_test,
    dataset_type='test',
    num_patches_per_image=15,
    chunk_size=5000,
)