# Preprocessing BUSI dataset

# Important libraries

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob
from scipy import ndimage

# Preprocessing class with all needed functions

## Core Methods
1. load_image(image_path)

Loads an image using OpenCV. Converts to grayscale if needed.

2. add_corner_triangle_mask(img)

Adds a black triangular mask in the top-left corner to remove ultrasound annotations.

3. remove_annotations(img)

Removes bright annotations (e.g., text/drawings) using inpainting.

4. enhance_contrast(img)

Applies CLAHE (adaptive histogram equalization) to improve image contrast.

5. resize_with_padding(img, target_size)

Resizes an image while preserving aspect ratio. Pads with black pixels to match target_size.



-------------------------------------------------------------------------------------------------------------------------------------------------------

## Image Processing Pipeline
6. process_image(image_path, class_name, save=True)

Processes a single image using the full pipeline:

Loads the image

Masks the top-left corner and removes annotations

Enhances contrast

Resizes to uniform dimensions

Optionally saves the processed image

7. process_mask_image(mask_path, class_name, save=True)

Only resizes the corresponding segmentation mask. No filtering applied.

-------------------------------------------------------------------------------------------------------------------------------------------------------

## Dataset Utilities
8. get_image_files(class_folder)

Retrieves all .png images from a class folder and separates image files from mask files.

9. process_all_images(label)

Applies the processing pipeline to all images and masks in one class folder (`benign`, `malignant` or `normal`) of the BUSI dataset.

10. create_dataset(label, combine_masks=True)

Creates the final dataset for CNN training from one class folder (call it once per class):

X: Processed images

y: Corresponding labels (normal=0, benign=1, malignant=2)

masks: Optional segmentation masks

Supports combining multiple masks per image using np.maximum.reduce

Returns:

X: np.ndarray of shape (N, 224, 224, 1)

y: np.ndarray of shape (N,)

masks: np.ndarray of shape (N, 224, 224, 1)


-------------------------------------------------------------------------------------------------------------------------------------------------------

## Notes

- The order of preprocessing operations is carefully chosen for robustness.

- Blank masks are automatically generated if none are found.

- Prints dataset statistics after creation.

In [4]:
class BUSIPreprocessor:
    """Preprocessing pipeline for the BUSI breast-ultrasound dataset.

    Raw images are read from ``<data_dir>/<class>/`` (``benign``, ``malignant``,
    ``normal``). Each image is cleaned (corner mask, annotation inpainting),
    contrast-enhanced and resized; each mask is only resized. Results are
    written to ``<output_dir>/processed_images/<class>/`` and can be assembled
    into NumPy arrays with :meth:`create_dataset`.
    """

    def __init__(self, data_dir, output_dir, img_size=(224, 224)):
        """
        Initialize the BUSI dataset preprocessor.

        Creates ``<output_dir>/processed_images/<class>/`` for every class if
        the folders do not exist yet.

        Args:
            data_dir: Directory containing the BUSI dataset with benign, malignant, normal folders
            output_dir: Directory to save processed images
            img_size: Target size for the processed images, as (height, width)
        """
        self.data_dir = data_dir
        self.output_dir = output_dir
        self.img_size = img_size

        # Create output directories if they don't exist
        for class_name in ['benign', 'malignant', 'normal']:
            os.makedirs(os.path.join(output_dir, "processed_images", class_name), exist_ok=True)

        # Integer label assigned to each class folder
        self.class_mapping = {
            'normal': 0,
            'benign': 1,
            'malignant': 2
        }

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    def load_image(self, image_path):
        """Load an image from disk and return it as a grayscale array.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The image as a 2-D array.

        Raises:
            ValueError: If OpenCV could not read the file.
        """
        img = cv2.imread(image_path)
        # cv2.imread returns None instead of raising on a missing/unreadable file
        if img is None:
            raise ValueError(f"Could not load image: {image_path}")

        # Convert to grayscale if it's a color image
        if len(img.shape) == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        return img

    # ------------------------------------------------------------------
    # Annotation removal (top-left corner marker, letters, texts, drawings)
    # ------------------------------------------------------------------
    def add_corner_triangle_mask(self, img):
        """Black out a small triangle in the top-left corner of the image.

        The triangle's legs run along the top and left edges and are 1/16 of
        the image width and height respectively. The input is not modified.

        Args:
            img: Image array.

        Returns:
            A copy of ``img`` with the corner triangle set to 0.
        """
        height, width = img.shape[:2]

        # Work on a copy so the caller's array stays untouched
        result = img.copy()

        # Vertices as (x, y): top-left corner, a point down the left edge,
        # and a point along the top edge
        triangle_pts = np.array([[0, 0], [0, height // 16], [width // 16, 0]], np.int32)

        # Fill the triangle with black (0)
        cv2.fillPoly(result, [triangle_pts], 0)

        return result

    def remove_annotations(self, img):
        """Inpaint bright overlay annotations (text, crosses, drawings).

        Every pixel brighter than 220 is treated as annotation, so genuinely
        bright tissue above that level is inpainted as well.

        Args:
            img: 8-bit grayscale image.

        Returns:
            The inpainted image.
        """
        # Threshold for bright pixels
        _, mask = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)

        # Grow the mask by one 3x3 dilation so the inpainting also covers the
        # pixels directly bordering the bright strokes
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        mask = cv2.dilate(mask, kernel, iterations=1)

        # Fill the masked pixels from their surroundings
        inpainted = cv2.inpaint(img, mask, inpaintRadius=1, flags=cv2.INPAINT_TELEA)

        return inpainted

    # ------------------------------------------------------------------
    # Filters applied to the BUSI images
    # ------------------------------------------------------------------
    def enhance_contrast(self, img):
        """Enhance image contrast with CLAHE (clip limit 2.0, 8x8 tiles).

        Args:
            img: Grayscale image. Non-uint8 input is scaled by 255 and cast to
                uint8 (assumes float intensities in [0, 1] - TODO confirm
                against callers).

        Returns:
            The contrast-enhanced uint8 image.
        """
        if img.dtype != np.uint8:
            # Clip before the cast so slightly out-of-range values saturate
            # instead of wrapping around
            img = np.clip(img * 255, 0, 255).astype(np.uint8)

        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(img)
        return enhanced

    # ------------------------------------------------------------------
    # Resizing to a standard form, to help avoid CNN bias
    # ------------------------------------------------------------------
    def resize_with_padding(self, img, target_size, interpolation=None):
        """Resize ``img`` to fit ``target_size`` keeping its aspect ratio.

        The image is scaled so that it fits inside the target, then centred
        on a black canvas of exactly ``target_size``.

        Args:
            img: Image array of shape (H, W) or (H, W, C).
            target_size: Output size as (height, width).
            interpolation: Optional OpenCV interpolation flag passed to
                ``cv2.resize``. ``None`` keeps OpenCV's default
                (``cv2.INTER_LINEAR``).

        Returns:
            Array with spatial size ``target_size`` and the dtype of the
            resized image.

        Raises:
            ValueError: If the input image has a zero-sized side.
        """
        h, w = img.shape[:2]
        target_h, target_w = target_size

        if h == 0 or w == 0:
            raise ValueError("Cannot resize an empty image")

        # Integer arithmetic: the limiting side hits the target exactly (no
        # float rounding loss) and the other side is floored, but never below
        # one pixel so cv2.resize always gets a valid size.
        if target_w * h <= target_h * w:
            # Width is the limiting side
            new_w = target_w
            new_h = max(1, (h * target_w) // w)
        else:
            # Height is the limiting side
            new_h = target_h
            new_w = max(1, (w * target_h) // h)

        if interpolation is None:
            interpolation = cv2.INTER_LINEAR

        # Resize while preserving aspect ratio
        resized_img = cv2.resize(img, (new_w, new_h), interpolation=interpolation)

        # Black canvas; keeps a trailing channel axis if the resized image has one
        canvas_shape = (target_h, target_w) + tuple(resized_img.shape[2:])
        padded_img = np.zeros(canvas_shape, dtype=resized_img.dtype)

        # Compute padding offsets that centre the image
        x_offset = (target_w - new_w) // 2
        y_offset = (target_h - new_h) // 2

        # Place the resized image on the canvas
        padded_img[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized_img

        return padded_img

    # ------------------------------------------------------------------
    # Saving
    # ------------------------------------------------------------------
    def _write_image(self, output_path, img):
        """Write ``img`` to ``output_path`` and warn if OpenCV reports failure."""
        # cv2.imwrite reports a failed write by returning False, not by raising
        if not cv2.imwrite(output_path, img):
            print(f"Warning: Could not write image: {output_path}")

    # ------------------------------------------------------------------
    # Full pipelines (the order of the steps is important)
    # ------------------------------------------------------------------
    def process_image(self, image_path, class_name, save=True):
        """Process a single ultrasound image.

        Steps, in order: load as grayscale, black out the top-left corner,
        inpaint bright annotations, enhance contrast, resize with padding to
        ``self.img_size``.

        Args:
            image_path: Path of the raw image.
            class_name: Class folder name used for the output path and stored
                in the result.
            save: When true, write the result to
                ``<output_dir>/processed_images/<class_name>/<name>_processed.png``.

        Returns:
            Dict with keys ``'filename'`` (original file name), ``'class'`` and
            ``'processed_image'``.

        Raises:
            ValueError: If the image cannot be loaded.
        """
        # Extract the image filename
        filename = os.path.basename(image_path)
        name_without_ext = os.path.splitext(filename)[0]

        # Load the image
        original_img = self.load_image(image_path)

        # Top left corner annotation removal & annotations removal (letters, texts, drawings)
        img_left_corner_removed = self.add_corner_triangle_mask(original_img)
        img_reduced = self.remove_annotations(img_left_corner_removed)

        # Filters
        img_enhanced = self.enhance_contrast(img_reduced)

        # Resize the image to the target size -> uniform input for the CNN
        img_resized = self.resize_with_padding(img_enhanced, self.img_size)

        # Save the processed image
        if save:
            output_path = os.path.join(self.output_dir, "processed_images", class_name,
                                       f"{name_without_ext}_processed.png")
            self._write_image(output_path, img_resized)

        return {
            'filename': filename,
            'class': class_name,
            'processed_image': img_resized
        }

    # For masks, only resizing is needed
    def process_mask_image(self, mask_path, class_name, save=True):
        """Resize a mask image to ``self.img_size`` without any filtering.

        The mask is resized with the default (bilinear) interpolation, so
        pixels along mask borders may take intermediate values.

        Args:
            mask_path: Path of the raw mask.
            class_name: Class folder name used for the output path and stored
                in the result.
            save: When true, write the result to
                ``<output_dir>/processed_images/<class_name>/<original file name>``.

        Returns:
            Dict with keys ``'filename'``, ``'class'`` and ``'mask_image'``.

        Raises:
            ValueError: If the mask cannot be loaded.
        """
        # Extract the mask filename
        filename = os.path.basename(mask_path)

        # Load the mask image
        mask_img = self.load_image(mask_path)

        # Only resize the mask to match the target size, no other processing
        mask_resized = self.resize_with_padding(mask_img, self.img_size)

        # Save the mask image under its original name
        if save:
            output_path = os.path.join(self.output_dir, "processed_images", class_name, filename)
            self._write_image(output_path, mask_resized)

        return {
            'filename': filename,
            'class': class_name,
            'mask_image': mask_resized
        }

    # ------------------------------------------------------------------
    # Dataset utilities
    # ------------------------------------------------------------------
    def get_image_files(self, class_folder):
        """List the ``.png`` files of a class folder, split into images and masks.

        A file counts as a mask when its name contains ``_mask``
        (case-insensitive). Both lists are sorted so that repeated runs yield
        the same order regardless of the file system.

        Args:
            class_folder: Folder to scan (not recursive).

        Returns:
            Tuple ``(image_files, mask_files)`` of path lists.
        """
        # Escape the folder so characters such as '[' are not read as glob syntax
        pattern = os.path.join(glob.escape(class_folder), '*.png')
        files = sorted(glob.glob(pattern))

        image_files = []
        mask_files = []

        # Separate regular images and mask images
        for f in files:
            if '_mask' in os.path.basename(f).lower():
                mask_files.append(f)
            else:
                image_files.append(f)

        return image_files, mask_files

    def process_all_images(self, label):
        """Process every image and mask in one class folder.

        Files that fail to process are reported and skipped; the remaining
        files are still processed.

        Args:
            label: Class folder name under ``self.data_dir``.

        Returns:
            Tuple ``(image_results, mask_results)`` holding the dicts returned
            by :meth:`process_image` and :meth:`process_mask_image`. Both lists
            are empty when the folder does not exist.
        """
        image_results = []
        mask_results = []

        class_folder = os.path.join(self.data_dir, label)

        if not os.path.exists(class_folder):
            print(f"Warning: Folder {class_folder} does not exist. Skipping...")
            return image_results, mask_results

        # Get both regular images and mask images
        image_files, mask_files = self.get_image_files(class_folder)
        print(f"Found {len(image_files)} images and {len(mask_files)} masks in {label} folder")

        # Process regular images; one bad file must not abort the whole run
        for image_path in tqdm(image_files, desc=f"Processing {label} images"):
            try:
                image_results.append(self.process_image(image_path, label))
            except Exception as e:
                print(f"Error processing {image_path}: {e}")

        # Process mask images (resize only)
        for mask_path in tqdm(mask_files, desc=f"Processing {label} masks"):
            try:
                mask_results.append(self.process_mask_image(mask_path, label))
            except Exception as e:
                print(f"Error processing mask {mask_path}: {e}")

        return image_results, mask_results

    def create_dataset(self, label, combine_masks=True):
        """Create the training arrays for one class folder.

        Processes the folder with :meth:`process_all_images`, then pairs each
        image with its mask(s). Masks are matched by file name: everything
        before ``_mask`` in the mask's name must equal the image's name without
        extension. Images without a mask get an all-zero mask.

        Args:
            label: Class folder name under ``self.data_dir``.
            combine_masks: When an image has several masks, merge them with a
                pixel-wise maximum (True) or keep only the first one (False).

        Returns:
            Tuple ``(X, y, masks)``: ``X`` and ``masks`` get a trailing channel
            axis when the stacked arrays are 3-D, giving shape (N, H, W, 1);
            ``y`` has shape (N,). For an empty folder ``X`` and ``masks`` have
            shape (0, H, W, 1) with ``(H, W) = self.img_size``.
        """
        image_data, mask_data = self.process_all_images(label)

        # Group masks by the image they belong to - supports multiple masks per image
        masks_by_image = {}
        for mask in mask_data:
            # Drop the extension, then "_mask" and any suffix after it (_1, _2, ...)
            mask_stem = os.path.splitext(mask['filename'])[0]
            image_stem = mask_stem.split('_mask')[0]
            masks_by_image.setdefault(image_stem, []).append(mask['mask_image'])

        images = []
        labels = []
        mask_list = []

        for data in image_data:
            img_filename = data['filename']
            img_base_name = os.path.splitext(img_filename)[0]

            images.append(data['processed_image'])
            labels.append(self.class_mapping[data['class']])

            # Get corresponding masks
            img_masks = masks_by_image.get(img_base_name)
            if not img_masks:
                # If no mask found, create a blank mask
                mask_list.append(np.zeros(self.img_size, dtype=np.uint8))
                print(f"Warning: No mask found for {img_filename}, using blank mask")
            elif len(img_masks) == 1:
                mask_list.append(img_masks[0])
            elif combine_masks:
                # Combine multiple masks by taking the maximum value at each pixel
                mask_list.append(np.maximum.reduce(img_masks))
                print(f"Combined {len(img_masks)} masks for {img_filename}")
            else:
                # Just use the first mask if not combining
                mask_list.append(img_masks[0])
                print(f"Using first mask out of {len(img_masks)} for {img_filename}")

        # Convert to numpy arrays
        if images:
            X = np.array(images)
            masks = np.array(mask_list)
        else:
            # Keep the spatial layout for an empty class so downstream
            # concatenation across classes still works
            empty_shape = (0,) + tuple(self.img_size)
            X = np.empty(empty_shape, dtype=np.uint8)
            masks = np.empty(empty_shape, dtype=np.uint8)
        y = np.array(labels, dtype=int)

        # Add channel dimension if needed (for CNN)
        if len(X.shape) == 3:
            X = np.expand_dims(X, axis=-1)
        if len(masks.shape) == 3:
            masks = np.expand_dims(masks, axis=-1)

        print(f"Dataset shape: {X.shape}")
        print(f"Masks shape: {masks.shape}")
        print(f"Labels shape: {y.shape}")
        print(f"Class distribution: Normal: {np.sum(y == 0)}, Benign: {np.sum(y == 1)}, Malignant: {np.sum(y == 2)}")

        return X, y, masks
