# Preprocessing BUSI dataset

In [17]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import glob
from scipy import ndimage

class BUSIPreprocessor:
    """Preprocess the BUSI ultrasound dataset into fixed-size grayscale arrays.

    The pipeline applied to every non-mask image is: load as grayscale,
    black out a small triangle in the top-left corner, inpaint very bright
    pixels, apply CLAHE, then letterbox-resize to ``img_size``. Mask files
    (any file whose name contains ``_mask``) are only resized.
    """

    def __init__(self, data_dir, output_dir, img_size=(224, 224)):
        """
        Initialize the BUSI dataset preprocessor.

        Args:
            data_dir: Directory containing the BUSI dataset with benign, malignant, normal folders
            output_dir: Directory to save processed images
            img_size: Target (height, width) for the processed images
        """
        self.data_dir = data_dir
        self.output_dir = output_dir
        self.img_size = img_size

        # Filled in by create_dataset(): filenames aligned with X_train / X_val.
        self.train_filenames = None
        self.val_filenames = None

        # Create output directories if they don't exist
        for class_name in ['benign', 'malignant', 'normal']:
            os.makedirs(os.path.join(output_dir, "processed_images", class_name), exist_ok=True)

        # Define class mapping
        self.class_mapping = {
            'normal': 0,
            'benign': 1,
            'malignant': 2
        }

    def load_image(self, image_path):
        """Load an image from disk and return it as a single-channel array.

        Raises:
            ValueError: If OpenCV cannot read the file.
        """
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Could not load image: {image_path}")

        # Convert to grayscale if it's a color image
        if len(img.shape) == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        return img

    def enhance_contrast(self, img):
        """
        Enhance image contrast using CLAHE (clipLimit=2.0, 8x8 tiles).

        Non-uint8 input is scaled by 255 before conversion, i.e. it is
        assumed to hold values in [0, 1]; out-of-range values are clipped
        instead of wrapping around.
        """
        if img.dtype != np.uint8:
            img = np.clip(img * 255, 0, 255).astype(np.uint8)

        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(img)
        return enhanced

    # def denoise_image(self, img):
    #     """
    #     Apply denoising to reduce speckle noise common in ultrasound images.
    #     """
    #     # Apply Non-local Means Denoising
    #     denoised = cv2.fastNlMeansDenoising(img, None, 3, 7, 21)
    #     return denoised

    def normalize_image(self, img):
        """
        Min-max normalize an image to the [0, 1] range as float32.

        A constant image (max == min) has no range to stretch, so it is
        mapped to all zeros; this keeps the result inside [0, 1] and avoids
        uint8 wrap-around when callers later multiply by 255.
        """
        img = img.astype(np.float32)

        img_min = np.min(img)
        img_max = np.max(img)

        if img_max > img_min:
            return (img - img_min) / (img_max - img_min)

        return np.zeros_like(img)

    def soften_annotations(self, img):
        """
        Remove very bright overlays by inpainting.

        Pixels brighter than 220 are thresholded into a mask, the mask is
        dilated once with a 3x3 kernel so that it also covers the overlay's
        edges, and the masked area is filled in with Telea inpainting.
        """
        # Threshold for bright pixels
        _, mask = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)

        # Grow the mask slightly so the inpainting covers the whole stroke
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        mask = cv2.dilate(mask, kernel, iterations=1)

        # Inpaint
        inpainted = cv2.inpaint(img, mask, inpaintRadius=1, flags=cv2.INPAINT_TELEA)

        return inpainted

    def resize_with_padding(self, img, target_size, interpolation=None):
        """
        Resize an image to fit inside ``target_size`` and pad it with black.

        The aspect ratio is preserved and the resized image is centred on
        the canvas.

        Args:
            img: Input image array.
            target_size: (height, width) of the returned canvas.
            interpolation: OpenCV interpolation flag; defaults to
                cv2.INTER_LINEAR (the cv2.resize default) when None.

        Returns:
            Array of shape ``target_size`` (plus any channel axis kept by
            cv2.resize) with the same dtype as the resized image.
        """
        if interpolation is None:
            interpolation = cv2.INTER_LINEAR

        h, w = img.shape[:2]
        target_h, target_w = target_size

        # Compute scale and new size; keep at least one pixel per side so a
        # very elongated image cannot produce an empty resize target.
        scale = min(target_w / w, target_h / h)
        new_w = min(target_w, max(1, int(w * scale)))
        new_h = min(target_h, max(1, int(h * scale)))

        # Resize while preserving aspect ratio
        resized_img = cv2.resize(img, (new_w, new_h), interpolation=interpolation)

        # Create a black canvas (keeps a channel axis if the resize kept one)
        padded_img = np.zeros((target_h, target_w) + resized_img.shape[2:],
                              dtype=resized_img.dtype)

        # Compute padding offsets
        x_offset = (target_w - new_w) // 2
        y_offset = (target_h - new_h) // 2

        # Place the resized image on the canvas
        padded_img[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized_img

        return padded_img

    def add_corner_triangle_mask(self, img):
        """
        Return a copy of the image with a black triangle in the top-left corner.

        The triangle's legs are 1/16 of the image height and width.
        """
        height, width = img.shape[:2]

        # Work on a copy so the caller's image is left untouched
        result = img.copy()

        # Vertices: top-left corner, a point below it, a point to its right
        triangle_pts = np.array([[0, 0], [0, height//16], [width//16, 0]], np.int32)

        # Fill the triangle with black (0)
        cv2.fillPoly(result, [triangle_pts], 0)

        return result

    def process_image(self, image_path, class_name, save=True):
        """
        Process a single ultrasound image.

        Args:
            image_path: Path of the image to process.
            class_name: Class folder name used for the output location.
            save: When True, write the processed image as
                ``<name>_processed.png`` under the class output folder.

        Returns:
            Dict with 'filename', 'class', 'processed_image' (CLAHE-enhanced,
            resized) and 'normalized_image' (min-max stretched to 0-255 as
            uint8, resized).
        """
        # Extract the image filename
        filename = os.path.basename(image_path)
        name_without_ext = os.path.splitext(filename)[0]

        # Load the image
        original_img = self.load_image(image_path)

        img_left_corner_removed = self.add_corner_triangle_mask(original_img)

        img_reduced = self.soften_annotations(img_left_corner_removed)

        # Enhance contrast
        img_enhanced = self.enhance_contrast(img_reduced)

        # Normalize the image
        img_normalized = self.normalize_image(img_enhanced)

        # Resize the image to the target size
        img_resized = self.resize_with_padding(img_enhanced, self.img_size)

        # Save the processed image
        if save:
            output_path = os.path.join(self.output_dir, "processed_images", class_name,
                                     f"{name_without_ext}_processed.png")
            # cv2.imwrite reports failure through its return value
            if not cv2.imwrite(output_path, img_resized):
                print(f"Warning: could not write {output_path}")

        return {
            'filename': filename,
            'class': class_name,
            'processed_image': img_resized,
            'normalized_image': self.resize_with_padding((img_normalized * 255).astype(np.uint8), self.img_size)
        }

    def get_image_files(self, class_folder):
        """
        Collect the image files of a class folder, split into images and masks.

        Files whose name contains ``_mask`` (case-insensitive) are treated as
        masks. Both lists are sorted so that the dataset order, and therefore
        the seeded train/validation split, does not depend on the order in
        which the filesystem happens to list the files.

        Returns:
            Tuple ``(image_files, mask_files)`` of sorted path lists.
        """
        image_extensions = ['*.png', '*.jpg', '*.jpeg', '*.bmp', '*.tiff']
        image_files = set()
        mask_files = set()

        for ext in image_extensions:
            for f in glob.glob(os.path.join(class_folder, ext)):
                # Separate regular images and mask images
                if '_mask' in os.path.basename(f).lower():
                    mask_files.add(f)
                else:
                    image_files.add(f)

        return sorted(image_files), sorted(mask_files)

    def process_mask_image(self, mask_path, class_name, save=True):
        """
        Resize a mask image to the target size without altering its values.

        Nearest-neighbour interpolation is used so that no intermediate grey
        levels are introduced along the mask borders.

        Returns:
            Dict with 'filename', 'class' and 'mask_image'.
        """
        # Extract the mask filename
        filename = os.path.basename(mask_path)

        # Load the mask image
        mask_img = self.load_image(mask_path)

        # Only resize the mask to match the target size, no other processing
        mask_resized = self.resize_with_padding(
            mask_img, self.img_size, interpolation=cv2.INTER_NEAREST
        )

        # Save the mask image under its original filename
        if save:
            output_path = os.path.join(self.output_dir, "processed_images", class_name, filename)
            if not cv2.imwrite(output_path, mask_resized):
                print(f"Warning: could not write {output_path}")

        return {
            'filename': filename,
            'class': class_name,
            'mask_image': mask_resized
        }

    def process_all_images(self):
        """
        Process all images and their masks in the dataset.

        Class folders are visited in the order malignant, normal, benign.
        A file that fails to process is reported and skipped.

        Returns:
            List of the dicts returned by process_image (masks are processed
            and saved but not included).
        """
        results = []

        for class_name in ['malignant', 'normal', 'benign']:
            class_folder = os.path.join(self.data_dir, class_name)

            if not os.path.exists(class_folder):
                print(f"Warning: Folder {class_folder} does not exist. Skipping...")
                continue

            # Get both regular images and mask images
            image_files, mask_files = self.get_image_files(class_folder)
            print(f"Found {len(image_files)} images and {len(mask_files)} masks in {class_name} folder")

            # Process regular images
            for image_path in tqdm(image_files, desc=f"Processing {class_name} images"):
                try:
                    results.append(self.process_image(image_path, class_name))
                except Exception as e:
                    print(f"Error processing {image_path}: {e}")

            # Process mask images; they are only saved, not added to results
            for mask_path in tqdm(mask_files, desc=f"Processing {class_name} masks"):
                try:
                    self.process_mask_image(mask_path, class_name)
                except Exception as e:
                    print(f"Error processing mask {mask_path}: {e}")

        return results

    def create_dataset(self, use_normalized=True):
        """
        Create a stratified train/validation dataset for training a CNN.

        Args:
            use_normalized: Use the 'normalized_image' arrays when True,
                otherwise the 'processed_image' arrays.

        Returns:
            Tuple ``(X_train, X_val, y_train, y_val, filenames)``.
            ``filenames`` lists every processed file in dataset order (before
            the split). The filenames matching the rows of ``X_train`` and
            ``X_val`` are stored in ``self.train_filenames`` and
            ``self.val_filenames``.

        Raises:
            ValueError: If no image could be processed.
        """
        processed_data = self.process_all_images()
        if not processed_data:
            raise ValueError(f"No images could be processed from {self.data_dir}")

        # Create X (images), y (labels) and the matching filenames
        image_key = 'normalized_image' if use_normalized else 'processed_image'
        X = np.array([data[image_key] for data in processed_data])
        y = np.array([self.class_mapping[data['class']] for data in processed_data])
        filenames = [data['filename'] for data in processed_data]

        # Add channel dimension if needed (for CNN)
        if len(X.shape) == 3:
            X = np.expand_dims(X, axis=-1)

        print(f"Dataset shape: {X.shape}")
        print(f"Labels shape: {y.shape}")
        print(f"Class distribution: Normal: {np.sum(y == 0)}, Benign: {np.sum(y == 1)}, Malignant: {np.sum(y == 2)}")

        # Split into training and validation sets. The filenames go through
        # the same call so they are split with the same indices as X and y.
        (X_train, X_val, y_train, y_val,
         self.train_filenames, self.val_filenames) = train_test_split(
            X, y, filenames, test_size=0.2, random_state=42, stratify=y
        )

        return X_train, X_val, y_train, y_val, filenames


def data_augmentation(X_train, y_train, augmentation_factor=2,
                      random_state=None, augmentations=None):
    """
    Apply data augmentation to the training set.

    The returned set contains every original sample followed by augmented
    copies. Each sample with label > 0 gets ``augmentation_factor`` copies,
    each sample with label 0 gets exactly one. Every copy is produced by one
    randomly chosen technique.

    The 'noise' and 'brightness' techniques clip to 0-255 and return uint8,
    so the images are expected to be on a 0-255 intensity scale.

    Args:
        X_train: Training images (array or sequence of 2-D images, optionally
            with a trailing channel axis of size 1)
        y_train: Training labels
        augmentation_factor: Number of augmented copies per sample with label > 0
        random_state: Optional seed. When None the global ``np.random`` state
            is used; otherwise a private ``np.random.RandomState`` is seeded
            with it so the result is reproducible.
        augmentations: Optional iterable restricting the techniques to choose
            from. Allowed names: 'flip', 'rotate', 'zoom', 'noise',
            'brightness'. Defaults to all of them.

    Returns:
        Tuple ``(X_augmented, y_augmented)`` of numpy arrays.

    Raises:
        ValueError: If ``augmentations`` is empty or contains an unknown name.
    """
    available = ['flip', 'rotate', 'zoom', 'noise', 'brightness']
    if augmentations is None:
        choices = available
    else:
        choices = list(augmentations)
        unknown = [name for name in choices if name not in available]
        if unknown:
            raise ValueError(f"Unknown augmentation type(s): {unknown}")
        if not choices:
            raise ValueError("augmentations must contain at least one technique")

    # np.random (module) and RandomState expose the same sampling functions
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Start from the original data
    X_augmented = list(X_train)
    y_augmented = list(y_train)

    for img, label in zip(X_train, y_train):
        # Remove channel dimension for processing if present; the check is
        # made per image so plain lists of images work as well as arrays.
        has_channel_axis = len(img.shape) == 3 and img.shape[-1] == 1
        img_2d = img.squeeze(-1) if has_channel_axis else img

        # Augment abnormal cases (benign and malignant) more
        aug_factor = augmentation_factor if label > 0 else 1

        for _ in range(aug_factor):
            # Randomly choose augmentation techniques
            augmentation_type = rng.choice(choices)

            if augmentation_type == 'flip':
                # Horizontal flip
                augmented_img = cv2.flip(img_2d, 1)

            elif augmentation_type == 'rotate':
                # Random rotation between -15 and 15 degrees
                angle = rng.uniform(-15, 15)
                augmented_img = ndimage.rotate(img_2d, angle, reshape=False)

            elif augmentation_type == 'zoom':
                # Random zoom between 0.9 and 1.1
                zoom_factor = rng.uniform(0.9, 1.1)
                h, w = img_2d.shape[:2]

                # Calculate new dimensions
                new_h, new_w = int(h * zoom_factor), int(w * zoom_factor)
                augmented_img = cv2.resize(img_2d, (new_w, new_h))

                if zoom_factor < 1.0:  # Zoom out: pad back to original size
                    pad_h = (h - new_h) // 2
                    pad_w = (w - new_w) // 2
                    augmented_img = cv2.copyMakeBorder(
                        augmented_img, pad_h, h - new_h - pad_h, pad_w, w - new_w - pad_w,
                        cv2.BORDER_CONSTANT, value=0
                    )
                else:  # Zoom in: crop center to original size
                    start_h = (new_h - h) // 2
                    start_w = (new_w - w) // 2
                    augmented_img = augmented_img[start_h:start_h + h, start_w:start_w + w]

            elif augmentation_type == 'noise':
                # Add Gaussian noise
                mean = 0
                stddev = rng.uniform(5, 15)  # For uint8 images
                noise = rng.normal(mean, stddev, img_2d.shape).astype(np.float32)
                augmented_img = np.clip(img_2d.astype(np.float32) + noise, 0, 255).astype(np.uint8)

            else:  # 'brightness'
                # Random brightness adjustment
                brightness_factor = rng.uniform(0.8, 1.2)
                augmented_img = np.clip(img_2d.astype(np.float32) * brightness_factor, 0, 255).astype(np.uint8)

            # Add channel dimension back if it was removed above
            if has_channel_axis:
                augmented_img = np.expand_dims(augmented_img, axis=-1)

            X_augmented.append(augmented_img)
            y_augmented.append(label)

    return np.array(X_augmented), np.array(y_augmented)


# Dataset locations - adjust both to where the BUSI data lives on your machine.
# data_dir must hold the benign, malignant and normal folders.
data_dir = "C:/Users/DragosTrandafiri/BreastCancer_CNN/data/raw/busi"
output_dir = "C:/Users/DragosTrandafiri/BreastCancer_CNN/data/processed/busi"

# Preprocess everything and build the train/validation split
preprocessor = BUSIPreprocessor(data_dir, output_dir, img_size=(224, 224))
X_train, X_val, y_train, y_val, filenames = preprocessor.create_dataset()

# Enlarge the training set with augmented copies
X_train_aug, y_train_aug = data_augmentation(X_train, y_train, augmentation_factor=3)

# Report the resulting array shapes
print(f"Original training set: {X_train.shape}, {y_train.shape}")
print(f"Augmented training set: {X_train_aug.shape}, {y_train_aug.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")

# Report how many samples each class has after augmentation
class_names = ['Normal', 'Benign', 'Malignant']
unique, counts = np.unique(y_train_aug, return_counts=True)
for class_idx, count in zip(unique, counts):
    print(f"Augmented {class_names[class_idx]}: {count} samples")

Found 210 images and 211 masks in malignant folder


Processing malignant images: 100%|███████████████████████████████████████████████████| 210/210 [00:02<00:00, 94.16it/s]
Processing malignant masks: 100%|███████████████████████████████████████████████████| 211/211 [00:00<00:00, 627.21it/s]


Found 133 images and 133 masks in normal folder


Processing normal images: 100%|██████████████████████████████████████████████████████| 133/133 [00:01<00:00, 85.00it/s]
Processing normal masks: 100%|██████████████████████████████████████████████████████| 133/133 [00:00<00:00, 277.40it/s]


Found 437 images and 454 masks in benign folder


Processing benign images: 100%|██████████████████████████████████████████████████████| 437/437 [00:04<00:00, 91.62it/s]
Processing benign masks: 100%|██████████████████████████████████████████████████████| 454/454 [00:00<00:00, 610.56it/s]


Dataset shape: (780, 224, 224, 1)
Labels shape: (780,)
Class distribution: Normal: 133, Benign: 437, Malignant: 210
Original training set: (624, 224, 224, 1), (624,)
Augmented training set: (2284, 224, 224, 1), (2284,)
Validation set: (156, 224, 224, 1), (156,)
Augmented Normal: 212 samples
Augmented Benign: 1400 samples
Augmented Malignant: 672 samples
