In [1]:
import cv2
import numpy as np
from scipy import ndimage

In [2]:
# Run the notebook to load all its classes and functions
%run busi_preprocessing_1.ipynb

# Data augmentation

## Augmentation Details

1. Per-class augmentation factor:

Normal (label=0): 5x augmentation

Malignant (label=2): 3x augmentation

Benign (label=1): 1x augmentation

2. Augmentation types:

flip: Horizontal flip

rotate: Random rotation between -15° and 15°

zoom: Random zoom in/out (0.9–1.1 scale), with padding or cropping

noise: Gaussian noise addition

brightness: Random brightness adjustment (0.9–1.1 factor)

3. Mask handling:

If masks are provided, corresponding augmentations are applied identically to the mask.

Masks maintain binary/segmentation structure (except for zoom, rotation, or flip).

4. Channel dimension handling:

Images and masks with a single channel (shape[-1]=1) are correctly expanded after augmentation.

-------------------------------------------------------------------------------------------------------------------------------------------------------

Returns X_augmented, y_augmented, mask_augmented , if masks are provided, else X_augmented, y_augmented

In [3]:
def data_augmentation(X, y, masks=None):
    """
    Apply data augmentation to the training set (supports optional masks).
    
    Args:
        X_train: Training images
        y_train: Training labels (0=Normal, 1=Benign, 2=Malignant)
        masks: Optional masks (None if classification only)
        
    Returns:
        Augmented images, labels, (and masks if provided)
    """
    X_augmented, y_augmented = [], []
    mask_augmented = [] if masks is not None else None

    for i in range(len(X)):
        img = X[i]
        label = y[i]
        mask = masks[i] if masks is not None else None

        # Define per-class augmentation strength
        if label == 0:   # Normal (minority)
            aug_factor = 5
        elif label == 2: # Malignant (mid-size)
            aug_factor = 3
        else:            # Benign (majority)
            aug_factor = 1

        # Always keep original
        X_augmented.append(img)
        y_augmented.append(label)
        if mask is not None:
            mask_augmented.append(mask)

        # Augmentations
        for _ in range(aug_factor):
            aug_type = np.random.choice(['flip', 'rotate', 'zoom', 'noise', 'brightness'])

            img_2d = img.squeeze(-1) if img.ndim == 3 and img.shape[-1] == 1 else img
            mask_2d = mask.squeeze(-1) if (mask is not None and mask.ndim == 3 and mask.shape[-1] == 1) else mask

            if aug_type == 'flip':
                aug_img = cv2.flip(img_2d, 1)
                aug_mask = cv2.flip(mask_2d, 1) if mask is not None else None

            elif aug_type == 'rotate':
                angle = np.random.uniform(-15, 15)
                aug_img = ndimage.rotate(img_2d, angle, reshape=False)
                aug_mask = ndimage.rotate(mask_2d, angle, reshape=False) if mask is not None else None

            elif aug_type == 'zoom':
                zoom_factor = np.random.uniform(0.9, 1.1)
                h, w = img_2d.shape
                new_h, new_w = int(h * zoom_factor), int(w * zoom_factor)
                aug_img = cv2.resize(img_2d, (new_w, new_h))
                aug_mask = cv2.resize(mask_2d, (new_w, new_h)) if mask is not None else None

                if zoom_factor < 1.0:  # pad
                    pad_h = (h - new_h) // 2
                    pad_w = (w - new_w) // 2
                    aug_img = cv2.copyMakeBorder(aug_img, pad_h, h - new_h - pad_h,
                                                 pad_w, w - new_w - pad_w,
                                                 cv2.BORDER_CONSTANT, value=0)
                    if mask is not None:
                        aug_mask = cv2.copyMakeBorder(aug_mask, pad_h, h - new_h - pad_h,
                                                      pad_w, w - new_w - pad_w,
                                                      cv2.BORDER_CONSTANT, value=0)
                else:  # crop
                    start_h = (new_h - h) // 2
                    start_w = (new_w - w) // 2
                    aug_img = aug_img[start_h:start_h + h, start_w:start_w + w]
                    if mask is not None:
                        aug_mask = aug_mask[start_h:start_h + h, start_w:start_w + w]

            elif aug_type == 'noise':
                noise = np.random.normal(0, np.random.uniform(3, 8), img_2d.shape)
                aug_img = np.clip(img_2d.astype(np.float32) + noise, 0, 255).astype(np.uint8)
                aug_mask = mask_2d

            elif aug_type == 'brightness':
                factor = np.random.uniform(0.9, 1.1)
                aug_img = np.clip(img_2d.astype(np.float32) * factor, 0, 255).astype(np.uint8)
                aug_mask = mask_2d

            # Restore channel dim
            if img.ndim == 3 and img.shape[-1] == 1:
                aug_img = np.expand_dims(aug_img, axis=-1)
                if aug_mask is not None:
                    aug_mask = np.expand_dims(aug_mask, axis=-1)

            X_augmented.append(aug_img)
            y_augmented.append(label)
            if mask is not None:
                mask_augmented.append(aug_mask)

    if masks is not None:
        return np.array(X_augmented), np.array(y_augmented), np.array(mask_augmented)
    else:
        return np.array(X_augmented), np.array(y_augmented)


# Apply data augmentation & creation of the dataset

In [4]:
data_dir = "C:/Users/DragosTrandafiri/BreastCancer_CNN/data/raw/busi"   # benign, malignant, normal folders
output_dir = "C:/Users/DragosTrandafiri/BreastCancer_CNN/data/processed/busi"

# Create preprocessor
preprocessor = BUSIPreprocessor(data_dir, output_dir, img_size=(224, 224))

#for class_name in ['malignant', 'normal', 'benign']:
# Create dataset
X, y, masks = preprocessor.create_dataset()

from sklearn.model_selection import train_test_split

#First split -> train + temp (val + test)
X_train, X_temp, y_train, y_temp, masks_train, masks_temp = train_test_split(
    X, y, masks,
    test_size=0.3,  # 30% goes to val + test
    stratify=y,
    random_state=42
)

# Split temp -> val + test
X_val, X_test, y_val, y_test, masks_val, masks_test = train_test_split(
    X_temp, y_temp, masks_temp,
    test_size=0.5,  # Half of 30% → 15% val, 15% test
    stratify=y_temp,
    random_state=42
)

# Apply data augmentation with masks
X_aug, y_aug, masks_aug = data_augmentation(X_train, y_train, masks_train)

print(f"Training set sizes: X_train: {X_train.shape}, y_train: {y_train.shape}, masks: {masks.shape}")
print(f"Augmented training set sizes: X_aug: {X_aug.shape}, y_aug: {y_aug.shape}, masks: {masks_aug.shape}")
print(f"Validation set sizes: X_val: {X_val.shape}, y_val: {y_val.shape}, masks: {masks_val.shape}")
print(f"Test set sizes: X_test: {X_test.shape}, y_test: {y_test.shape}, masks: {masks_test.shape}")

# Print class distribution
unique, counts = np.unique(y_aug, return_counts=True)
class_names = ['Normal', 'Benign', 'Malignant']
for i, (class_idx, count) in enumerate(zip(unique, counts)):
    print(f"Augmented {class_names[class_idx]}: {count} samples")

Found 210 images and 211 masks in malignant folder


Processing malignant images: 100%|███████████████████████████████████████████████████| 210/210 [00:02<00:00, 87.28it/s]
Processing malignant masks: 100%|███████████████████████████████████████████████████| 211/211 [00:00<00:00, 505.44it/s]


Found 133 images and 133 masks in normal folder


Processing normal images: 100%|██████████████████████████████████████████████████████| 133/133 [00:01<00:00, 86.41it/s]
Processing normal masks: 100%|██████████████████████████████████████████████████████| 133/133 [00:00<00:00, 244.44it/s]


Found 437 images and 454 masks in benign folder


Processing benign images: 100%|██████████████████████████████████████████████████████| 437/437 [00:04<00:00, 91.38it/s]
Processing benign masks: 100%|██████████████████████████████████████████████████████| 454/454 [00:00<00:00, 467.53it/s]


Combined 2 masks for malignant (53).png
Combined 2 masks for benign (100).png
Combined 2 masks for benign (163).png
Combined 2 masks for benign (173).png
Combined 2 masks for benign (181).png
Combined 3 masks for benign (195).png
Combined 2 masks for benign (25).png
Combined 2 masks for benign (315).png
Combined 2 masks for benign (346).png
Combined 2 masks for benign (4).png
Combined 2 masks for benign (424).png
Combined 2 masks for benign (54).png
Combined 2 masks for benign (58).png
Combined 2 masks for benign (83).png
Combined 2 masks for benign (92).png
Combined 2 masks for benign (93).png
Combined 2 masks for benign (98).png
Dataset shape: (780, 224, 224, 1)
Masks shape: (780, 224, 224, 1)
Labels shape: (780,)
Class distribution: Normal: 133, Benign: 437, Malignant: 210
Training set sizes: (546, 224, 224, 1), (546,), masks: (780, 224, 224, 1)
Augmented training set sizes: (1758, 224, 224, 1), (1758,), masks: (1758, 224, 224, 1)
Validation set sizes: (117, 224, 224, 1), (117,), ma

In [5]:
# Define output directory for augmented images
augmented_dir_X = "C:/Users/DragosTrandafiri/BreastCancer_CNN/data/processed/busi/processed_images_augmented/X"
augmented_dir_Mask = "C:/Users/DragosTrandafiri/BreastCancer_CNN/data/processed/busi/processed_images_augmented/Mask"

# Class names mapping
class_names = ['Normal', 'Benign', 'Malignant']

# Create directories if they don’t exist
for class_name in class_names:
    os.makedirs(os.path.join(augmented_dir_X, class_name), exist_ok=True)
    os.makedirs(os.path.join(augmented_dir_Mask, class_name), exist_ok=True)

# Save images
for i, (img_X, img_Mask, label) in enumerate(zip(X_aug, masks_aug, y_aug)):
    # Convert from float [0,1] to uint8 [0,255] if needed
    if img_X.max() <= 1.0:
        img_to_save_X = (img_X * 255).astype(np.uint8)
    else:
        img_to_save_X = img_X.astype(np.uint8)

    if img_Mask.max() <= 1.0:
        img_to_save_mask = (img_Mask * 255).astype(np.uint8)
    else:
        img_to_save_mask = img_Mask.astype(np.uint8)

    # Ensure channel order is correct (OpenCV uses BGR)
    if img_to_save_X.shape[-1] == 3:  
        img_to_save_X = cv2.cvtColor(img_to_save_X, cv2.COLOR_RGB2BGR)

    if img_to_save_mask.shape[-1] == 3:  
        img_to_save_mask = cv2.cvtColor(img_to_save_mask, cv2.COLOR_RGB2BGR)
    

    # Build filename
    filename = f"aug_{i:05d}.png"
    filepath_x = os.path.join(augmented_dir_X, class_names[label], filename)

    # Save
    cv2.imwrite(filepath_x, img_to_save_X)


    filename = f"aug_mask_{i:05d}.png"
    filepath_mask = os.path.join(augmented_dir_Mask, class_names[label], filename)

    # Save
    cv2.imwrite(filepath_mask, img_to_save_mask)

print(f"Augmented images saved to: {augmented_dir_X}")

Augmented images saved to: C:/Users/DragosTrandafiri/BreastCancer_CNN/data/processed/busi/processed_images_augmented/X
