In [None]:
import os
import random
import cv2
import matplotlib.pyplot as plt

In [None]:
def load_dataset(base_path):
    """
    Collect image paths and their YOLO annotation files for both classes.

    Every image found directly inside '<base_path>/benign_yolo' and
    '<base_path>/malignant_yolo' is paired with the '.txt' file that shares
    its stem. A trailing '_denoised' on the image stem is dropped before the
    annotation is looked up, so 'foo_denoised.png' pairs with 'foo.txt'.
    Class folders that do not exist and images without an annotation file
    are skipped.

    Args:
        base_path (str): Path to the parent folder containing 'benign_yolo' and 'malignant_yolo'.

    Returns:
        list of tuples: Each tuple contains (img_path, label_path, class_label),
        ordered by class folder and then by file name.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    denoised_suffix = '_denoised'

    data = []
    for class_folder in ('benign_yolo', 'malignant_yolo'):
        dir_path = os.path.join(base_path, class_folder)
        if not os.path.isdir(dir_path):
            continue

        class_label = class_folder.split('_')[0]  # 'benign' or 'malignant'

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # result (and any seeded sampling done on it) reproducible.
        for fname in sorted(os.listdir(dir_path)):
            if not fname.lower().endswith(image_exts):
                continue

            img_path = os.path.join(dir_path, fname)
            stem = os.path.splitext(img_path)[0]
            if stem.endswith(denoised_suffix):
                stem = stem[:-len(denoised_suffix)]

            txt_path = stem + '.txt'
            if os.path.isfile(txt_path):
                data.append((img_path, txt_path, class_label))
    return data

def read_yolo_annotation(txt_path, img_shape):
    """
    Read a YOLO annotation file and convert every box to pixel coordinates.

    A valid line has five whitespace-separated numeric fields:
    '<class> <x_center> <y_center> <width> <height>'. The x/width values are
    multiplied by the image width and the y/height values by the image
    height. Blank lines, lines with a different number of fields, and lines
    whose fields cannot be converted to finite numbers are skipped.

    Args:
        txt_path (str): Path to the YOLO .txt file.
        img_shape (tuple): Shape of the image; only the first two entries
            (height, width) are used.

    Returns:
        list of tuples: One (x, y, w, h) tuple of ints per valid line, where
        (x, y) is the top-left corner of the box in pixels. The list is empty
        when the file has no valid line. Values are not clipped to the image
        bounds, so x or y can be negative.
    """
    h_img, w_img = img_shape[:2]
    boxes = []

    with open(txt_path, "r") as f:
        for line in f:
            parts = line.split()  # a blank line yields [] and is rejected below
            if len(parts) != 5:
                continue

            try:
                _, x_c, y_c, w, h = map(float, parts)
                x_c *= w_img
                y_c *= h_img
                w *= w_img
                h *= h_img
                box = (int(x_c - w / 2), int(y_c - h / 2), int(w), int(h))
            except (ValueError, OverflowError):
                # Non-numeric token, NaN or infinity: treat like any other
                # malformed line instead of aborting the whole file.
                continue

            boxes.append(box)

    return boxes

def visualize_samples(data, n=5, seed=42):
    """
    Show a random selection of images with their annotated boxes drawn on.

    Each box is outlined in green and tagged with the class label just above
    its top-left corner. Images that cannot be read are skipped.

    Args:
        data (list): List of (img_path, txt_path, class_label) tuples.
        n (int): Number of samples to display (capped at len(data)).
        seed (int): Seed passed to random.seed() before sampling.
    """
    green = (0, 255, 0)
    thickness = 2

    random.seed(seed)
    how_many = min(n, len(data))

    for img_path, txt_path, label in random.sample(data, how_many):
        image = cv2.imread(img_path)
        if image is not None:
            boxes = read_yolo_annotation(txt_path, image.shape)
            for left, top, box_w, box_h in boxes:
                top_left = (left, top)
                bottom_right = (left + box_w, top + box_h)
                cv2.rectangle(image, top_left, bottom_right, green, thickness)

                # Keep the label inside the image when the box touches the top edge.
                text_origin = (left, max(0, top - 10))
                cv2.putText(image, label, text_origin,
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, green, thickness)

            rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            plt.figure(figsize=(6, 6))
            plt.imshow(rgb)
            plt.axis("off")
            plt.show()

In [None]:
# Root folder of the dataset; load_dataset() looks for the 'benign_yolo' and
# 'malignant_yolo' sub-folders inside it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
base_path = r'D:\datasets_in_D\Data_Science_for_Digital_Health\QAMEBI_CLAHE'
    
# Pair every image with its YOLO annotation file and report how many pairs were found.
data = load_dataset(base_path)
print(f"Loaded {len(data)} annotated images.")
    
# Visualize a few samples with their bounding boxes drawn (adjust n as needed)
visualize_samples(data, n=5)

In [None]:
def crop_and_save_images(data, output_path):
    """
    Crop every annotated bounding box out of its image and save it as a PNG.

    Crops are written to '<output_path>/<label>/<image stem>_crop<idx>.png',
    where idx is the position of the box in the annotation file. Each box is
    clipped to the image bounds before slicing; boxes that lie entirely
    outside the image are skipped, as are images that cannot be read or have
    no valid box.

    Args:
        data (list): List of (img_path, txt_path, class_label) tuples.
        output_path (str): Root folder for the crops; one sub-folder per
            class label is created on demand.

    Returns:
        list of tuples: (crop_path, txt_path, class_label) for every crop
        that was written successfully. txt_path is the annotation file of
        the source image, not of the crop.
    """
    cropped = []
    for img_path, txt_path, label in data:
        img = cv2.imread(img_path)
        if img is None:
            continue

        boxes = read_yolo_annotation(txt_path, img.shape)
        if not boxes:
            continue

        img_h, img_w = img.shape[:2]
        out_dir = os.path.join(output_path, label)
        os.makedirs(out_dir, exist_ok=True)
        base = os.path.splitext(os.path.basename(img_path))[0]

        for idx, (x, y, w, h) in enumerate(boxes):
            # Clip to the image: a negative slice start would be interpreted
            # as an offset from the end of the axis and yield a wrong or
            # empty crop for boxes touching the top/left edge.
            x0, y0 = max(x, 0), max(y, 0)
            x1, y1 = min(x + w, img_w), min(y + h, img_h)
            if x1 <= x0 or y1 <= y0:
                continue

            crop = img[y0:y1, x0:x1]
            crop_path = os.path.join(out_dir, f"{base}_crop{idx}.png")

            # cv2.imwrite reports failure through its return value; only
            # list crops that actually exist on disk.
            if not cv2.imwrite(crop_path, crop):
                continue
            cropped.append((crop_path, txt_path, label))

    return cropped

def visualize_crops(cropped, n=5, seed=42):
    """
    Randomly sample and display cropped images to verify them.

    The annotation file carried in each tuple belongs to the source image:
    its coordinates are normalised to the full image, not to the crop.
    Drawing them on the crop would produce a rectangle unrelated to the
    lesion, so the crop is shown as-is with the class label as title.

    Args:
        cropped (list): List of (crop_path, txt_path, class_label) tuples.
        n (int): Number of crops to display (capped at len(cropped)).
        seed (int): Seed passed to random.seed() before sampling.
    """
    random.seed(seed)
    samples = random.sample(cropped, min(n, len(cropped)))
    for img_path, _txt_path, label in samples:
        img = cv2.imread(img_path)
        if img is None:
            continue

        plt.figure(figsize=(6, 6))
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(label)
        plt.axis('off')
        plt.show()

In [None]:
# Destination root for the crops; crop_and_save_images() creates one
# sub-folder per class label underneath it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
output_path = r'D:\datasets_in_D\Data_Science_for_Digital_Health\QAMEBI_CLAHE\cropped_imgs'
       
# 1) crop every annotated box out of its image and save it as a PNG
cropped = crop_and_save_images(data, output_path)
print(f"Cropped & saved {len(cropped)} images to '{output_path}'.")

# 2) visualize random crops (same seed -> same selection on every run)
visualize_crops(cropped, n=5, seed=42)

In [None]:
def find_max_min_dims(cropped):
    """
    Compute the smallest and largest width and height among all crops.

    Width and height are tracked independently, so the returned minimum
    width and minimum height may come from different images (likewise for
    the maxima). Crops that cannot be read are ignored.

    Args:
        cropped (list): List of (crop_path, txt_path, class_label) tuples;
            only crop_path is used.

    Returns:
        tuple: (min_w, min_h, max_w, max_h) in pixels, as ints.

    Raises:
        ValueError: If no crop could be read, since no dimension exists to
            report in that case.
    """
    widths, heights = [], []
    for crop_path, _, _ in cropped:
        img = cv2.imread(crop_path)
        if img is None:
            continue
        h, w = img.shape[:2]
        widths.append(int(w))
        heights.append(int(h))

    if not widths:
        raise ValueError("No readable crop images; cannot compute crop dimensions.")

    return min(widths), min(heights), max(widths), max(heights)

In [None]:
# Report the extreme crop sizes. Width and height extremes are computed
# independently by find_max_min_dims(), so they need not belong to the same crop.
min_w, min_h, max_w, max_h = find_max_min_dims(cropped)
print(f"Minimum crop width: {min_w} pixels")
print(f"Minimum crop height: {min_h} pixels")
print(f"Maximum crop width: {max_w} pixels")
print(f"Maximum crop height: {max_h} pixels")

In [None]:
def pad_to_square(img):
    """
    Pad an image with zero-valued borders so that height equals width.

    The shorter side is padded on both ends; when the size difference is
    odd, the extra pixel goes to the bottom (or right) border.

    Args:
        img: Image array whose first two shape entries are (height, width).

    Returns:
        The padded image as returned by cv2.copyMakeBorder.
    """
    height, width = img.shape[:2]
    gap = abs(height - width)
    before = gap // 2
    after = gap - before

    if height < width:
        # wider than tall: grow vertically
        top, bottom, left, right = before, after, 0, 0
    else:
        # taller than wide (or already square): grow horizontally
        top, bottom, left, right = 0, 0, before, after

    return cv2.copyMakeBorder(img, top, bottom, left, right,
                              borderType=cv2.BORDER_CONSTANT,
                              value=[0, 0, 0])

def preprocess_crop(img, target_size=224):
    """
    Pad a crop to a square and resize it to target_size × target_size.

    The interpolation is chosen by resize direction: cv2.INTER_AREA when the
    padded image is shrunk (or kept at the same size), cv2.INTER_CUBIC when
    it is enlarged. INTER_AREA is intended for decimation; used for
    enlargement it degrades to nearest-neighbour-like output.

    Args:
        img: Image array whose first two shape entries are (height, width).
        target_size (int): Side length of the square output in pixels.

    Returns:
        The resized square image as returned by cv2.resize.
    """
    square = pad_to_square(img)
    side = max(square.shape[:2])
    interpolation = cv2.INTER_AREA if side >= target_size else cv2.INTER_CUBIC
    return cv2.resize(square, (target_size, target_size), interpolation=interpolation)

def visualize_actual_size_gray(samples, title):
    """
    Show each crop next to its preprocessed version, both in grayscale.

    The figure is sized from the pixel dimensions at 100 dpi, so one image
    pixel maps to roughly one display pixel. Entries whose image cannot be
    read are skipped.

    Args:
        samples (list): Dicts with at least a 'path' key pointing to a crop image.
        title (str): Prefix of each figure's super-title; the crop's file
            name is appended to it.
    """
    for info in samples:
        # load the original crop (3-channel BGR as returned by cv2.imread)
        orig_bgr = cv2.imread(info['path'])
        if orig_bgr is None:
            # cv2.imread returns None for unreadable files; skip instead of
            # failing inside cvtColor.
            continue

        # convert both original & processed to single-channel gray
        orig_gray = cv2.cvtColor(orig_bgr, cv2.COLOR_BGR2GRAY)
        proc_gray = cv2.cvtColor(preprocess_crop(orig_bgr), cv2.COLOR_BGR2GRAY)

        oh, ow = orig_gray.shape
        ph, pw = proc_gray.shape

        # figsize is in inches; with dpi=100, pixels / 100 gives ~1:1 display
        fig = plt.figure(
            figsize=((ow + pw) / 100, max(oh, ph) / 100),
            dpi=100
        )
        gs = fig.add_gridspec(1, 2, width_ratios=[ow, pw])
        panels = (
            (fig.add_subplot(gs[0, 0]), orig_gray, ow, oh),
            (fig.add_subplot(gs[0, 1]), proc_gray, pw, ph),
        )
        for ax, image, width, height in panels:
            ax.imshow(image, cmap='gray')
            ax.set_title(f"{width}×{height}")
            ax.axis('off')
            ax.set_aspect('equal')

        plt.suptitle(f"{title}: {os.path.basename(info['path'])}")
        plt.tight_layout()
        plt.show()

In [None]:
# 2) gather the dimensions of every readable crop
crops_info = []
for path, _, label in cropped:
    img = cv2.imread(path)
    if img is None:
        continue
    h, w = img.shape[:2]
    crops_info.append({'path': path, 'label': label, 'h': h, 'w': w, 'max_dim': max(h, w)})

# sort by max dimension (ascending); None when there is no readable crop
sorted_crops = sorted(crops_info, key=lambda c: c['max_dim'])
smallest = sorted_crops[0] if sorted_crops else None
largest = sorted_crops[-1] if sorted_crops else None

# seed for reproducibility
random.seed(42)

# crops that get upsampled (max_dim < 224): always show the smallest one
# (up_list[0], because the list is sorted) plus up to four random others.
# Slicing keeps this valid when the list is empty or has a single entry.
up_list = [c for c in sorted_crops if c['max_dim'] < 224]
up_samples = up_list[:1] + random.sample(up_list[1:], min(4, max(len(up_list) - 1, 0)))

# crops that get downscaled (max_dim > 224): always show the largest one
# (down_list[-1]) plus up to four random others.
down_list = [c for c in sorted_crops if c['max_dim'] > 224]
down_samples = down_list[-1:] + random.sample(down_list[:-1], min(4, max(len(down_list) - 1, 0)))

# 3) visualize; save_gray_path is only defined here and consumed by the saving cell
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
save_gray_path = r"D:\datasets_in_D\Data_Science_for_Digital_Health\QAMEBI_CLAHE\processed"
visualize_actual_size_gray(up_samples, 'Upsampled (<224)')
visualize_actual_size_gray(down_samples, 'Downscaled (>224)')

In [None]:
def save_preprocessed_grayscale_images(cropped_info, save_dir, target_size=224):
    """
    Preprocess (pad, resize, convert to grayscale) and save images
    in save_dir/<label>/ as '<label> (#).png'.

    Files are numbered per label starting at 1, in the order the entries
    appear in cropped_info. Any label is accepted; its counter and folder
    are created on first use. Entries whose image cannot be read are
    skipped, and a number is only consumed when the file was written.

    Args:
        cropped_info (list): List of dicts with 'path' and 'label'.
        save_dir (str): Base directory to save images.
        target_size (int): Output size for square image.
    """
    label_counters = {}

    for info in cropped_info:
        img = cv2.imread(info['path'])
        if img is None:
            continue

        # Pad to square and resize, then collapse to a single gray channel
        processed = preprocess_crop(img, target_size=target_size)
        gray = cv2.cvtColor(processed, cv2.COLOR_BGR2GRAY)

        label = info['label']
        label_folder = os.path.join(save_dir, label)
        os.makedirs(label_folder, exist_ok=True)

        # Generate filename like 'benign (1).png'
        count = label_counters.get(label, 0) + 1
        full_path = os.path.join(label_folder, f"{label} ({count}).png")

        # cv2.imwrite signals failure via its return value; keep the
        # numbering gap-free by advancing only after a successful write.
        if cv2.imwrite(full_path, gray):
            label_counters[label] = count

In [None]:
# Write a 224×224 grayscale PNG for every entry of crops_info into
# save_gray_path/<label>/ (both names are defined in the sample-selection cell).
save_preprocessed_grayscale_images(crops_info, save_gray_path, target_size=224)
print("Preprocessed grayscale images saved.")