In [1]:
import matplotlib.pyplot as plt
import glob
from PIL import Image, ImageDraw, ImageFont
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.model_selection import train_test_split
import albumentations as A
from glob import glob # Used to easily find file paths
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [6]:

df = pd.read_csv(
    'data2.txt',
    sep='\s+',
    names=['REFNUM','BG','CLASS','SEVERITY','X','Y','RADIUS'],
    na_values=['']
)
# Add the binary cancer label (1 if CLASS≠NORM, else 0)
df['CANCER'] = (df['CLASS'] != 'NORM').astype(int)
df['filepath'] = df['REFNUM'].apply(
    lambda id: f"all-mias/{id}.pgm"
)
df.head(5)

Unnamed: 0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,CANCER,filepath
0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,1,all-mias/REFNUM.pgm
1,mdb001,G,CIRC,B,535,425,197,1,all-mias/mdb001.pgm
2,mdb002,G,CIRC,B,522,280,69,1,all-mias/mdb002.pgm
3,mdb003,D,NORM,,,,,0,all-mias/mdb003.pgm
4,mdb004,D,NORM,,,,,0,all-mias/mdb004.pgm


In [8]:
def data_labeling(img_files_path, info_df, 
                  output_size=(224, 224),
                  default_patch_size=512):
    """
    Processes each .pgm in img_files_path:
      - If X/Y/RADIUS are valid → crops a square ROI around (X, Y) with side = 2*RADIUS
      - If missing coordinates        → crops a centered square patch of size default_patch_size
      - Resizes the crop to output_size and pairs it with its binary label.
    Returns:
      list of (image_array, label) tuples.
    """
    samples = []
    for filename in os.listdir(img_files_path):
        if not filename.lower().endswith(".pgm"):
            continue

        # 1) Find the metadata row for this image
        refnum = os.path.splitext(filename)[0]
        record = info_df[info_df['REFNUM'] == refnum]
        if record.empty:
            continue

        # 2) Load grayscale image
        path = os.path.join(img_files_path, filename)
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        h, w = img.shape

        # 3) Decide ROI vs fallback patch
        x, y, r = record['X'].iloc[0], record['Y'].iloc[0], record['RADIUS'].iloc[0]
        if pd.notna(x) and pd.notna(y) and pd.notna(r):
            # Convert (x, y) from bottom-left origin to NumPy row/col:
            cx = int(x)
            cy = int(h - y)
            radius = int(r)

            # Define square bounds, but clip to image edges
            x0 = max(cx - radius, 0)
            x1 = min(cx + radius, w)
            y0 = max(cy - radius, 0)
            y1 = min(cy + radius, h)
            crop = img[y0:y1, x0:x1]

        else:
            # Missing ROI → extract centered patch
            ps = default_patch_size // 2
            cx, cy = w // 2, h // 2
            x0 = max(cx - ps, 0)
            x1 = min(cx + ps, w)
            y0 = max(cy - ps, 0)
            y1 = min(cy + ps, h)
            crop = img[y0:y1, x0:x1]

        # 4) Resize to network input size
        resized = cv2.resize(crop, output_size, interpolation=cv2.INTER_AREA)

        # 5) Convert to 3-channel if your CNN expects color input (duplicate gray)
        img_input = cv2.cvtColor(resized, cv2.COLOR_GRAY2RGB)

        # 6) Get binary label (0/1)
        label = int(record['CANCER'].iloc[0])

        samples.append((img_input, label))

    return samples
