In [16]:
import matplotlib.pyplot as plt
import glob
from PIL import Image, ImageDraw, ImageFont
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit

import albumentations as A
from glob import glob # Used to easily find file paths
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [6]:

df = pd.read_csv(
    'data2.txt',
    sep='\s+',
    names=['REFNUM','BG','CLASS','SEVERITY','X','Y','RADIUS'],
    na_values=['']
)
# Add the binary cancer label (1 if CLASS≠NORM, else 0)
df['CANCER'] = (df['CLASS'] != 'NORM').astype(int)
df['filepath'] = df['REFNUM'].apply(
    lambda id: f"all-mias/{id}.pgm"
)
df.head(5)

Unnamed: 0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,CANCER,filepath
0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,1,all-mias/REFNUM.pgm
1,mdb001,G,CIRC,B,535,425,197,1,all-mias/mdb001.pgm
2,mdb002,G,CIRC,B,522,280,69,1,all-mias/mdb002.pgm
3,mdb003,D,NORM,,,,,0,all-mias/mdb003.pgm
4,mdb004,D,NORM,,,,,0,all-mias/mdb004.pgm


In [14]:
def build_fullsize_samples(img_dir, info_df):
    """
    Loads each .pgm as a full-size (e.g. 1024×1024) crop:
      - If ROI exists: crop exactly the ROI square (2*radius)
      - Else: use the entire image
    Then convert to 3-channel BGR and pair with label.
    """
    samples = []
    for filename in os.listdir(img_dir):
        if not filename.lower().endswith('.pgm'):
            continue

        # Lookup metadata
        refnum = os.path.splitext(filename)[0]
        row    = info_df[info_df['REFNUM'] == refnum]
        if row.empty:
            continue
        label = int(row['CANCER'].iloc[0])

        # Read gray image
        img = cv2.imread(os.path.join(img_dir, filename), cv2.IMREAD_GRAYSCALE)
        h, w = img.shape

        x, y, r = row['X'].iloc[0], row['Y'].iloc[0], row['RADIUS'].iloc[0]
        if pd.notna(x) and pd.notna(y) and pd.notna(r):
            # ROI crop
            cx, cy, radius = int(x), h - int(y), int(r)
            x0, x1 = max(cx-radius,0), min(cx+radius,w)
            y0, y1 = max(cy-radius,0), min(cy+radius,h)
            crop = img[y0:y1, x0:x1]
        else:
            # Full image
            crop = img

        # *** NO RESIZE STEP HERE ***

        # Convert to 3-channel BGR
        img_input = cv2.cvtColor(crop, cv2.COLOR_GRAY2BGR)

        samples.append((img_input, label))

    return samples


In [18]:
def build_samples(img_dir, info_df,
                  output_size=(224, 224),
                  fallback_size=512):
    """
    For each .pgm in img_dir, look up X/Y/RADIUS in info_df.
    - If X/Y/RADIUS are valid numbers: crop the square ROI around (X,Y) with side=2*RADIUS.
    - If any are NaN: crop a centered square fallback of side=fallback_size.
    Then resize the crop to output_size, convert to RGB, and pair with cancer label.
    Returns: list of (image_array, label) tuples.
    """
    samples = []

    # Loop through every file in the directory
    for filename in os.listdir(img_dir):
        if not filename.lower().endswith('.pgm'):
            continue

        # 1) Load metadata row for this image
        refnum = os.path.splitext(filename)[0]      # e.g. 'mdb001'
        row   = info_df[info_df['REFNUM'] == refnum]
        if row.empty:
            continue
        label = int(row['CANCER'].iloc[0])         # 0 or 1

        # 2) Read the grayscale image
        path = os.path.join(img_dir, filename)
        img  = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        h, w = img.shape                           # should be 1024×1024

        # 3) Extract ROI if available
        x, y, r = row['X'].iloc[0], row['Y'].iloc[0], row['RADIUS'].iloc[0]

        if pd.notna(x) and pd.notna(y) and pd.notna(r):
            # --- VALID ROI PATH ---
            # Convert (x, y) from bottom-left origin to NumPy row/col:
            cx     = int(x)
            cy     = h - int(y)
            radius = int(r)

            # Define square bounds around the circle
            x0 = max(cx - radius, 0)
            x1 = min(cx + radius, w)
            y0 = max(cy - radius, 0)
            y1 = min(cy + radius, h)

            crop = img[y0:y1, x0:x1]

        else:
            # --- MISSING ROI PATH ---
            # Center of image
            cx, cy = w // 2, h // 2
            half   = fallback_size // 2

            x0 = max(cx - half, 0)
            x1 = min(cx + half, w)
            y0 = max(cy - half, 0)
            y1 = min(cy + half, h)

            crop = img[y0:y1, x0:x1]

        # 4) Resize everything to CNN input size
        resized = cv2.resize(crop, output_size, interpolation=cv2.INTER_AREA)

        # 5) Convert to 3-channel (if using a pre-trained RGB model)
        img_input = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)

        # 6) Store image + label
        samples.append((img_input, label))

    return samples


In [20]:
# Suppose you built two parallel lists:
samples = build_samples(img_dir, info_df)
images, labels = zip(*samples)
# Construct an array of patient IDs: [1,1,2,2,3,3,...]
patient_ids = [i//2 for i in range(len(images))]

splitter = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, val_idx = next(splitter.split(images, labels, groups=patient_ids))

# Index into your lists
X_train = [images[i] for i in train_idx]
y_train = [labels[i] for i in train_idx]
X_val   = [images[i] for i in val_idx]
y_val   = [labels[i] for i in val_idx]


NameError: name 'img_dir' is not defined