In [1]:
import matplotlib.pyplot as plt
import glob
from PIL import Image, ImageDraw, ImageFont
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit

import albumentations as A
from glob import glob # Used to easily find file paths
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
#the vsualization of the table for the dataset adding the cancer clumm and the image ref id 
df = pd.read_csv(
    'data2.txt',
    sep='\s+',
    names=['REFNUM','BG','CLASS','SEVERITY','X','Y','RADIUS'],
    na_values=['']
)
# Add the binary cancer label (1 if CLASS≠NORM, else 0)
df['CANCER'] = (df['CLASS'] != 'NORM').astype(int)
df['filepath'] = df['REFNUM'].apply(
    lambda id: f"all-mias/{id}.pgm"
)
df.head(5)

Unnamed: 0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,CANCER,filepath
0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,1,all-mias/REFNUM.pgm
1,mdb001,G,CIRC,B,535,425,197,1,all-mias/mdb001.pgm
2,mdb002,G,CIRC,B,522,280,69,1,all-mias/mdb002.pgm
3,mdb003,D,NORM,,,,,0,all-mias/mdb003.pgm
4,mdb004,D,NORM,,,,,0,all-mias/mdb004.pgm


In [3]:
# Full size mofel base on the ROI working on the full size image 
def build_fullsize_samples(img_dir, info_df):
    """
    Loads each .pgm as a full-size (e.g. 1024×1024) crop:
      - If ROI exists: crop exactly the ROI square (2*radius)
      - Else: use the entire image
    Then convert to 3-channel BGR and pair with label.
    """
    samples = []
    for filename in os.listdir(img_dir):
        if not filename.lower().endswith('.pgm'):
            continue

        # Lookup metadata
        refnum = os.path.splitext(filename)[0]
        row    = info_df[info_df['REFNUM'] == refnum]
        if row.empty:
            continue
        label = int(row['CANCER'].iloc[0])

        # Read gray image
        img = cv2.imread(os.path.join(img_dir, filename), cv2.IMREAD_GRAYSCALE)
        h, w = img.shape

        x, y, r = row['X'].iloc[0], row['Y'].iloc[0], row['RADIUS'].iloc[0]
        if pd.notna(x) and pd.notna(y) and pd.notna(r):
            # ROI crop
            cx, cy, radius = int(x), h - int(y), int(r)
            x0, x1 = max(cx-radius,0), min(cx+radius,w)
            y0, y1 = max(cy-radius,0), min(cy+radius,h)
            crop = img[y0:y1, x0:x1]
        else:
            # Full image
            crop = img

        # *** NO RESIZE STEP HERE ***

        # Convert to 3-channel BGR
        img_input = cv2.cvtColor(crop, cv2.COLOR_GRAY2BGR)

        samples.append((img_input, label, refnum))

    return samples


In [37]:
# Reduse in size so the model are more effecient and faster to run 
def build_samples(img_dir, info_df,
                  output_size=(224, 224),
                  fallback_size=512):
    """
    For each .pgm in img_dir, look up X/Y/RADIUS in info_df.
    - If X/Y/RADIUS are valid numbers: crop the square ROI around (X,Y) with side=2*RADIUS.
    - If any are NaN: crop a centered square fallback of side=fallback_size.
    Then resize the crop to output_size, convert to RGB, and pair with cancer label.
    Returns: list of (image_array, label) tuples.
    """
    samples = []

    # Loop through every file in the directory
    for filename in os.listdir(img_dir):
        if not filename.lower().endswith('.pgm'):
            continue

        # 1) Load metadata row for this image
        refnum = os.path.splitext(filename)[0]      # e.g. 'mdb001'
        row   = info_df[info_df['REFNUM'] == refnum]
        if row.empty:
            continue
        label = int(row['CANCER'].iloc[0])         # 0 or 1

        # 2) Read the grayscale image
        path = os.path.join(img_dir, filename)
        img  = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        h, w = img.shape                           # should be 1024×1024

        # 3) Extract ROI if available
        x, y, r = row['X'].iloc[0], row['Y'].iloc[0], row['RADIUS'].iloc[0]

        if pd.notna(x) and pd.notna(y) and pd.notna(r):
            # --- VALID ROI PATH ---
            # Convert (x, y) from bottom-left origin to NumPy row/col:
            cx     = int(x)
            cy     = h - int(y)
            radius = int(r)

            # Define square bounds around the circle
            x0 = max(cx - radius, 0)
            x1 = min(cx + radius, w)
            y0 = max(cy - radius, 0)
            y1 = min(cy + radius, h)

            crop = img[y0:y1, x0:x1]

        else:
            # --- MISSING ROI PATH ---
            # Center of image
            cx, cy = w // 2, h // 2
            half   = fallback_size // 2

            x0 = max(cx - half, 0)
            x1 = min(cx + half, w)
            y0 = max(cy - half, 0)
            y1 = min(cy + half, h)

            crop = img[y0:y1, x0:x1]

        # 4) Resize everything to CNN input size
        resized = cv2.resize(crop, output_size, interpolation=cv2.INTER_AREA)

        # 5) Convert to 3-channel (if using a pre-trained RGB model)
        img_input = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)

        # 6) Store image + label
        samples.append((img_input, label, refnum))

    return samples


In [4]:
def build_samples_uniform(img_dir, info_df,
                          output_size=(1024, 1024)):  # choose full size here
    samples = []
    for filename in os.listdir(img_dir):
        if not filename.lower().endswith('.pgm'):
            continue

        refnum = os.path.splitext(filename)[0]
        row    = info_df[info_df['REFNUM'] == refnum]
        if row.empty:
            continue
        label = int(row['CANCER'].iloc[0])

        # load full grayscale
        img = cv2.imread(os.path.join(img_dir, filename), cv2.IMREAD_GRAYSCALE)

        # **Ignore ROI entirely**; use whole image
        crop = img

        # **Always** resize
        resized = cv2.resize(crop, output_size, interpolation=cv2.INTER_AREA)

        # to 3-ch
        img_input = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)

        samples.append((img_input, label, refnum))
    return samples


In [6]:

import math
def refnum_p_id(refnum):
    num = int(refnum.replace('mdb', ''))
    return math.ceil(num / 2) - 1
    
img_dir = 'all-mias'
info_df = df

# # Use it:
samples = build_samples_uniform(img_dir, info_df)

# Now unpack:
images, labels, refnums = zip(*samples)

# Compute group for each refnum:
groups = [refnum_p_id(r) for r in refnums]


In [7]:

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(images, labels, groups=groups))

# Build your final train/val lists
X_train = [images[i] for i in train_idx]
y_train = [labels[i] for i in train_idx]
X_val   = [images[i] for i in val_idx]
y_val   = [labels[i] for i in val_idx]


In [8]:
class MammogramSequence(tf.keras.utils.Sequence):
    def __init__(self, images, labels, batch_size=16, augment=False):
        """
        images: list of NumPy arrays, one per sample
        labels: list/array of 0/1 labels
        batch_size: how many samples per batch
        augment: whether to apply random transforms
        """
        self.images   = images
        self.labels   = np.array(labels)
        self.batch_size = batch_size
        self.augment    = augment
        self.indices = np.arange(len(images))

    def __len__(self):
        # how many batches in one epoch?
        return int(np.ceil(len(self.images) / self.batch_size))

    def __getitem__(self, idx):
        """
        Called by Keras during training/validation.
        idx: batch index [0 .. __len__()-1]
        """
        # 1) select the slice of indices for this batch
        batch_idxs = self.indices[idx*self.batch_size : (idx+1)*self.batch_size]

        batch_x = []
        for i in batch_idxs:
            img = self.images[i].astype('float32') / 255.0  # normalize
            if self.augment:
                # Apply random flips
                img = tf.image.random_flip_left_right(img)
                img = tf.image.random_flip_up_down(img)
                # Random brightness/contrast
                img = tf.image.random_brightness(img, 0.1)
                img = tf.image.random_contrast(img, 0.1, 0.2)
            batch_x.append(img)

        # 2) stack into one array of shape (batch_size, H, W, C)
        batch_x = np.stack(batch_x, axis=0)

        # 3) select corresponding labels
        batch_y = self.labels[batch_idxs]

        return batch_x, batch_y

    def on_epoch_end(self):
        # Shuffle your data at the end of each epoch (if training)
        if self.augment:
            np.random.shuffle(self.indices)


train_seq = MammogramSequence(X_train, y_train, batch_size=16, augment=True)
val_seq   = MammogramSequence(X_val,   y_val,   batch_size=16, augment=False)


In [10]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models, optimizers

# 1) Load the base model (without its top), freeze it initially
base = ResNet50(include_top=False,
                weights='imagenet',
                input_shape=(1024,1024,3))
base.trainable = False

# 2) Add a custom head
x = layers.GlobalAveragePooling2D()(base.output)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)

output = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(inputs=base.input, outputs=output)

# 3) Compile with appropriate loss & metrics
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)


In [None]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.1, patience=3
    )
]

# Stage 1: only the head
history1 = model.fit(
    train_seq,
    validation_data=val_seq,
    epochs=20,
    callbacks=callbacks
)

# Stage 2: unfreeze some of the base for fine-tuning
base.trainable = True
# Optionally freeze most layers, only fine-tune last block(s)
for layer in base.layers[:-10]:
    layer.trainable = False

model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy','precision','recall']
)

history2 = model.fit(
    train_seq,
    validation_data=val_seq,
    epochs=10,
    callbacks=callbacks
)


Epoch 1/20


  self._warn_if_super_not_called()


[1m 4/17[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m4:02[0m 19s/step - accuracy: 0.6992 - loss: 1.5445 - precision: 0.0714 - recall: 0.0278        

In [None]:
# 1) Predictions on validation set
y_prob = model.predict(val_seq)
y_pred = (y_prob > 0.5).astype(int).flatten()
y_true = np.array(y_val)

# 2) Classification report
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

print(classification_report(y_true, y_pred,
    target_names=['No Cancer','Cancer']))
print("ROC AUC:", roc_auc_score(y_true, y_prob))

# 3) Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

## The Diffrence between 244 img size and full size which is 1024

In [45]:
# 1) Predictions on validation set
y_prob = model.predict(val_seq)
y_pred = (y_prob > 0.5).astype(int).flatten()
y_true = np.array(y_val)

# 2) Classification report
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

print(classification_report(y_true, y_pred,
    target_names=['No Cancer','Cancer']))
print("ROC AUC:", roc_auc_score(y_true, y_prob))

# 3) Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step
              precision    recall  f1-score   support

   No Cancer       0.64      1.00      0.78        42
      Cancer       0.00      0.00      0.00        24

    accuracy                           0.64        66
   macro avg       0.32      0.50      0.39        66
weighted avg       0.40      0.64      0.49        66

ROC AUC: 0.7341269841269842
Confusion Matrix:
 [[42  0]
 [24  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
