In [6]:
import matplotlib.pyplot as plt
import glob
from PIL import Image, ImageDraw, ImageFont
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.model_selection import train_test_split
import albumentations as A
from glob import glob # Used to easily find file paths


import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
col_names = ['REFNUM', 'BG', 'CLASS', 'SEVERITY', 'X', 'Y', 'RADIUS']
df = pd.read_csv('data2.txt', sep="\s+", names=col_names, header=None)
df['CANCER'] = df['SEVERITY'].apply(lambda x: 1 if x in ['B', 'M'] else 0)
df.head(5)

Unnamed: 0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,CANCER
0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS,0
1,mdb001,G,CIRC,B,535,425,197,1
2,mdb002,G,CIRC,B,522,280,69,1
3,mdb003,D,NORM,,,,,0
4,mdb004,D,NORM,,,,,0


NameError: name 'img_value' is not defined

In [7]:
def data_labeling(img_files, txt_path):
    full_data = []
    cordinates = []
    # count_cancer = 0
    # img_coordinate = defaultdict(list)
    
    for i, filename in enumerate(sorted(os.listdir(img_files))): # Opens the image file and go throuth all the image 
        if filename.endswith(".pgm"): # display only if the image is pgm
            image_path = os.path.join(img_files, filename) # Getting the Image path EX => all-mias\mdb001.pgm
            text = txt_file[i].strip() # spliting the data in the text file 
            pairing = {"Image": image_path, "Text": text} # putting both image and the text in the dictinory 
            full_data.append(pairing) # adding all the data to the list 

    for pairing in full_data:
        txt_value = pairing['Text'] # ceperating the image with the text
        img_value = pairing['Image']
        
        img = cv2.imread(img_value) # creating the array
        txt_parts = txt_value.split() # spleting text in to multiple in array so to filter the data which they have the cordinates
        
        # Converting the image to GRAY and then to RGB and prepare for drawing
        img_gray = cv2.imread(img_value, cv2.IMREAD_GRAYSCALE)  
        img_rgb = cv2.cvtColor(img_gray, cv2.COLOR_GRAY2RGB)
       
        
        # Cheking if the text line contain the cordinate or not
        if len(txt_parts) == 7 and img is not None: 
            # Getting the cordinate for each image one by one 

            get_txt_data = txt_parts[4] + " " + txt_parts[5] + " " +txt_parts[6] # joining the X , Y , R
            x, y, r = map(int, get_txt_data.split())
            
            y_adj = 1024 - y
            cv2.circle(img_rgb, (x, y_adj), r, (0,255,0), 3)
            
            mask = np.zeros(img_rgb.shape[:2], dtype=np.uint8)
            cv2.circle(mask, (x, y_adj), r, 255, -1)
            roi = cv2.bitwise_and(img_rgb, img_rgb, mask=mask)

            # plt.imshow(img_rgb, cmap='gray')
            # plt.title("Example .pgm Image")
            # plt.axis('off')  # Hide axis ticks
            # plt.show()

# reading the Image file 
images_path = "all-mias"

# reading the txt file 
txt_path = "data2.txt"
with open(txt_path, "r") as file:
    txt_file = file.readlines()[1:]

data_labeling(images_path, txt_path)


In [None]:
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
TRAIN_DIR = 'dataset_for_training/train/'
VALIDATION_DIR = 'dataset_for_training/validation/'

# --- Step 1: Define the Albumentations Transform Pipeline ---
# This is where you define all your desired augmentations.
# These will be applied to the training images only.
# This is a powerful set of augmentations suitable for medical images.
train_transform = A.Compose([
    # Geometric transformations
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, p=0.7),
    A.ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
    A.GridDistortion(p=0.5),
    
    # Brightness and contrast transformations
    A.RandomBrightnessContrast(p=0.7),
    A.CLAHE(p=0.8), # This is excellent for enhancing contrast in medical images
    
    # Noise and blur
    A.GaussNoise(p=0.5),
    A.Blur(blur_limit=3, p=0.5),
])

# For validation, we only need to resize and rescale. No random augmentation.
validation_transform = A.Compose([
    # Validation data should not be augmented randomly
])


# --- Step 2: Build the tf.data Pipeline (Replaces ImageDataGenerator) ---

# First, get all the file paths and their corresponding labels.
train_image_paths = glob(os.path.join(TRAIN_DIR, '*/*.png'))
validation_image_paths = glob(os.path.join(VALIDATION_DIR, '*/*.png'))

# Create labels from the folder names (0 for 'cancer', 1 for 'normal')
# Note: Keras's flow_from_directory sorts class names alphabetically.
# 'cancer' comes before 'normal', so Keras assigns it class 0.
train_labels = [0 if 'cancer' in path else 1 for path in train_image_paths]
validation_labels = [0 if 'cancer' in path else 1 for path in validation_image_paths]


def load_and_preprocess_image(image_path, label):
    """Loads, decodes, and resizes an image."""
    # Read the image file
    image = tf.io.read_file(image_path)
    # Decode to a tensor. We specify 3 channels as the model expects it.
    image = tf.io.decode_png(image, channels=3)
    # Resize the image to the target size
    image = tf.image.resize(image, [IMG_SIZE[0], IMG_SIZE[1]])
    # Rescale pixel values to [0, 1]
    image = image / 255.0
    return image, label

def apply_augmentations(image, label, transform):
    """A wrapper function to apply Albumentations transforms within TensorFlow."""
    def augment(img):
        aug_data = transform(image=img.numpy())
        return aug_data['image']

    # Use tf.py_function to run the python-based Albumentations library
    # The [image] and Tout=[tf.float32] define the input and output types.
    aug_image = tf.py_function(func=augment, inp=[image], Tout=tf.float32)
    # Make sure the output shape is set correctly
    aug_image.set_shape([IMG_SIZE[0], IMG_SIZE[1], 3])
    return aug_image, label


# Create the final training dataset object
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))
train_dataset = (
    train_dataset.shuffle(buffer_size=len(train_image_paths)) # Shuffle the data
    .map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE) # Load and resize
    .map(lambda x, y: apply_augmentations(x, y, train_transform), num_parallel_calls=tf.data.AUTOTUNE) # Apply augmentations
    .batch(BATCH_SIZE) # Create batches
    .prefetch(buffer_size=tf.data.AUTOTUNE) # Pre-load the next batch for performance
)

# Create the final validation dataset object (without augmentation)
validation_dataset = tf.data.Dataset.from_tensor_slices((validation_image_paths, validation_labels))
validation_dataset = (
    validation_dataset
    .map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE) # Just load and resize
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# --- The rest of your code is now UNCHANGED ---

# --- 3. Build the Model with Transfer Learning (Unchanged) ---
base_model = tf.keras.applications.EfficientNetB0(weights='imagenet', include_top=False, input_shape=(*IMG_SIZE, 3))
base_model.trainable = False
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dropout(0.5)(x)
predictions = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=base_model.input, outputs=predictions)

# --- 4. Compile the Model (Unchanged) ---
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# --- 5. Train the Model (Updated to use the new datasets) ---
print("Starting model training with Albumentations and tf.data pipeline...")
history = model.fit(
    train_dataset, # <-- Use the new training dataset
    epochs=20,
    validation_data=validation_dataset # <-- Use the new validation dataset
)

# --- 6. Save your trained model (Unchanged) ---
model.save('breast_cancer_classifier_v2_albumentations.h5')

  original_init(self, **validated_kwargs)
  A.ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),


Starting model training with Albumentations and tf.data pipeline...
Epoch 1/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1s/step - accuracy: 0.6468 - auc: 0.5298 - loss: 0.6788 - val_accuracy: 0.6406 - val_auc: 0.5000 - val_loss: 0.6640
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 946ms/step - accuracy: 0.6411 - auc: 0.4516 - loss: 0.6632 - val_accuracy: 0.6406 - val_auc: 0.5000 - val_loss: 0.6548
Epoch 3/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 951ms/step - accuracy: 0.6500 - auc: 0.4595 - loss: 0.6524 - val_accuracy: 0.6406 - val_auc: 0.5000 - val_loss: 0.6528
Epoch 4/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 964ms/step - accuracy: 0.6511 - auc: 0.5338 - loss: 0.6456 - val_accuracy: 0.6406 - val_auc: 0.5000 - val_loss: 0.6530
Epoch 5/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 957ms/step - accuracy: 0.6453 - auc: 0.4211 - loss: 0.6550 - val_accuracy: 0.6406 - val_auc: 

In [1]:
# img = [].
# for file in img_files:
#     image = Image.open(file)
#     img.append(image)
# # images = [Image.open(file) for file in img_files]

# columns = 5
# rows = len(img) // columns + int(len(img) % columns !=0)

# fig, axes = plt.subplots(rows, columns, figsize=(15,15))
# axes = axes.flatten()

# for idx, image in enumerate(img):
#     axes[idx].imshow(image, cmap='gray')
#     axes[idx].axis('off')
#     axes[idx].set_title(f"Image {idx+1}")

# for ax in axes[len(img):]:
#     ax.axis('off')



# print(f"found {len(img_files)} .pgm images")
# plt.tight_layout()
# plt.show()
#####################################################################
# cols = 5
# rows = len(images) // cols + int(len(images) % cols != 0)

# # Create subplots
# # fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
# fig, axes = plt.subplots(rows, cols, figsize=(15, 15))

# axes = axes.flatten()  # Flatten in case of 2D array of axes

# # Loop through images and plot each one
# for idx, img in enumerate(images):
#     axes[idx].imshow(img, cmap='gray')
#     axes[idx].axis('off')
#     axes[idx].set_title(f"Image {idx+1}")

# # Turn off any unused subplots
# for ax in axes[len(images):]:
#     ax.axis('off')

# plt.tight_layout()
# plt.show()

In [None]:
# # reading the Image file 
# img_files = glob.glob("all-mias")
# # img_files = "all-mias"
# print(f"found {len(img_files)} .pmg images")

# # reading the txt file 
# txt_path = "info.txt"
# with open(txt_path, "r") as file:
#     txt_file = file.readlines()

# first_img = Image.open(img_files[11])
# plt.imshow(first_img, cmap='gray')
# plt.title("Example .pgm Image")
# plt.axis('off')  # Hide axis ticks
# plt.show()

In [None]:
# def data_labeling(img_files, txt_path):
#     full_data = []
#     count_cancer = 0
#     img_coordinate = defaultdict(list)
#     for i, filename in enumerate(sorted(os.listdir(img_files))):
#         if filename.endswith(".pgm"):
#             image_path = os.path.join(img_files, filename)
#             text = txt_file[i].strip()

#             pairing = {"Image": image_path, "Text": text}
#             full_data.append(pairing)

#     for pairing in full_data:
#         txt_value = pairing['Text']
#         img_value = pairing['Image']
#         img = cv2.imread(img_value)
#         txt_parts = txt_value.split()
#         # print(f" {txt_value} {img_value}")
#         # cordinate = {}
        
#         if len(txt_parts) == 7 and img is not None:
#             count_cancer += 1
#             get_txt_data = txt_parts[4] + " - " + txt_parts[5] + " - " +txt_parts[6] + " - " +img_value[9:]
#             # print(" x      y    R")
#             # print(get_txt_data)
#             sorted_mdb = sorted(txt_value, key=lambda item: item[-1])
#             for x, y, r in sorted_mdb:
#                 y_adj = 1024 - y  # adjust y-coordinate
#                 cv2.circle(img_value, (x, y_adj), r, (0, 255, 0), 2)

#         if sorted_mdb:
#             x, y, r = sorted_mdb[0]
#             y_adj = 1024 - y
#             mask = np.zeros(img_value.shape[:2], dtype=np.uint8)
#             cv2.circle(mask, (x, y_adj), r, 255, -1)
#             roi = cv2.bitwise_and(img_value, img_value, mask=mask)


#         plt.imshow(img_rgb, cmap='gray')
#         plt.title("Example .pgm Image")
#         plt.axis('off')  # Hide axis ticks
#         plt.show()
        


# # reading the Image file 
# images_path = "all-mias"

# # reading the txt file 
# txt_path = "Info.txt"
# with open(txt_path, "r") as file:
#     txt_file = file.readlines()

# data_labeling(images_path, txt_path)
