Deep Feature Extraction from trained DL Model

In [None]:

from tensorflow.keras.models import load_model, Model
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import os
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Function to print with Markdown
def printmd(string):
    """Show ``string`` in the notebook output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)


# Load and preprocess dataset
image_dir = Path(r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\dataset')

# Sort the glob result: directory listing order depends on the platform, and
# the seeded shuffle/splits below are only reproducible when they start from
# a fixed ordering.
filepaths = sorted(image_dir.glob(r'**/*.png'))
# Each image is labelled with the name of the folder that directly contains it.
labels = [path.parent.name for path in filepaths]

filepaths = pd.Series(filepaths, name='Filepath').astype(str)
labels = pd.Series(labels, name='Label')

image_df = pd.concat([filepaths, labels], axis=1)
# Seed the shuffle as well; an unseeded shuffle here would make the
# train/validation/test partitions differ on every run even though the
# splits themselves use a fixed random_state.
image_df = image_df.sample(frac=1, random_state=1).reset_index(drop=True)

# Split data into training, validation, and test sets:
# 70% train, then the remaining 30% is halved into validation and test.
train_df, temp_df = train_test_split(image_df, train_size=0.7, shuffle=True, random_state=1)
val_df, test_df = train_test_split(temp_df, train_size=0.5, shuffle=True, random_state=1)

# Create data generators with augmentation
def create_gen():
    """Build the training, validation and test image iterators.

    Reads the module-level ``train_df``, ``val_df`` and ``test_df`` frames
    (columns ``Filepath`` and ``Label``). Every image goes through the
    MobileNetV2 ``preprocess_input`` function; only the training iterator
    additionally applies random augmentation and shuffling.

    Returns:
        Tuple ``(train_images, val_images, test_images)`` of iterators
        created with ``flow_from_dataframe``.
    """
    preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

    # Random geometric augmentation is applied to the training stream only.
    augmenting_datagen = ImageDataGenerator(
        preprocessing_function=preprocess,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )
    plain_datagen = ImageDataGenerator(preprocessing_function=preprocess)

    # Arguments that are identical for all three iterators.
    shared_flow_args = {
        'x_col': 'Filepath',
        'y_col': 'Label',
        'target_size': (224, 224),
        'color_mode': 'rgb',
        'class_mode': 'categorical',
        'batch_size': 32,
    }

    train_images = augmenting_datagen.flow_from_dataframe(
        dataframe=train_df, shuffle=True, seed=0, **shared_flow_args
    )
    # Validation and test keep the dataframe order (no shuffling).
    val_images = plain_datagen.flow_from_dataframe(
        dataframe=val_df, shuffle=False, **shared_flow_args
    )
    test_images = plain_datagen.flow_from_dataframe(
        dataframe=test_df, shuffle=False, **shared_flow_args
    )

    return train_images, val_images, test_images


# Restore the fine-tuned network from its .h5 file
model_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\Mobilenetv2_finetuned_on_breastcancer_dataset.h5'
loaded_model = load_model(model_path)

# Feature extraction model: same input as the loaded network, but its output
# is taken from the fourth layer counted from the end. Change the index to
# tap a different layer.
feature_layer = loaded_model.layers[-4]
feature_extractor = Model(inputs=loaded_model.input, outputs=feature_layer.output)

# Folder that receives the pickled features (created if it does not exist)
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\extracted_features'
os.makedirs(feature_dir, exist_ok=True)

def extract_features(data_gen, set_name):
    """Run one pass of ``data_gen`` through ``feature_extractor`` and pickle the result.

    Args:
        data_gen: Batch iterator that supports ``len()`` (number of batches)
            and integer indexing, where each item is an ``(images, labels)``
            pair, e.g. one of the iterators returned by ``create_gen``.
        set_name: Prefix of the output file; the data is written to
            ``<set_name>_features.pkl`` inside the module-level ``feature_dir``.

    The pickle contains a ``(features, labels)`` tuple of NumPy arrays whose
    rows follow the order in which the batches were produced.
    """
    feature_batches = []
    label_batches = []

    # Index exactly one epoch explicitly instead of iterating the generator
    # and waiting for its internal ``batch_index`` counter to wrap back to 0:
    # that check skips batches when the iterator was already partly consumed.
    for batch_idx in range(len(data_gen)):
        batch_images, batch_labels = data_gen[batch_idx]
        # Extract features for the batch
        feature_batches.append(feature_extractor.predict(batch_images))
        label_batches.append(batch_labels)

    # Join the per-batch arrays along the sample axis; an empty generator
    # yields empty arrays rather than an error from np.concatenate.
    features = np.concatenate(feature_batches) if feature_batches else np.array([])
    labels = np.concatenate(label_batches) if label_batches else np.array([])

    # Save extracted features and labels to a file
    with open(os.path.join(feature_dir, f"{set_name}_features.pkl"), 'wb') as f:
        pickle.dump((features, labels), f)

# Build the three image iterators.
# NOTE(review): the training iterator applies random augmentation, so the
# saved "train" features are computed on augmented images — confirm intended.
train_images, val_images, test_images = create_gen()

# Extract and pickle the features of each split, in train/val/test order
for split_name, split_images in (
    ("train", train_images),
    ("val", val_images),
    ("test", test_images),
):
    extract_features(split_images, split_name)

print("Features extracted and saved.")


Classification with a decision tree using the saved deep features

In [None]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle
import os
import numpy as np

# Where the pickled deep features live and where fitted models are stored
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\extracted_features'
model_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\saved_models'
os.makedirs(model_dir, exist_ok=True)


def _read_feature_file(file_name):
    """Return the ``(features, labels)`` tuple pickled in ``feature_dir/file_name``."""
    with open(os.path.join(feature_dir, file_name), 'rb') as handle:
        return pickle.load(handle)


# Load the three feature sets written by the extraction step
train_features, train_labels = _read_feature_file("train_features.pkl")
val_features, val_labels = _read_feature_file("val_features.pkl")
test_features, test_labels = _read_feature_file("test_features.pkl")

# The stored labels are one-hot rows; reduce each row to its class index
train_labels = train_labels.argmax(axis=1)
val_labels = val_labels.argmax(axis=1)
test_labels = test_labels.argmax(axis=1)

# Fit a decision tree on the training features only
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_features, train_labels)

# Persist the fitted tree next to the other saved models
tree_model_path = os.path.join(model_dir, 'decision_tree_model.pkl')
with open(tree_model_path, 'wb') as f:
    pickle.dump(clf, f)

# Report accuracy, confusion matrix and per-class metrics for every split
evaluation_sets = (
    ("Train", train_features, train_labels),
    ("Validation", val_features, val_labels),
    ("Test", test_features, test_labels),
)
for set_name, features, labels in evaluation_sets:
    predictions = clf.predict(features)
    accuracy = accuracy_score(labels, predictions)
    print(f"{set_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{set_name} Confusion Matrix:\n", confusion_matrix(labels, predictions))
    print(f"{set_name} Classification Report:\n", classification_report(labels, predictions))

print(f"Decision Tree model saved to: {tree_model_path}")


Classification using RuleFit on the extracted deep features

In [None]:
import os
import pickle
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix

# Imported here so this cell runs on its own; the cell's import list above
# only brings in accuracy_score and confusion_matrix.
from sklearn.metrics import classification_report

# Directory to save extracted features and models
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\extracted_features'
model_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\saved_models'
os.makedirs(model_dir, exist_ok=True)

# Load extracted features; each pickle holds a (features, labels) tuple
with open(os.path.join(feature_dir, "train_features.pkl"), 'rb') as f:
    train_features, train_labels = pickle.load(f)

with open(os.path.join(feature_dir, "val_features.pkl"), 'rb') as f:
    val_features, val_labels = pickle.load(f)

with open(os.path.join(feature_dir, "test_features.pkl"), 'rb') as f:
    test_features, test_labels = pickle.load(f)

# Check shapes of features and labels
print("Train Features Shape:", train_features.shape)
print("Train Labels Shape:", train_labels.shape)
print("Validation Features Shape:", val_features.shape)
print("Validation Labels Shape:", val_labels.shape)

# Ensure labels are aligned with features (the test set is checked too,
# since it is what the reported metrics are computed on)
assert train_features.shape[0] == train_labels.shape[0], "Mismatch between train features and labels."
assert val_features.shape[0] == val_labels.shape[0], "Mismatch between validation features and labels."
assert test_features.shape[0] == test_labels.shape[0], "Mismatch between test features and labels."

# Convert one-hot label rows to single class indices
train_labels = np.argmax(train_labels, axis=1)
val_labels = np.argmax(val_labels, axis=1)
test_labels = np.argmax(test_labels, axis=1)

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=1.0, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding.
# NOTE(review): rounding treats the class indices as ordered numbers; confirm
# this is intended for the label set (the rulefit package appears to offer a
# classification mode as well — TODO confirm).
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)
# The split label is spelled out: this cell never defines ``set_name``, so
# using it here only worked when a previous cell had left it behind.
print("Test Confusion Matrix:\n", conf_matrix)
print("Test Classification Report:\n", classification_report(test_labels, test_predictions_discrete))

# Extract rules from the RuleFit model, keeping only those with a non-zero
# coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to a text file (written in CSV format)
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_deep_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")
