HOG Features

In [None]:
import os
import numpy as np
import pandas as pd
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from skimage.transform import resize

# Paths
# dataset_path: root folder scanned by process_hog_features(); each
# sub-folder is treated as one class.
# output_path: receives the temporary feature CSV and the train/val/test CSVs.
dataset_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\dataset"
output_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\featuresHOG"

# Create output path if it does not exist
os.makedirs(output_path, exist_ok=True)

# Label mapping: lowercase class-folder name -> integer target.
label_mapping = {'benign': 0, 'malignant': 1}  # All lowercase for matching

# Convert an image of any supported layout to a fixed-size grayscale array
def preprocess_image(image, target_size=(128, 128)):
    """Return *image* as a grayscale array resized to *target_size*.

    Args:
        image: Array whose ``shape`` has 2 entries (grayscale) or 3 entries
            (colour). A fourth channel is treated as alpha and discarded.
        target_size: ``(rows, cols)`` passed to ``skimage.transform.resize``.

    Raises:
        ValueError: If the array is neither 2-D nor 3-D.
    """
    n_dims = len(image.shape)
    if n_dims not in (2, 3):
        raise ValueError("Image must be 2D (grayscale) or 3D (RGB/RGBA)")

    if n_dims == 3:
        # Drop the alpha channel before the luminance conversion.
        colour = image[:, :, :3] if image.shape[2] == 4 else image
        image = rgb2gray(colour)

    # A small fixed size keeps the HOG descriptor length constant across images.
    return resize(image, target_size, anti_aliasing=True)

# Compute the HOG descriptor of a preprocessed image
def calculate_hog_features(image):
    """Return the flattened HOG feature vector of *image*.

    Uses 9 orientation bins, 8x8-pixel cells and 2x2-cell blocks;
    ``feature_vector=True`` makes ``hog`` return a 1-D array.
    """
    hog_settings = {
        'orientations': 9,
        'pixels_per_cell': (8, 8),
        'cells_per_block': (2, 2),
        'feature_vector': True,
    }
    return hog(image, **hog_settings)

# Walk the dataset folder and build the HOG feature table
def process_hog_features(dataset_path):
    """Extract HOG features for every image under *dataset_path*.

    Each sub-folder whose lowercased name is a key of the module-level
    ``label_mapping`` is scanned for png/jpg/jpeg files. Rows are appended to
    ``features_temp.csv`` inside the module-level ``output_path`` in batches
    of 1000, and the whole file is read back at the end.

    Args:
        dataset_path: Root folder containing one sub-folder per class.

    Returns:
        A DataFrame with ``hog_<i>`` columns plus ``label``, or an empty
        DataFrame when nothing was written.
    """
    image_suffixes = ('png', 'jpg', 'jpeg')
    batch_size = 1000
    temp_file_path = os.path.join(output_path, 'features_temp.csv')

    def append_rows(rows):
        # The header is only written by the call that creates the file.
        needs_header = not os.path.exists(temp_file_path)
        pd.DataFrame(rows).to_csv(temp_file_path, mode='a', header=needs_header, index=False)

    # A leftover file from a previous run would be appended to; start clean.
    if os.path.exists(temp_file_path):
        os.remove(temp_file_path)

    pending = []
    for label in os.listdir(dataset_path):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        # Folder names are compared case-insensitively with the mapping keys.
        label_mapped = label_mapping.get(label.lower(), -1)
        if label_mapped == -1:
            print(f"Skipping folder with unexpected label: {label}")
            continue

        for file_name in os.listdir(label_path):
            if not file_name.lower().endswith(image_suffixes):
                continue
            file_path = os.path.join(label_path, file_name)

            try:
                descriptor = calculate_hog_features(preprocess_image(imread(file_path)))

                row = {f'hog_{i}': val for i, val in enumerate(descriptor)}
                row['label'] = label_mapped
                pending.append(row)

                # Flush periodically so the in-memory list stays bounded.
                if len(pending) >= batch_size:
                    append_rows(pending)
                    pending = []

            except Exception as e:
                # Best effort: a failing image is reported and skipped.
                print(f"Skipping {file_name} due to error: {e}")

    # Write whatever is left from the last partial batch.
    if pending:
        append_rows(pending)

    if not os.path.exists(temp_file_path):
        print("No features were extracted. Please check the dataset.")
        return pd.DataFrame()
    return pd.read_csv(temp_file_path)

# Extract features, then write stratified train/val/test CSVs
features_df = process_hog_features(dataset_path)
if features_df.empty:
    print("No features were extracted. Please check the dataset.")
elif 'label' not in features_df.columns:
    # Without the target column the table cannot be split or used for training.
    print("Error: 'label' column not found in the features dataframe.")
else:
    y = features_df['label']
    X = features_df.drop('label', axis=1)

    # 70 % train; the remaining 30 % is halved into validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, train_size=0.7, stratify=y, random_state=1)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, train_size=0.5, stratify=y_temp, random_state=1)

    # Save features (label column is appended last in every file)
    for csv_name, X_part, y_part in (('train.csv', X_train, y_train),
                                     ('val.csv', X_val, y_val),
                                     ('test.csv', X_test, y_test)):
        pd.concat([X_part, y_part], axis=1).to_csv(os.path.join(output_path, csv_name), index=False)

    print(f"Features saved to {output_path}")
    print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")


Classification of HOG features using decision tree 

In [None]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Paths to the combined feature files
combined_features_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\featuresHOG"

# Load the train, val, and test CSV files
train_df = pd.read_csv(os.path.join(combined_features_path, "train.csv"))
val_df = pd.read_csv(os.path.join(combined_features_path, "val.csv"))
test_df = pd.read_csv(os.path.join(combined_features_path, "test.csv"))

# Train and validation rows are pooled: GridSearchCV performs its own 5-fold
# cross-validation, and the test split is only used for the final evaluation.
combined_train_val_df = pd.concat([train_df, val_df], ignore_index=True)

# Split the features (X) and labels (y)
X_train_val = combined_train_val_df.drop('label', axis=1)
y_train_val = combined_train_val_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# ---- Grid Search for Hyperparameter Optimization ----
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

dt_classifier = DecisionTreeClassifier(random_state=1)

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_val, y_train_val)

best_params = grid_search.best_params_
print("Optimized Hyperparameters:")
print(best_params)

# Best configuration found by the search.
best_dt_classifier = grid_search.best_estimator_


# ---- Evaluate the Optimized Decision Tree ----
def _evaluate_split(model, split_name, X, y_true, cmap):
    """Print and plot the confusion matrix, report and accuracy for one split.

    The class list comes from ``model.classes_`` and is used both for the
    confusion-matrix layout and for the heatmap tick labels, so the plot
    always has exactly one row/column per class the model knows.

    Returns:
        ``(y_pred, confusion_matrix, accuracy)`` for the split.
    """
    class_ids = list(model.classes_)
    class_names = [f'Class {c}' for c in class_ids]

    y_pred = model.predict(X)

    # Pin the label order so the matrix shape matches class_names even when a
    # class is absent from this split.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    print(f"\n{split_name} Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{split_name} Confusion Matrix')
    plt.show()

    print(f"\n{split_name} Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))

    accuracy = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy: {accuracy:.4f}")
    return y_pred, cm, accuracy


# Training Evaluation (on the pooled train+val rows the model was fitted on)
y_train_pred, train_cm, train_accuracy = _evaluate_split(
    best_dt_classifier, "Training", X_train_val, y_train_val, 'Greens')

# Testing Evaluation (held-out rows)
y_test_pred, test_cm, test_accuracy = _evaluate_split(
    best_dt_classifier, "Testing", X_test, y_test, 'Blues')


Hu moments feature extraction

In [None]:
import os
import pandas as pd
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.measure import moments_hu
from sklearn.model_selection import train_test_split

# Paths
# dataset_path: root folder scanned by process_hu_features(); each sub-folder
# is treated as one class.
# output_path: folder that receives train.csv / val.csv / test.csv.
dataset_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\dataset"
output_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\HuMoments"

# Label mapping: lowercase class-folder name -> integer target.
label_mapping = {'benign': 0, 'malignant': 1}  # All lowercase for matching

# Reduce an image of any supported layout to a single grayscale plane
def preprocess_image(image):
    """Return a grayscale version of *image*.

    A 2-D array is returned unchanged (no dtype or range conversion). A 3-D
    array is passed through ``rgb2gray`` after discarding a fourth (alpha)
    channel if present.

    Raises:
        ValueError: If the array is neither 2-D nor 3-D.
    """
    n_dims = len(image.shape)
    if n_dims == 2:
        return image
    if n_dims != 3:
        raise ValueError("Image must be 2D (grayscale) or 3D (RGB/RGBA)")

    if image.shape[2] == 4:
        image = image[:, :, :3]  # keep RGB, drop alpha
    return rgb2gray(image)

# Function to calculate Hu moments
def calculate_hu_moments(image):
    """Return the seven Hu moment invariants of a 2-D grayscale image.

    ``skimage.measure.moments_hu`` takes *normalized central moments* as
    input, not pixel data. Passing the image directly makes it read a handful
    of pixels as if they were moments. The image is therefore first reduced to
    central moments (about its intensity centroid), then normalized, and only
    then converted to Hu invariants.

    Args:
        image: 2-D grayscale array.

    Returns:
        Array of the 7 Hu moments.
    """
    # Local import: the cell-level import only brings in moments_hu.
    from skimage.measure import moments_central, moments_normalized

    central = moments_central(image)          # translation-invariant
    normalized = moments_normalized(central)  # scale-invariant
    return moments_hu(normalized)

# Process dataset and extract Hu moments
def process_hu_features(dataset_path):
    """Extract Hu-moment features for every image under *dataset_path*.

    Each sub-folder is one class. Its name is lowercased before the lookup in
    the module-level ``label_mapping`` (whose keys are lowercase), so folders
    such as ``Benign`` or ``MALIGNANT`` are recognised instead of being
    skipped. Only png/jpg/jpeg files are read.

    Args:
        dataset_path: Root folder containing one sub-folder per class.

    Returns:
        A DataFrame with ``hu_moment_<i>`` columns plus ``label``; empty when
        no image could be processed.
    """
    data = []
    for label in os.listdir(dataset_path):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        # Case-insensitive match against the lowercase mapping keys.
        label_mapped = label_mapping.get(label.lower(), -1)
        if label_mapped == -1:
            print(f"Skipping folder with unexpected label: {label}")
            continue

        for file_name in os.listdir(label_path):
            file_path = os.path.join(label_path, file_name)
            if not file_name.lower().endswith(('png', 'jpg', 'jpeg')):
                continue

            try:
                image = imread(file_path)
                grayscale = preprocess_image(image)
                hu_features = calculate_hu_moments(grayscale)

                features = {f'hu_moment_{i}': val for i, val in enumerate(hu_features)}
                features['label'] = label_mapped
                data.append(features)
            except Exception as e:
                # Best effort: report the failing file and carry on.
                print(f"Skipping {file_name} due to error: {e}")

    return pd.DataFrame(data)

# Extract features, then write stratified train/val/test CSVs
features_df = process_hu_features(dataset_path)
if not features_df.empty:
    y = features_df['label']
    X = features_df.drop('label', axis=1)

    # 70 % train; the remaining 30 % is halved into validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, train_size=0.7, stratify=y, random_state=1)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, train_size=0.5, stratify=y_temp, random_state=1)

    # Save features (label column is appended last in every file)
    os.makedirs(output_path, exist_ok=True)
    for csv_name, X_part, y_part in (('train.csv', X_train, y_train),
                                     ('val.csv', X_val, y_val),
                                     ('test.csv', X_test, y_test)):
        pd.concat([X_part, y_part], axis=1).to_csv(os.path.join(output_path, csv_name), index=False)

    print(f"Features saved to {output_path}")
    print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")
else:
    print("No features were extracted. Please check the dataset.")


Classification of Hu moments features using decision tree

In [None]:


import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Paths to the combined feature files
combined_features_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\HuMoments"

# Load the train, val, and test CSV files
train_df = pd.read_csv(os.path.join(combined_features_path, "train.csv"))
val_df = pd.read_csv(os.path.join(combined_features_path, "val.csv"))
test_df = pd.read_csv(os.path.join(combined_features_path, "test.csv"))

# Train and validation rows are pooled: GridSearchCV performs its own 5-fold
# cross-validation, and the test split is only used for the final evaluation.
combined_train_val_df = pd.concat([train_df, val_df], ignore_index=True)

# Split the features (X) and labels (y)
X_train_val = combined_train_val_df.drop('label', axis=1)
y_train_val = combined_train_val_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# ---- Grid Search for Hyperparameter Optimization ----
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

dt_classifier = DecisionTreeClassifier(random_state=1)

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_val, y_train_val)

best_params = grid_search.best_params_
print("Optimized Hyperparameters:")
print(best_params)

# Best configuration found by the search.
best_dt_classifier = grid_search.best_estimator_


# ---- Evaluate the Optimized Decision Tree ----
def _evaluate_split(model, split_name, X, y_true, cmap):
    """Print and plot the confusion matrix, report and accuracy for one split.

    The class list comes from ``model.classes_`` and is used both for the
    confusion-matrix layout and for the heatmap tick labels, so the plot
    always has exactly one row/column per class the model knows.

    Returns:
        ``(y_pred, confusion_matrix, accuracy)`` for the split.
    """
    class_ids = list(model.classes_)
    class_names = [f'Class {c}' for c in class_ids]

    y_pred = model.predict(X)

    # Pin the label order so the matrix shape matches class_names even when a
    # class is absent from this split.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    print(f"\n{split_name} Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{split_name} Confusion Matrix')
    plt.show()

    print(f"\n{split_name} Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))

    accuracy = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy: {accuracy:.4f}")
    return y_pred, cm, accuracy


# Training Evaluation (on the pooled train+val rows the model was fitted on)
y_train_pred, train_cm, train_accuracy = _evaluate_split(
    best_dt_classifier, "Training", X_train_val, y_train_val, 'Greens')

# Testing Evaluation (held-out rows)
y_test_pred, test_cm, test_accuracy = _evaluate_split(
    best_dt_classifier, "Testing", X_test, y_test, 'Blues')


Color histogram features extraction

In [None]:
import os
import numpy as np
import pandas as pd
from skimage.io import imread
from skimage.color import rgb2hsv
from sklearn.model_selection import train_test_split

# Paths
# dataset_path: root folder scanned by process_color_hist_features(); each
# sub-folder is treated as one class.
# output_path: folder that receives train.csv / val.csv / test.csv.
dataset_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\dataset"
output_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\ColorHistogram"

# Label mapping: lowercase class-folder name -> integer target.
label_mapping = {'benign': 0, 'malignant': 1}  # All lowercase for matching

# Function to preprocess images
def preprocess_image(image):
    """Return *image* as a 3-channel RGB array, or raise if it is not colour.

    The HSV conversion that follows needs exactly three channels, so the
    channel count is validated explicitly: 3 channels pass through, 4
    channels (RGBA) lose their alpha plane, and everything else is rejected.
    Previously any 3-D array other than RGBA was accepted, including 1- or
    2-channel images.

    Args:
        image: Array as returned by ``imread``.

    Returns:
        An array of shape ``(rows, cols, 3)``.

    Raises:
        ValueError: If the array is not 3-D or has a channel count other
            than 3 or 4 (this includes 2-D grayscale images).
    """
    if len(image.shape) != 3:
        raise ValueError("Image must be RGB or RGBA format")

    channels = image.shape[2]
    if channels == 4:
        return image[:, :, :3]  # Discard the alpha channel
    if channels != 3:
        raise ValueError("Image must be RGB or RGBA format")
    return image

# Summarise the per-channel HSV histograms of an RGB image
def calculate_color_histogram(image, bins=10):
    """Return mean and standard deviation of the HSV histogram bin counts.

    The image is converted with ``rgb2hsv``; for each of the three planes a
    histogram with *bins* bins over the range (0, 1) is computed and reduced
    to the mean and standard deviation of its bin counts.

    NOTE(review): if every HSV value lies inside [0, 1], the bin counts sum to
    the pixel count, so ``*_mean`` equals pixels / bins for every channel and
    carries no colour information. Confirm this is intended.

    Args:
        image: RGB array with three channels.
        bins: Number of histogram bins per channel.

    Returns:
        Dict with ``hue_mean``, ``hue_std``, ``saturation_mean``,
        ``saturation_std``, ``value_mean`` and ``value_std``.
    """
    hsv_image = rgb2hsv(image)
    channel_names = ('hue', 'saturation', 'value')

    features = {}
    for position, name in enumerate(channel_names):
        plane = hsv_image[:, :, position]
        counts = np.histogram(plane, bins=bins, range=(0, 1))[0]
        features[f'{name}_mean'] = np.mean(counts)
        features[f'{name}_std'] = np.std(counts)
    return features

# Process dataset and extract color histogram features
def process_color_hist_features(dataset_path, bins=10):
    """Extract colour-histogram features for every image under *dataset_path*.

    Each sub-folder is one class. Its name is lowercased before the lookup in
    the module-level ``label_mapping`` (whose keys are lowercase), so folders
    such as ``Benign`` or ``MALIGNANT`` are recognised instead of being
    skipped. Only png/jpg/jpeg files are read.

    Args:
        dataset_path: Root folder containing one sub-folder per class.
        bins: Number of histogram bins per HSV channel.

    Returns:
        A DataFrame with the histogram statistics plus ``label``; empty when
        no image could be processed.
    """
    data = []
    for label in os.listdir(dataset_path):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        # Case-insensitive match against the lowercase mapping keys.
        label_mapped = label_mapping.get(label.lower(), -1)
        if label_mapped == -1:
            print(f"Skipping folder with unexpected label: {label}")
            continue

        for file_name in os.listdir(label_path):
            file_path = os.path.join(label_path, file_name)
            if not file_name.lower().endswith(('png', 'jpg', 'jpeg')):
                continue

            try:
                image = imread(file_path)
                image = preprocess_image(image)
                features = calculate_color_histogram(image, bins=bins)
                features['label'] = label_mapped
                data.append(features)
            except Exception as e:
                # Best effort: report the failing file and carry on.
                print(f"Skipping {file_name} due to error: {e}")

    return pd.DataFrame(data)

# Extract features, then write stratified train/val/test CSVs
features_df = process_color_hist_features(dataset_path, bins=10)
if not features_df.empty:
    y = features_df['label']
    X = features_df.drop('label', axis=1)

    # 70 % train; the remaining 30 % is halved into validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, train_size=0.7, stratify=y, random_state=1)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, train_size=0.5, stratify=y_temp, random_state=1)

    # Save features (label column is appended last in every file)
    os.makedirs(output_path, exist_ok=True)
    for csv_name, X_part, y_part in (('train.csv', X_train, y_train),
                                     ('val.csv', X_val, y_val),
                                     ('test.csv', X_test, y_test)):
        pd.concat([X_part, y_part], axis=1).to_csv(os.path.join(output_path, csv_name), index=False)

    print(f"Features saved to {output_path}")
    print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")
else:
    print("No features were extracted. Please check the dataset.")


Classification of color histogram features using decision tree

In [None]:

import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Paths to the combined feature files
combined_features_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\ColorHistogram"

# Load the train, val, and test CSV files
train_df = pd.read_csv(os.path.join(combined_features_path, "train.csv"))
val_df = pd.read_csv(os.path.join(combined_features_path, "val.csv"))
test_df = pd.read_csv(os.path.join(combined_features_path, "test.csv"))

# Train and validation rows are pooled: GridSearchCV performs its own 5-fold
# cross-validation, and the test split is only used for the final evaluation.
combined_train_val_df = pd.concat([train_df, val_df], ignore_index=True)

# Split the features (X) and labels (y)
X_train_val = combined_train_val_df.drop('label', axis=1)
y_train_val = combined_train_val_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# ---- Grid Search for Hyperparameter Optimization ----
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

dt_classifier = DecisionTreeClassifier(random_state=1)

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_val, y_train_val)

best_params = grid_search.best_params_
print("Optimized Hyperparameters:")
print(best_params)

# Best configuration found by the search.
best_dt_classifier = grid_search.best_estimator_


# ---- Evaluate the Optimized Decision Tree ----
def _evaluate_split(model, split_name, X, y_true, cmap):
    """Print and plot the confusion matrix, report and accuracy for one split.

    The class list comes from ``model.classes_`` and is used both for the
    confusion-matrix layout and for the heatmap tick labels, so the plot
    always has exactly one row/column per class the model knows.

    Returns:
        ``(y_pred, confusion_matrix, accuracy)`` for the split.
    """
    class_ids = list(model.classes_)
    class_names = [f'Class {c}' for c in class_ids]

    y_pred = model.predict(X)

    # Pin the label order so the matrix shape matches class_names even when a
    # class is absent from this split.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    print(f"\n{split_name} Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{split_name} Confusion Matrix')
    plt.show()

    print(f"\n{split_name} Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))

    accuracy = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy: {accuracy:.4f}")
    return y_pred, cm, accuracy


# Training Evaluation (on the pooled train+val rows the model was fitted on)
y_train_pred, train_cm, train_accuracy = _evaluate_split(
    best_dt_classifier, "Training", X_train_val, y_train_val, 'Greens')

# Testing Evaluation (held-out rows)
y_test_pred, test_cm, test_accuracy = _evaluate_split(
    best_dt_classifier, "Testing", X_test, y_test, 'Blues')


Using the training set to train, the validation set to validate, and the test set to test the model

Frequency-Domain Features Extraction

In [None]:
import os
import numpy as np
import pandas as pd
from skimage.io import imread
from skimage.color import rgb2gray
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import skew, kurtosis
from scipy.fftpack import fft2, fftshift
import pywt

# Dataset paths
# dataset_path: root folder scanned by extract_and_save_frequency_features();
# each sub-folder is treated as one class.
# output_folder_base: parent folder under which the feature CSVs are written.
dataset_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\dataset"
output_folder_base = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\FrequencyFeatures"

# Label mapping: lowercase class-folder name -> integer target.
label_mapping = {'benign': 0, 'malignant': 1}  # All lowercase for matching

# Function to preprocess the image
def preprocess_image(image):
    """Return *image* as a grayscale array on a consistent intensity scale.

    Colour images go through ``rgb2gray``, which yields floats in [0, 1] for
    integer input. A grayscale file loaded as an unsigned-integer array is
    divided by its dtype maximum so it ends up on the same scale. Without
    this, magnitude-based features (energy, peak, ...) would differ by orders
    of magnitude depending only on how the file was stored.

    Args:
        image: Array as returned by ``imread``: 2-D grayscale or 3-D colour.
            A fourth channel is treated as alpha and discarded.

    Returns:
        A 2-D grayscale array. Non-integer 2-D input is returned unchanged.

    Raises:
        ValueError: If the array is neither 2-D nor 3-D.
    """
    n_dims = len(image.shape)
    if n_dims == 3:
        if image.shape[2] == 4:  # RGBA: discard the alpha channel
            image = image[:, :, :3]
        return rgb2gray(image)
    if n_dims != 2:
        raise ValueError("Image must be 2D (grayscale) or 3D (RGB/RGBA)")

    # Bring e.g. uint8 0..255 onto the [0, 1] scale produced by rgb2gray.
    if np.issubdtype(image.dtype, np.unsignedinteger):
        image = image / np.iinfo(image.dtype).max
    return image

# Compute Fourier- and wavelet-domain statistics of a grayscale image
def calculate_frequency_features(image):
    """Return a dict of frequency-domain features for a 2-D image.

    Fourier part: statistics of the magnitude of the shifted 2-D FFT (peak,
    energy, mean, std, skewness, kurtosis, entropy of the sum-normalised
    magnitudes, and the count of bins above half the peak).
    Wavelet part: energy and entropy of the approximation coefficients of a
    single-level Haar ``dwt2``.

    Args:
        image: 2-D grayscale array.

    Returns:
        Dict mapping feature name to value.
    """
    eps = 1e-6  # guards the divisions and logarithms against zero

    # Fourier Transform features
    magnitude = np.abs(fftshift(fft2(image)))
    flat_magnitude = magnitude.ravel()
    peak = np.max(magnitude)
    magnitude_share = magnitude / (np.sum(magnitude) + eps)

    features = {
        'peak_frequency': peak,
        'total_energy': np.sum(magnitude ** 2),
        'mean_frequency': np.mean(magnitude),
        'std_frequency': np.std(magnitude),
        'skew_frequency': skew(flat_magnitude),
        'kurtosis_frequency': kurtosis(flat_magnitude),
        'entropy_frequency': -np.sum(magnitude_share * np.log(magnitude_share + eps)),
        'bandwidth': np.sum(magnitude > (0.5 * peak)),
    }

    # Wavelet Transform features: only the approximation band is used.
    approx, (_horizontal, _vertical, _diagonal) = pywt.dwt2(image, 'haar')
    approx_share = approx / (np.sum(approx) + eps)
    features['wavelet_energy'] = np.sum(approx ** 2)
    features['wavelet_entropy'] = -np.sum(approx_share * np.log(approx_share + eps))

    return features

# Walk the dataset, compute frequency features per image and save them
def extract_and_save_frequency_features(dataset_path, output_folder):
    """Extract frequency features for all images and hand them to the saver.

    Sub-folders of *dataset_path* whose lowercased name is a key of the
    module-level ``label_mapping`` are scanned for png/jpg/jpeg files. When at
    least one image was processed, the rows are passed to
    ``save_features_to_csv`` with the feature type "Frequency Features".

    Args:
        dataset_path: Root folder containing one sub-folder per class.
        output_folder: Folder forwarded to ``save_features_to_csv``.
    """
    image_suffixes = ('png', 'jpg', 'jpeg')
    rows = []

    for label in os.listdir(dataset_path):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        label_mapped = label_mapping.get(label.lower(), -1)
        if label_mapped == -1:
            print(f"Skipping unexpected label: {label}")
            continue

        for file_name in os.listdir(label_path):
            if not file_name.lower().endswith(image_suffixes):
                continue
            file_path = os.path.join(label_path, file_name)

            try:
                row = calculate_frequency_features(preprocess_image(imread(file_path)))
                row['label'] = label_mapped
                rows.append(row)
            except Exception as e:
                # Best effort: report the failing file and carry on.
                print(f"Error processing {file_name}: {e}")

    if rows:
        save_features_to_csv(rows, output_folder, "Frequency Features")
    else:
        print("No features extracted.")

# Function to save features to CSV files
def save_features_to_csv(data, output_folder, feature_type):
    """Split the feature rows, standardise them and write train/val/test CSVs.

    The rows are split 70/15/15 (stratified on ``label``) *before* scaling.
    The ``StandardScaler`` is fitted on the training split only and then
    applied to all three splits, so validation and test statistics do not
    leak into the scaling. Feature columns are selected by name rather than
    by assuming ``label`` is the last column.

    Args:
        data: List of dicts, each holding the feature values and a ``label``.
        output_folder: Parent folder; files go into a sub-folder named after
            *feature_type* with spaces replaced by underscores.
        feature_type: Human-readable feature-set name.
    """
    features_df = pd.DataFrame(data)
    feature_columns = [col for col in features_df.columns if col != 'label']

    # Split data into train, validation, and test sets
    train_raw, temp_raw = train_test_split(
        features_df, train_size=0.7, stratify=features_df['label'], random_state=1)
    val_raw, test_raw = train_test_split(
        temp_raw, train_size=0.5, stratify=temp_raw['label'], random_state=1)

    # Fit on the training rows only; reuse the same transform everywhere.
    scaler = StandardScaler().fit(train_raw[feature_columns])

    def _scaled(split_df):
        # Keep the split's index so the label column aligns row by row.
        scaled = pd.DataFrame(scaler.transform(split_df[feature_columns]),
                              columns=feature_columns, index=split_df.index)
        scaled['label'] = split_df['label']
        return scaled

    train_df, val_df, test_df = _scaled(train_raw), _scaled(val_raw), _scaled(test_raw)

    # Save datasets
    feature_folder = os.path.join(output_folder, feature_type.replace(" ", "_"))
    os.makedirs(feature_folder, exist_ok=True)
    train_df.to_csv(os.path.join(feature_folder, "train.csv"), index=False)
    val_df.to_csv(os.path.join(feature_folder, "val.csv"), index=False)
    test_df.to_csv(os.path.join(feature_folder, "test.csv"), index=False)

    print(f"Features saved to {feature_folder}")
    print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Main execution
# NOTE: save_features_to_csv() appends its own "Frequency_Features" sub-folder
# (feature_type with spaces replaced), so the CSVs end up in
# <output_folder_base>/Frequency_Features/Frequency_Features.
output_folder = os.path.join(output_folder_base, "Frequency_Features")
extract_and_save_frequency_features(dataset_path, output_folder)


Classification of frequency domain features using decision tree

In [None]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Paths to the combined feature files.
# The extraction cell writes its CSVs to
# <FrequencyFeatures>/Frequency_Features/Frequency_Features (one level from its
# output_folder, one more added by save_features_to_csv), so the files must be
# read from there rather than from the base folder.
combined_features_path = os.path.join(
    r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\FrequencyFeatures",
    "Frequency_Features", "Frequency_Features")

# Load the train, val, and test CSV files
train_df = pd.read_csv(os.path.join(combined_features_path, "train.csv"))
val_df = pd.read_csv(os.path.join(combined_features_path, "val.csv"))
test_df = pd.read_csv(os.path.join(combined_features_path, "test.csv"))

# Train and validation rows are pooled: GridSearchCV performs its own 5-fold
# cross-validation, and the test split is only used for the final evaluation.
combined_train_val_df = pd.concat([train_df, val_df], ignore_index=True)

# Split the features (X) and labels (y)
X_train_val = combined_train_val_df.drop('label', axis=1)
y_train_val = combined_train_val_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# ---- Grid Search for Hyperparameter Optimization ----
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

dt_classifier = DecisionTreeClassifier(random_state=1)

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_val, y_train_val)

best_params = grid_search.best_params_
print("Optimized Hyperparameters:")
print(best_params)

# Best configuration found by the search.
best_dt_classifier = grid_search.best_estimator_

# ---- Save Final Features ----
final_train_val_path = os.path.join(combined_features_path, "final_train_val.csv")
final_test_path = os.path.join(combined_features_path, "final_test.csv")

# Persist the exact tables used for fitting and for the final evaluation.
combined_train_val_df.to_csv(final_train_val_path, index=False)
test_df.to_csv(final_test_path, index=False)

print(f"Final train-validation features saved to: {final_train_val_path}")
print(f"Final test features saved to: {final_test_path}")


# ---- Evaluate the Optimized Decision Tree ----
def _evaluate_split(model, split_name, X, y_true, cmap):
    """Print and plot the confusion matrix, report and accuracy for one split.

    The class list comes from ``model.classes_`` and is used both for the
    confusion-matrix layout and for the heatmap tick labels, so the plot
    always has exactly one row/column per class the model knows.

    Returns:
        ``(y_pred, confusion_matrix, accuracy)`` for the split.
    """
    class_ids = list(model.classes_)
    class_names = [f'Class {c}' for c in class_ids]

    y_pred = model.predict(X)

    # Pin the label order so the matrix shape matches class_names even when a
    # class is absent from this split.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    print(f"\n{split_name} Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{split_name} Confusion Matrix')
    plt.show()

    print(f"\n{split_name} Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))

    accuracy = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy: {accuracy:.4f}")
    return y_pred, cm, accuracy


# Training Evaluation (on the pooled train+val rows the model was fitted on)
y_train_pred, train_cm, train_accuracy = _evaluate_split(
    best_dt_classifier, "Training", X_train_val, y_train_val, 'Greens')

# Testing Evaluation (held-out rows)
y_test_pred, test_cm, test_accuracy = _evaluate_split(
    best_dt_classifier, "Testing", X_test, y_test, 'Blues')


Local Binary Pattern (LBP) Features Extraction

In [None]:
import os
import numpy as np
from skimage.feature import local_binary_pattern
from skimage.io import imread
from skimage.color import rgb2gray
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Input dataset directory and output directory for the LBP feature CSVs
dataset_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\dataset"
output_folder = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\LBP_Features"

# Lower-cased class folder name -> numeric label
label_mapping = dict(benign=0, malignant=1)

# Function to calculate LBP features
def calculate_lbp_features(image, n_points=8, radius=1):
    """Return the normalised uniform-LBP histogram of a 2D grayscale image.

    Args:
        image: 2D grayscale image array.
        n_points: Number of neighbour points sampled on the circle (LBP "P").
        radius: Radius of the sampling circle (LBP "R").

    Returns:
        dict mapping 'lbp_0' ... 'lbp_{n_points + 1}' to the relative
        frequency of each LBP code in the image.
    """
    lbp = local_binary_pattern(image, P=n_points, R=radius, method='uniform')
    # The 'uniform' method produces codes 0 .. n_points + 1, so use one
    # unit-wide bin per code. Explicit bin edges make a `range` argument
    # redundant (NumPy ignores it when edges are given).
    bin_edges = np.arange(0, n_points + 3)
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=bin_edges)
    # Normalise to relative frequencies; the epsilon guards against division
    # by zero for an empty histogram.
    lbp_hist = lbp_hist.astype(np.float32)
    lbp_hist /= (lbp_hist.sum() + 1e-6)
    return {f'lbp_{i}': value for i, value in enumerate(lbp_hist)}

# Function to extract and save LBP features
def extract_and_save_lbp_features(dataset_path, output_folder):
    """Extract LBP features for every labelled image and save them as CSV splits.

    Each sub-directory of ``dataset_path`` is treated as a class folder whose
    lower-cased name is looked up in ``label_mapping``; unknown folders are
    skipped. Images that fail to load or have an unsupported shape are
    reported and skipped (best effort).

    Args:
        dataset_path: Directory containing one sub-directory per class.
        output_folder: Directory passed on to ``save_features_to_csv``.

    Returns:
        None. Prints a message and returns early when no features were
        extracted.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sort so the row order,
    # and therefore the seeded train/val/test split, is reproducible.
    for label in sorted(os.listdir(dataset_path)):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        # Map label to corresponding numeric value
        label_mapped = label_mapping.get(label.lower(), -1)
        if label_mapped == -1:
            print(f"Skipping unexpected label: {label}")
            continue

        for file_name in sorted(os.listdir(label_path)):
            # Match real extensions (with the dot) and ignore non-image files
            if not file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            file_path = os.path.join(label_path, file_name)

            try:
                image = imread(file_path)

                # Only inspect the channel axis of 3D images; a 2D grayscale
                # image that happens to be 3 or 4 pixels wide must not be
                # mistaken for RGB/RGBA.
                if image.ndim == 3:
                    if image.shape[-1] == 4:
                        image = image[:, :, :3]  # Drop the alpha channel
                    if image.shape[-1] == 3:
                        image = rgb2gray(image)

                # Anything that is still not 2D is unsupported
                if image.ndim != 2:
                    raise ValueError(f"Invalid image shape {image.shape} for file: {file_name}")

                features = calculate_lbp_features(image)
                features['label'] = label_mapped
                data.append(features)
            except Exception as e:
                # Deliberate best effort: report the file and keep going
                print(f"Error processing {file_name}: {e}")

    if not data:
        print("No features extracted.")
        return

    # Save features to CSV
    save_features_to_csv(data, output_folder)

# Function to save features to CSV
def save_features_to_csv(data, output_folder):
    """Split the features 70/15/15, standardise them and write CSV files.

    The rows are split (stratified by label, fixed seed) into train,
    validation and test sets. The scaler is fitted on the training split only
    and then applied to all three splits, so no validation/test statistics
    leak into the training features.

    Args:
        data: List of dicts, one per sample, holding the feature values and a
            'label' entry.
        output_folder: Directory (created if missing) that receives
            ``train.csv``, ``val.csv`` and ``test.csv``.
    """
    features_df = pd.DataFrame(data)
    # Select feature columns by name so the code does not rely on 'label'
    # being the last column.
    feature_columns = [col for col in features_df.columns if col != 'label']

    # Split into train (70 %), validation (15 %) and test (15 %) sets
    train_raw, temp_raw = train_test_split(
        features_df, train_size=0.7, shuffle=True, random_state=1,
        stratify=features_df['label'])
    val_raw, test_raw = train_test_split(
        temp_raw, train_size=0.5, shuffle=True, random_state=1,
        stratify=temp_raw['label'])

    # Fit the scaler on the training split only to avoid data leakage
    scaler = StandardScaler()
    scaler.fit(train_raw[feature_columns])

    def _standardise(split_df):
        """Return split_df with scaled feature columns followed by 'label'."""
        scaled = pd.DataFrame(
            scaler.transform(split_df[feature_columns]),
            columns=feature_columns,
            index=split_df.index,
        )
        scaled['label'] = split_df['label']
        return scaled

    train_df = _standardise(train_raw)
    val_df = _standardise(val_raw)
    test_df = _standardise(test_raw)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Save CSV files directly to the specified folder
    train_df.to_csv(os.path.join(output_folder, "train.csv"), index=False)
    val_df.to_csv(os.path.join(output_folder, "val.csv"), index=False)
    test_df.to_csv(os.path.join(output_folder, "test.csv"), index=False)

    print(f"Features saved to {output_folder}")
    print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Run the LBP pipeline: extract features from the dataset and write the CSV splits
extract_and_save_lbp_features(dataset_path=dataset_path, output_folder=output_folder)


Classification of LBP features using a decision tree

In [None]:

import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Directory holding the LBP feature splits
combined_features_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\LBP_Features"

# Read the train, val, and test CSV files
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(combined_features_path, split_file))
    for split_file in ("train.csv", "val.csv", "test.csv")
)

# Merge train and validation rows into a single training set
combined_train_val_df = pd.concat((train_df, val_df), ignore_index=True)

# Separate features (X) from labels (y) for the merged training set
X_train_val = combined_train_val_df.drop(columns='label')
y_train_val = combined_train_val_df['label']

# Separate features (X) from labels (y) for the held-out test set
X_test = test_df.drop(columns='label')
y_test = test_df['label']

# ---- Grid Search for Hyperparameter Optimization ----
# Candidate values for each decision-tree hyperparameter
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Base estimator with a fixed seed
dt_classifier = DecisionTreeClassifier(random_state=1)

# 5-fold cross-validated search over the whole grid, scored by accuracy
grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_val, y_train_val)

# Report the best hyperparameter combination
best_params = grid_search.best_params_
print("Optimized Hyperparameters:", best_params, sep="\n")

# Keep the estimator found by the search for evaluation
best_dt_classifier = grid_search.best_estimator_

# ---- Evaluate the Optimized Decision Tree ----
# Use the classes the fitted tree was trained on for both the confusion-matrix
# layout and the heatmap tick labels, so the plots always match the data
# instead of assuming a fixed number of classes.
class_labels = list(best_dt_classifier.classes_)
tick_labels = [f'Class {label}' for label in class_labels]

# Training Evaluation
y_train_pred = best_dt_classifier.predict(X_train_val)

# Training Confusion Matrix (rows = actual, columns = predicted)
train_cm = confusion_matrix(y_train_val, y_train_pred, labels=class_labels)
print("\nTraining Confusion Matrix:")
print(train_cm)

# Plot Training Confusion Matrix
plt.figure(figsize=(6, 5))
sns.heatmap(train_cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Training Confusion Matrix')
plt.show()

# Training Classification Report
print("\nTraining Classification Report:")
print(classification_report(y_train_val, y_train_pred, digits=4))

# Training Accuracy
train_accuracy = accuracy_score(y_train_val, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Testing Evaluation
y_test_pred = best_dt_classifier.predict(X_test)

# Testing Confusion Matrix (same class order as the training matrix)
test_cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
print("\nTesting Confusion Matrix:")
print(test_cm)

# Plot Testing Confusion Matrix
plt.figure(figsize=(6, 5))
sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Testing Confusion Matrix')
plt.show()

# Testing Classification Report
print("\nTesting Classification Report:")
print(classification_report(y_test, y_test_pred, digits=4))

# Testing Accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Testing Accuracy: {test_accuracy:.4f}")


Gabor Features Extraction

In [None]:
import os
import numpy as np
from skimage.filters import gabor
from skimage.io import imread
from skimage.color import rgb2gray
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Input dataset directory and output directory for the Gabor feature CSVs
dataset_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\dataset"
output_folder = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\GaborFeatures"

# Lower-cased class folder name -> numeric label
label_mapping = dict(benign=0, malignant=1)

# Function to preprocess the image
def preprocess_image(image):
    """Convert an image to grayscale on a consistent floating-point scale.

    3D images (RGB or RGBA) are converted with ``rgb2gray`` after dropping any
    alpha channel. Integer-typed 2D images are rescaled to floats by dividing
    by the maximum value of their dtype, so that images stored as grayscale
    and images stored as RGB end up on a comparable intensity scale before
    filtering. Other inputs are returned unchanged.

    Args:
        image: Image array as returned by ``imread``.

    Returns:
        The grayscale image array.
    """
    if len(image.shape) == 3:  # RGB or RGBA
        if image.shape[2] == 4:
            image = image[:, :, :3]  # Discard the alpha channel
        image = rgb2gray(image)
    elif np.issubdtype(image.dtype, np.integer):
        # Already single-channel but still on the raw integer scale
        # (e.g. 0-255 for uint8); bring it to floats in [0, 1].
        image = image.astype(np.float64) / np.iinfo(image.dtype).max
    return image

# Function to calculate Gabor features
def calculate_gabor_features(image):
    """Return mean/std of the real Gabor response for a fixed filter bank.

    The bank combines three frequencies (0.1, 0.3, 0.5) with four orientations
    (0, pi/4, pi/2, 3*pi/4). For every combination the result holds two
    entries, ``gabor_<freq>_<angle>_mean`` and ``gabor_<freq>_<angle>_std``,
    with both numbers formatted to two decimals in the key.
    """
    filter_bank = [
        (frequency, theta)
        for frequency in (0.1, 0.3, 0.5)
        for theta in (0, np.pi/4, np.pi/2, 3*np.pi/4)
    ]

    stats = {}
    for frequency, theta in filter_bank:
        # Only the real part of the filter response is used
        real_response, _ = gabor(image, frequency=frequency, theta=theta)
        prefix = f'gabor_{frequency:.2f}_{theta:.2f}'
        stats[f'{prefix}_mean'] = np.mean(real_response)
        stats[f'{prefix}_std'] = np.std(real_response)

    return stats

# Function to extract and save Gabor features
def extract_and_save_gabor_features(dataset_path, output_folder):
    """Extract Gabor features for every labelled image and save them as CSV splits.

    Each sub-directory of ``dataset_path`` is treated as a class folder whose
    lower-cased name is looked up in ``label_mapping``; unknown folders are
    skipped. Files that cannot be read or processed are reported and skipped
    (best effort).

    Args:
        dataset_path: Directory containing one sub-directory per class.
        output_folder: Directory passed on to ``save_features_to_csv``.

    Returns:
        None. Prints a message and returns early when no features were
        extracted.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sort so the row order,
    # and therefore the seeded train/val/test split, is reproducible.
    for label in sorted(os.listdir(dataset_path)):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        # Map label to numeric value
        label_mapped = label_mapping.get(label.lower(), -1)
        if label_mapped == -1:
            print(f"Skipping unexpected label: {label}")
            continue

        for file_name in sorted(os.listdir(label_path)):
            file_path = os.path.join(label_path, file_name)
            try:
                image = imread(file_path)
                # Convert to grayscale if RGB or RGBA
                image = preprocess_image(image)
                features = calculate_gabor_features(image)
                features['label'] = label_mapped
                data.append(features)
            except Exception as e:
                # Deliberate best effort: report the file and keep going
                print(f"Error processing {file_name}: {e}")

    if not data:
        print("No features extracted.")
        return

    # Save features to CSV
    save_features_to_csv(data, output_folder)

# Function to save features to CSV
def save_features_to_csv(data, output_folder):
    """Split the features 70/15/15, standardise them and write CSV files.

    The rows are split (stratified by label, fixed seed) into train,
    validation and test sets. The scaler is fitted on the training split only
    and then applied to all three splits, so no validation/test statistics
    leak into the training features.

    Args:
        data: List of dicts, one per sample, holding the feature values and a
            'label' entry.
        output_folder: Directory (created if missing) that receives
            ``train.csv``, ``val.csv`` and ``test.csv``.
    """
    features_df = pd.DataFrame(data)
    # Select feature columns by name so the code does not rely on 'label'
    # being the last column.
    feature_columns = [col for col in features_df.columns if col != 'label']

    # Split into train (70 %), validation (15 %) and test (15 %) sets
    train_raw, temp_raw = train_test_split(
        features_df, train_size=0.7, shuffle=True, random_state=1,
        stratify=features_df['label'])
    val_raw, test_raw = train_test_split(
        temp_raw, train_size=0.5, shuffle=True, random_state=1,
        stratify=temp_raw['label'])

    # Fit the scaler on the training split only to avoid data leakage
    scaler = StandardScaler()
    scaler.fit(train_raw[feature_columns])

    def _standardise(split_df):
        """Return split_df with scaled feature columns followed by 'label'."""
        scaled = pd.DataFrame(
            scaler.transform(split_df[feature_columns]),
            columns=feature_columns,
            index=split_df.index,
        )
        scaled['label'] = split_df['label']
        return scaled

    train_df = _standardise(train_raw)
    val_df = _standardise(val_raw)
    test_df = _standardise(test_raw)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Save CSV files directly to the specified folder
    train_df.to_csv(os.path.join(output_folder, "train.csv"), index=False)
    val_df.to_csv(os.path.join(output_folder, "val.csv"), index=False)
    test_df.to_csv(os.path.join(output_folder, "test.csv"), index=False)

    print(f"Features saved to {output_folder}")
    print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Run the Gabor pipeline: extract features from the dataset and write the CSV splits
extract_and_save_gabor_features(dataset_path=dataset_path, output_folder=output_folder)


Classification of Gabor features using a decision tree

In [None]:


import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Directory holding the Gabor feature splits
combined_features_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\GaborFeatures"

# Read the three splits; on a missing file, report it and re-raise
try:
    train_df = pd.read_csv(os.path.join(combined_features_path, "train.csv"))
    val_df = pd.read_csv(os.path.join(combined_features_path, "val.csv"))
    test_df = pd.read_csv(os.path.join(combined_features_path, "test.csv"))
except FileNotFoundError as missing:
    print(f"Error loading files: {missing}")
    raise


# Merge train and validation rows into a single training set
combined_train_val_df = pd.concat((train_df, val_df), ignore_index=True)

# Both the merged training frame and the test frame must carry the label column
if not all('label' in frame.columns for frame in (combined_train_val_df, test_df)):
    raise ValueError("The 'label' column is missing in one of the datasets.")

# Separate features (X) from labels (y) for the merged training set
X_train_val = combined_train_val_df.drop('label', axis=1)
y_train_val = combined_train_val_df['label']

# Separate features (X) from labels (y) for the held-out test set
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# Warn (without aborting) when any feature value is missing
if any(features.isnull().values.any() for features in (X_train_val, X_test)):
    print("Warning: Missing values found in features. Please handle missing data before training.")

# ---- Grid Search for Hyperparameter Optimization ----
# Candidate values for each decision-tree hyperparameter
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Base estimator with a fixed seed
dt_classifier = DecisionTreeClassifier(random_state=1)

# 5-fold cross-validated search over the whole grid, scored by accuracy
grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_val, y_train_val)

# Report the best hyperparameter combination
best_params = grid_search.best_params_
print("Optimized Hyperparameters:", best_params, sep="\n")

# Keep the estimator found by the search for evaluation
best_dt_classifier = grid_search.best_estimator_

# Use the classes the fitted tree was trained on for both the confusion-matrix
# layout and the heatmap tick labels. This keeps the tick labels tied to the
# real class values (not an assumed 0..n-1 range) and keeps the matrix size
# in step with the labels even if a split lacks one of the classes.
class_labels = list(best_dt_classifier.classes_)
tick_labels = [f'Class {label}' for label in class_labels]

# ---- Training Evaluation ----
# Make predictions on the training data
y_train_pred = best_dt_classifier.predict(X_train_val)

# Training confusion matrix (rows = actual, columns = predicted)
train_cm = confusion_matrix(y_train_val, y_train_pred, labels=class_labels)
print("Training Confusion Matrix:")
print(train_cm)

# Plot training confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(train_cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Training Confusion Matrix')
plt.show()

# Training classification report
print("\nTraining Classification Report:")
print(classification_report(y_train_val, y_train_pred, digits=4))

# Training accuracy
train_accuracy = accuracy_score(y_train_val, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

# ---- Testing Evaluation ----
# Make predictions on the test data
y_test_pred = best_dt_classifier.predict(X_test)

# Testing confusion matrix (same class order as the training matrix)
test_cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
print("Testing Confusion Matrix:")
print(test_cm)

# Plot testing confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Testing Confusion Matrix')
plt.show()

# Testing classification report
print("\nTesting Classification Report:")
print(classification_report(y_test, y_test_pred, digits=4))

# Testing accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Testing Accuracy: {test_accuracy:.4f}")

