Calculate Statistical Features from Training, Validation, and Testing Extracted Deep Features 

In [None]:
import numpy as np
import pickle
import pandas as pd
from scipy.stats import skew, entropy, kurtosis, variation, iqr
import os

# Locations of the pickled deep features (input) and of the statistical-feature CSVs (output)
feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\extracted_features'
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'
os.makedirs(stat_feature_dir, exist_ok=True)


def _read_feature_pickle(file_name):
    """Return the object unpickled from ``file_name`` inside ``feature_dir``."""
    with open(os.path.join(feature_dir, file_name), 'rb') as handle:
        return pickle.load(handle)


def _to_class_indices(labels):
    """Collapse one-hot rows to class indices; any other label array is returned unchanged."""
    dims = labels.shape
    if len(dims) < 2 or dims[1] <= 1:
        return labels
    return np.argmax(labels, axis=1)


# Every pickle is unpacked into a (features, labels) pair
train_features, train_labels = _read_feature_pickle("train_features.pkl")
val_features, val_labels = _read_feature_pickle("val_features.pkl")
test_features, test_labels = _read_feature_pickle("test_features.pkl")

# Convert one-hot encoded labels to single class labels, if needed
train_labels = _to_class_indices(train_labels)
val_labels = _to_class_indices(val_labels)
test_labels = _to_class_indices(test_labels)

# Function to calculate signal-to-noise ratio
def signal_to_noise(f):
    """Return the mean of ``f`` divided by its standard deviation.

    A constant of 1e-6 is added to the denominator so a zero standard
    deviation does not cause a division by zero.
    """
    return np.mean(f) / (np.std(f) + 1e-6)

# Function to calculate more advanced statistical features from deep features
def calculate_statistical_features(features):
    """Describe every deep-feature vector with 26 summary statistics.

    Args:
        features: Iterable of per-sample vectors. Each vector is sliced and
            passed to NumPy/SciPy reductions (assumes 1-D numeric arrays —
            TODO confirm against the extractor).

    Returns:
        A list with one dict per input vector, in input order, mapping the
        statistic name to its value.

    NOTE(review): 'shannon_entropy', 'harmonic_mean' and 'geometric_mean'
    take a log or reciprocal of ``vector + 1e-6``; presumably the inputs are
    non-negative activations — confirm, since negative entries give NaN.
    """

    def _summarise(vector):
        # Quantities that several statistics share
        size = len(vector)
        average = np.mean(vector)
        spread = np.std(vector)
        middle = np.median(vector)

        # Lag-1 autocorrelation needs at least two points
        if size > 1:
            lag_one = np.corrcoef(vector[:-1], vector[1:])[0, 1]
        else:
            lag_one = 0

        return {
            'mean': average,
            'std_dev': spread,
            'variance': np.var(vector),
            'median': middle,
            'range': np.ptp(vector),  # max - min
            'skewness': skew(vector),
            'kurtosis': kurtosis(vector),
            'entropy': entropy(np.abs(vector) + 1e-6),  # offset keeps log(0) away
            'energy': np.sum(vector ** 2),
            'contrast': spread ** 2,  # squared standard deviation
            'mean_abs_dev': np.mean(np.abs(vector - average)),
            'min_value': np.min(vector),
            'max_value': np.max(vector),
            'iqr': iqr(vector),
            'percentile_25': np.percentile(vector, 25),
            'percentile_50': np.percentile(vector, 50),
            'percentile_75': np.percentile(vector, 75),
            'signal_to_noise': signal_to_noise(vector),
            'coef_of_var': variation(vector),
            'autocorrelation': lag_one,
            'shannon_entropy': -np.sum(vector * np.log2(vector + 1e-6)),
            'root_mean_square': np.sqrt(np.mean(vector ** 2)),
            'harmonic_mean': size / np.sum(1.0 / (vector + 1e-6)),
            'geometric_mean': np.exp(np.mean(np.log(vector + 1e-6))),
            'std_error_mean': spread / np.sqrt(size),
            'median_abs_dev': np.median(np.abs(vector - middle)),
        }

    return [_summarise(vector) for vector in features]

# Function to save features and labels as CSV
def save_statistical_features_as_csv(features, labels, set_name):
    """Write one split's statistics and labels to ``<set_name>_stat_features.csv``.

    Args:
        features: Data accepted by ``pd.DataFrame`` (here: a list of dicts).
        labels: Values stored in the ``label`` column, appended last.
        set_name: Prefix of the CSV file created inside ``stat_feature_dir``.
    """
    destination = os.path.join(stat_feature_dir, f"{set_name}_stat_features.csv")
    table = pd.DataFrame(features)
    table['label'] = labels
    table.to_csv(destination, index=False)

# Run the statistics over every split and write one CSV per split
split_inputs = {
    "train": (train_features, train_labels),
    "val": (val_features, val_labels),
    "test": (test_features, test_labels),
}
for set_name, (features, labels) in split_inputs.items():
    stats_features = calculate_statistical_features(features)
    save_statistical_features_as_csv(stats_features, labels, set_name)

print("Statistical features calculated and saved in CSV format.")

Load Statistical Features and Train & Evaluate Decision Tree Classifier

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Directory that holds the per-split statistical-feature CSVs
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'

# Read the train / validation / test tables in that order
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(stat_feature_dir, csv_name))
    for csv_name in ("train_stat_features.csv", "val_stat_features.csv", "test_stat_features.csv")
)

# Target = 'label' column; feature matrix = every other column
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# The final model is fitted on training and validation rows together
combined_features = np.concatenate([train_stat_features, val_stat_features], axis=0)
combined_labels = np.concatenate([train_labels, val_labels])

# Fixed hyperparameters for the decision tree
tree_params = dict(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    random_state=0,
)
clf = DecisionTreeClassifier(**tree_params)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print precision, recall and F1 per report row with four decimals.

    Args:
        set_name: Text placed in front of "Classification Report:".
        labels: True labels handed to ``classification_report``.
        predictions: Predicted labels handed to ``classification_report``.

    The 'accuracy' entry of the report is a single number and is printed on
    its own line; every other entry is printed with the "Class" prefix.
    """
    summary = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for row_name in summary:
        row = summary[row_name]
        if row_name != 'accuracy':
            print(f"  Class {row_name}: Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1-Score: {row['f1-score']:.4f}")
            continue
        print(f"  Accuracy: {row:.4f}")
    print()

# Performance on the rows the tree was fitted on (train + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
train_confusion = confusion_matrix(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print("Combined Training Confusion Matrix:\n", train_confusion)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Performance on the held-out test rows
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
test_confusion = confusion_matrix(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print("Test Confusion Matrix:\n", test_confusion)
print_classification_report("Test", test_labels, test_predictions)


Using Rulefit on the statistical features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Directory that holds the per-split statistical-feature CSVs
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'

# Read the train / validation / test tables in that order
train_data, val_data, test_data = (
    pd.read_csv(os.path.join(input_dir, csv_name))
    for csv_name in ("train_stat_features.csv", "val_stat_features.csv", "test_stat_features.csv")
)

# Target = 'label' column; feature matrix = every other column
train_labels = train_data['label'].values
train_features = train_data.drop(columns=['label']).values

val_labels = val_data['label'].values
val_features = val_data.drop(columns=['label']).values

test_labels = test_data['label'].values
test_features = test_data.drop(columns=['label']).values

# The final model is fitted on training and validation rows together
train_val_features = np.concatenate([train_features, val_features], axis=0)
train_val_labels = np.concatenate([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Column names without the trailing label column, used to name the rules
feature_names = list(train_data.columns[:-1])

# Build and fit RuleFit on the combined rows
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predictions are treated as continuous scores: round to the nearest integer,
# then clamp to the label range seen during fitting
test_predictions = rf.predict(test_features)
lowest_class = np.min(train_val_labels)
highest_class = np.max(train_val_labels)
test_predictions_discrete = np.clip(
    np.round(test_predictions).astype(int), lowest_class, highest_class
)

# Accuracy on the test rows
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Confusion matrix on the test rows
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Text report with four decimals
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Keep only rules with a non-zero coefficient, most important first
rules = rf.get_rules()
rules = rules.loc[rules["coef"] != 0].sort_values(by="importance", ascending=False)

print("Top Rules from RuleFit Model:")
print(rules.head())

# The rules table is written in CSV format to a .txt file
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Code for evaluating the statistical features (mutual information and feature importance approach)
Mutual Information and Feature Importance are used to directly assess how well the statistical features relate to the class labels. If many features have MI scores or feature importance values close to zero, they may not be contributing to the classification, and we might need to generate new features or revisit the feature extraction method.


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

# Directory that holds the statistical-feature CSVs
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'

# Load the training split
train_df = pd.read_csv(f"{stat_feature_dir}/train_stat_features.csv")

# Separate features and labels; names and matrix come from the same frame so
# they always line up, wherever the 'label' column sits
feature_frame = train_df.drop(columns=['label'])
train_features = feature_frame.values
train_labels = train_df['label'].values
stat_feature_names = feature_frame.columns

# Mutual Information (MI) between every feature and the class label.
# random_state is fixed so repeated runs print the same scores and ranking.
mi_scores = mutual_info_classif(
    train_features, train_labels, discrete_features=False, random_state=0
)
mi_scores_df = pd.DataFrame({'Feature': stat_feature_names, 'MI Score': mi_scores})
mi_scores_df.sort_values(by='MI Score', ascending=False, inplace=True)

# Display MI scores for each feature, highest first
print(mi_scores_df)


Drop features in which more than 50% of the values are zero

In [None]:
import os
import pandas as pd

# Source of the unfiltered CSVs and destination for the filtered ones
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'
output_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Make sure the destination exists before anything is written
os.makedirs(output_dir, exist_ok=True)

# Read the train / validation / test tables in that order
train_data, val_data, test_data = (
    pd.read_csv(os.path.join(input_dir, csv_name))
    for csv_name in ("train_stat_features.csv", "val_stat_features.csv", "test_stat_features.csv")
)

# Function to find features with more than 50% zeros, excluding the label column
def find_zero_features(df):
    """Return the columns of ``df`` that equal 0 in more than half of the rows.

    The 'label' column is never reported. Names are returned as a list in
    the order the columns appear in ``df``.
    """
    mostly_zero = []
    for column in df.columns:
        if column == 'label':
            continue
        zero_share = (df[column] == 0).mean()
        if zero_share > 0.5:
            mostly_zero.append(column)
    return mostly_zero

# Identify features with more than 50% zeros in each dataset
train_zero_features = find_zero_features(train_data)
val_zero_features = find_zero_features(val_data)
test_zero_features = find_zero_features(test_data)

# Union of features with more than 50% zeros across all datasets
# NOTE(review): validation and test rows influence which features are kept
# here — confirm this is intended rather than deciding on the training split only.
all_zero_features = set(train_zero_features) | set(val_zero_features) | set(test_zero_features)

# A set has no stable order; sort once so the printed list and the drop call
# are identical from run to run
features_to_drop = sorted(all_zero_features)

# Display features with more than 50% zeros in any dataset
print(f"Features with more than 50% zeros in any dataset: {features_to_drop}")

# Drop these features from all datasets, keeping the label column
train_filtered = train_data.drop(columns=features_to_drop)
val_filtered = val_data.drop(columns=features_to_drop)
test_filtered = test_data.drop(columns=features_to_drop)

# Save the filtered datasets
train_filtered.to_csv(os.path.join(output_dir, "filtered_train_stat_features.csv"), index=False)
val_filtered.to_csv(os.path.join(output_dir, "filtered_val_stat_features.csv"), index=False)
test_filtered.to_csv(os.path.join(output_dir, "filtered_test_stat_features.csv"), index=False)

print("Filtered datasets with selected features and labels have been saved in the 'filtered_statistical_features' directory.")


tree evaluation using filtered_statistical_features

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Directory that holds the zero-filtered statistical-feature CSVs
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Read the train / validation / test tables in that order
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(stat_feature_dir, csv_name))
    for csv_name in (
        "filtered_train_stat_features.csv",
        "filtered_val_stat_features.csv",
        "filtered_test_stat_features.csv",
    )
)

# Target = 'label' column; feature matrix = every other column
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# The final model is fitted on training and validation rows together
combined_features = np.concatenate([train_stat_features, val_stat_features], axis=0)
combined_labels = np.concatenate([train_labels, val_labels])

# Fixed hyperparameters for the decision tree
tree_params = dict(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    random_state=0,
)
clf = DecisionTreeClassifier(**tree_params)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print precision, recall and F1 per report row with four decimals.

    Args:
        set_name: Text placed in front of "Classification Report:".
        labels: True labels handed to ``classification_report``.
        predictions: Predicted labels handed to ``classification_report``.

    The 'accuracy' entry of the report is a single number and is printed on
    its own line; every other entry is printed with the "Class" prefix.
    """
    summary = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for row_name in summary:
        row = summary[row_name]
        if row_name != 'accuracy':
            print(f"  Class {row_name}: Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1-Score: {row['f1-score']:.4f}")
            continue
        print(f"  Accuracy: {row:.4f}")
    print()

# Performance on the rows the tree was fitted on (train + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
train_confusion = confusion_matrix(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print("Combined Training Confusion Matrix:\n", train_confusion)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Performance on the held-out test rows
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
test_confusion = confusion_matrix(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print("Test Confusion Matrix:\n", test_confusion)
print_classification_report("Test", test_labels, test_predictions)


Rulefit evaluation on filtered statistical features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the directory containing the zero-filtered CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Load the CSV files from input_dir. The directory defined in this cell is
# used so the cell does not depend on a variable left over from another cell.
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))

# Separate features and labels for each dataset
train_features = train_data.drop(columns=['label']).values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Feature names taken from the training frame, excluding the trailing label column
feature_names = train_data.columns[:-1].tolist()

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Clamp the rounded predictions to the label range seen during fitting
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules with a non-zero coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save the rules table (CSV format) to a text file
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features\rulefit_rules_on_filtered_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")
			


Applying a mutual-information-based approach to select the 18 best features among filtered_statistical_features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the zero-filtered statistical features are saved
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Define the output directory for the features kept by the mutual-information selection
output_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\18 best features'
os.makedirs(output_dir, exist_ok=True)

# Helper that writes the chosen feature columns plus the label column to a CSV
def apply_mic_and_save(data, selected_columns, output_name):
    """Save ``selected_columns`` of ``data`` together with its 'label' column.

    No selection is computed here; the caller supplies the column names.

    Args:
        data: DataFrame containing the selected columns and a 'label' column.
        selected_columns: Column labels to keep, in output order.
        output_name: File name of the CSV created inside ``output_dir``.
    """
    destination = os.path.join(output_dir, output_name)
    chosen_with_label = pd.concat([data[selected_columns], data['label']], axis=1)
    chosen_with_label.to_csv(destination, index=False)

# Step 1: Perform feature selection on the training set only
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels


def _seeded_mutual_info(X, y):
    """Score features by mutual information with a fixed random seed.

    random_state=0 is passed to mutual_info_classif so repeated runs rank
    the features identically and therefore write the same columns to disk.
    """
    return mutual_info_classif(X, y, random_state=0)


# Use mutual information on training data to select the top 18 features
selector = SelectKBest(score_func=_seeded_mutual_info, k=18)
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Training set restricted to the selected features
apply_mic_and_save(train_data, selected_columns, "18_training_selected_features.csv")

# Validation set restricted to the same features
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "18_validation_selected_features.csv")

# Testing set restricted to the same features
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "18_testing_selected_features.csv")

print("Feature selection completed and saved.")


Evaluating decision tree using 18 selected statistical features

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Directory that holds the CSVs restricted to the 18 selected features
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\18 best features'

# Read the train / validation / test tables in that order
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(stat_feature_dir, csv_name))
    for csv_name in (
        "18_training_selected_features.csv",
        "18_validation_selected_features.csv",
        "18_testing_selected_features.csv",
    )
)

# Target = 'label' column; feature matrix = every other column
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# The final model is fitted on training and validation rows together
combined_features = np.concatenate([train_stat_features, val_stat_features], axis=0)
combined_labels = np.concatenate([train_labels, val_labels])

# Fixed hyperparameters for the decision tree
tree_params = dict(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    random_state=0,
)
clf = DecisionTreeClassifier(**tree_params)
clf.fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print precision, recall and F1 per report row with four decimals.

    Args:
        set_name: Text placed in front of "Classification Report:".
        labels: True labels handed to ``classification_report``.
        predictions: Predicted labels handed to ``classification_report``.

    The 'accuracy' entry of the report is a single number and is printed on
    its own line; every other entry is printed with the "Class" prefix.
    """
    summary = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for row_name in summary:
        row = summary[row_name]
        if row_name != 'accuracy':
            print(f"  Class {row_name}: Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1-Score: {row['f1-score']:.4f}")
            continue
        print(f"  Accuracy: {row:.4f}")
    print()

# Performance on the rows the tree was fitted on (train + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
train_confusion = confusion_matrix(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print("Combined Training Confusion Matrix:\n", train_confusion)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Performance on the held-out test rows
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
test_confusion = confusion_matrix(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print("Test Confusion Matrix:\n", test_confusion)
print_classification_report("Test", test_labels, test_predictions)


Using Rulefit on 18 features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files with the 18 selected features
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\18 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "18_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "18_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "18_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Feature names must come from the frame loaded in this cell (train_df) so
# they match the columns of the fitted matrix one-to-one
feature_names = train_df.drop(columns=['label']).columns.tolist()

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Clamp the rounded predictions to the label range seen during fitting
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules with a non-zero coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save the rules table (CSV format) to a text file
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_18_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Selecting 15 statistical features based on mutual information

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory where the zero-filtered statistical features are saved
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Define the output directory for the features kept by the mutual-information selection
output_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\15 best features'
os.makedirs(output_dir, exist_ok=True)

# Helper that writes the chosen feature columns plus the label column to a CSV
def apply_mic_and_save(data, selected_columns, output_name):
    """Save ``selected_columns`` of ``data`` together with its 'label' column.

    No selection is computed here; the caller supplies the column names.

    Args:
        data: DataFrame containing the selected columns and a 'label' column.
        selected_columns: Column labels to keep, in output order.
        output_name: File name of the CSV created inside ``output_dir``.
    """
    destination = os.path.join(output_dir, output_name)
    chosen_with_label = pd.concat([data[selected_columns], data['label']], axis=1)
    chosen_with_label.to_csv(destination, index=False)

# Step 1: Perform feature selection on the training set only
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels


def _seeded_mutual_info(X, y):
    """Score features by mutual information with a fixed random seed.

    random_state=0 is passed to mutual_info_classif so repeated runs rank
    the features identically and therefore write the same columns to disk.
    """
    return mutual_info_classif(X, y, random_state=0)


# Use mutual information on training data to select the top 15 features
selector = SelectKBest(score_func=_seeded_mutual_info, k=15)
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Training set restricted to the selected features
apply_mic_and_save(train_data, selected_columns, "15_training_selected_features.csv")

# Validation set restricted to the same features
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "15_validation_selected_features.csv")

# Testing set restricted to the same features
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "15_testing_selected_features.csv")

print("Feature selection completed and saved.")


Decision tree evaluation using 15 features

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Folder holding the CSV files written by the 15-feature selection step
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\15 best features'

# Read the training, validation and testing tables
train_df = pd.read_csv(os.path.join(stat_feature_dir, "15_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "15_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "15_testing_selected_features.csv"))

# Split every table into its label vector and its feature matrix
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# Stack training and validation rows: the final tree is fitted on both
combined_features = np.vstack((train_stat_features, val_stat_features))
combined_labels = np.hstack((train_labels, val_labels))

# Build the Decision Tree with the specified hyperparameters and fit it in one step
# (fit() returns the estimator itself, so clf is the fitted classifier)
clf = DecisionTreeClassifier(
    random_state=0,
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
).fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print the metrics of one data split, formatted to four decimals.

    Args:
        set_name: Text placed in front of " Classification Report:" in the header.
        labels: Ground-truth class labels.
        predictions: Predicted class labels for the same samples.

    Every entry of the report dictionary other than 'accuracy' is printed with
    the prefix "Class", whatever its key is.
    """
    report_rows = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for row_name, row in report_rows.items():
        if row_name != 'accuracy':
            print(f"  Class {row_name}: Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1-Score: {row['f1-score']:.4f}")
            continue
        # The accuracy entry is a single number rather than a metrics mapping
        print(f"  Accuracy: {row:.4f}")
    print()

# Score the fitted tree on the rows it was trained on (training + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {100 * train_accuracy:.4f}%")
print(
    f"Combined Training Confusion Matrix:\n",
    confusion_matrix(combined_labels, train_predictions),
)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Score the fitted tree on the held-out test rows
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {100 * test_accuracy:.4f}%")
print(
    f"Test Confusion Matrix:\n",
    confusion_matrix(test_labels, test_predictions),
)
print_classification_report("Test", test_labels, test_predictions)


Rulefit evaluation using 15 best features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\15 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "15_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "15_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "15_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Take the feature names from the frame loaded in this cell, built the same way
# as the feature matrix. The names must line up one-to-one with the columns of
# train_val_features; names taken from any other table would make the extracted
# rules appear under the wrong features.
feature_names = train_df.drop(columns=['label']).columns.tolist()

# Initialize RuleFit model
# NOTE(review): no rfmode is passed and the continuous output is rounded to a
# class id below; presumably the labels are small integers such as 0/1 - confirm,
# and consider whether rfmode='classify' is the intended setting.
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model, keeping only terms with a non-zero
# coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save the rules as comma-separated text (the file keeps its .txt extension)
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_15_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


selecting 12 most important features among filtered features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory that holds the filtered statistical feature CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Define the output directory for the feature subsets chosen by mutual information
output_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\12 best features'
os.makedirs(output_dir, exist_ok=True)  # create the folder on first run; no error if it already exists

# Function to save the already-selected feature columns together with the label
def apply_mic_and_save(data, selected_columns, output_name, out_dir=None):
    """Write the chosen feature columns plus the 'label' column to a CSV file.

    No selection is computed here: the columns passed in are simply projected
    out of `data` and written with the label as the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing every name in `selected_columns` and a 'label' column.
    selected_columns : sequence of str
        Feature columns to keep, in the order they are written.
    output_name : str
        File name of the CSV to create.
    out_dir : str, optional
        Destination directory. When omitted, the notebook-level `output_dir`
        is used, so existing three-argument calls behave as before.

    Raises
    ------
    KeyError
        If a selected column or 'label' is missing from `data`.
    """
    # Resolve the destination lazily so the global is only needed when no directory is given
    target_dir = output_dir if out_dir is None else out_dir

    # One column lookup keeps the features first and the label last
    selected_data = data[[*selected_columns, 'label']]

    # index=False: the row index must not be written as an extra column
    selected_data.to_csv(os.path.join(target_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set only
from functools import partial  # used below to pin the seed of the scoring function

train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Rank the features by mutual information with the label and keep the top 12.
# mutual_info_classif adds a small random noise to continuous features, so an
# unseeded call can return a different subset on every run; the seed is fixed
# to keep the saved CSV files reproducible.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=0), k=12)
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Save the selected columns of the training set
apply_mic_and_save(train_data, selected_columns, "12_training_selected_features.csv")

# Validation set: reuse the columns chosen on the training set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "12_validation_selected_features.csv")

# Testing set: reuse the columns chosen on the training set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "12_testing_selected_features.csv")

print("Feature selection completed and saved.")


evaluating decision tree on 12 most important features

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Folder holding the CSV files written by the 12-feature selection step
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\12 best features'

# Read the training, validation and testing tables
train_df = pd.read_csv(os.path.join(stat_feature_dir, "12_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "12_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "12_testing_selected_features.csv"))

# Split every table into its label vector and its feature matrix
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# Stack training and validation rows: the final tree is fitted on both
combined_features = np.vstack((train_stat_features, val_stat_features))
combined_labels = np.hstack((train_labels, val_labels))

# Build the Decision Tree with the specified hyperparameters and fit it in one step
# (fit() returns the estimator itself, so clf is the fitted classifier)
clf = DecisionTreeClassifier(
    random_state=0,
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
).fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print the metrics of one data split, formatted to four decimals.

    Args:
        set_name: Text placed in front of " Classification Report:" in the header.
        labels: Ground-truth class labels.
        predictions: Predicted class labels for the same samples.

    Every entry of the report dictionary other than 'accuracy' is printed with
    the prefix "Class", whatever its key is.
    """
    report_rows = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for row_name, row in report_rows.items():
        if row_name != 'accuracy':
            print(f"  Class {row_name}: Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1-Score: {row['f1-score']:.4f}")
            continue
        # The accuracy entry is a single number rather than a metrics mapping
        print(f"  Accuracy: {row:.4f}")
    print()

# Score the fitted tree on the rows it was trained on (training + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {100 * train_accuracy:.4f}%")
print(
    f"Combined Training Confusion Matrix:\n",
    confusion_matrix(combined_labels, train_predictions),
)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Score the fitted tree on the held-out test rows
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {100 * test_accuracy:.4f}%")
print(
    f"Test Confusion Matrix:\n",
    confusion_matrix(test_labels, test_predictions),
)
print_classification_report("Test", test_labels, test_predictions)


Evaluating RuleFit on the 12 most important features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\12 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "12_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "12_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "12_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Take the feature names from the frame loaded in this cell, built the same way
# as the feature matrix. The names must line up one-to-one with the columns of
# train_val_features; names taken from any other table would make the extracted
# rules appear under the wrong features.
feature_names = train_df.drop(columns=['label']).columns.tolist()

# Initialize RuleFit model
# NOTE(review): no rfmode is passed and the continuous output is rounded to a
# class id below; presumably the labels are small integers such as 0/1 - confirm,
# and consider whether rfmode='classify' is the intended setting.
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model, keeping only terms with a non-zero
# coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save the rules as comma-separated text (the file keeps its .txt extension)
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_12_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Selecting the 9 best features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory that holds the filtered statistical feature CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Define the output directory for the feature subsets chosen by mutual information
output_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\9 best features'
os.makedirs(output_dir, exist_ok=True)  # create the folder on first run; no error if it already exists

# Function to save the already-selected feature columns together with the label
def apply_mic_and_save(data, selected_columns, output_name, out_dir=None):
    """Write the chosen feature columns plus the 'label' column to a CSV file.

    No selection is computed here: the columns passed in are simply projected
    out of `data` and written with the label as the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing every name in `selected_columns` and a 'label' column.
    selected_columns : sequence of str
        Feature columns to keep, in the order they are written.
    output_name : str
        File name of the CSV to create.
    out_dir : str, optional
        Destination directory. When omitted, the notebook-level `output_dir`
        is used, so existing three-argument calls behave as before.

    Raises
    ------
    KeyError
        If a selected column or 'label' is missing from `data`.
    """
    # Resolve the destination lazily so the global is only needed when no directory is given
    target_dir = output_dir if out_dir is None else out_dir

    # One column lookup keeps the features first and the label last
    selected_data = data[[*selected_columns, 'label']]

    # index=False: the row index must not be written as an extra column
    selected_data.to_csv(os.path.join(target_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set only
from functools import partial  # used below to pin the seed of the scoring function

train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Rank the features by mutual information with the label and keep the top 9.
# mutual_info_classif adds a small random noise to continuous features, so an
# unseeded call can return a different subset on every run; the seed is fixed
# to keep the saved CSV files reproducible.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=0), k=9)
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Save the selected columns of the training set
apply_mic_and_save(train_data, selected_columns, "9_training_selected_features.csv")

# Validation set: reuse the columns chosen on the training set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "9_validation_selected_features.csv")

# Testing set: reuse the columns chosen on the training set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "9_testing_selected_features.csv")

print("Feature selection completed and saved.")


Decision tree evaluation on 9 selected features

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Folder holding the CSV files written by the 9-feature selection step
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\9 best features'

# Read the training, validation and testing tables
train_df = pd.read_csv(os.path.join(stat_feature_dir, "9_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "9_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "9_testing_selected_features.csv"))

# Split every table into its label vector and its feature matrix
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# Stack training and validation rows: the final tree is fitted on both
combined_features = np.vstack((train_stat_features, val_stat_features))
combined_labels = np.hstack((train_labels, val_labels))

# Build the Decision Tree with the specified hyperparameters and fit it in one step
# (fit() returns the estimator itself, so clf is the fitted classifier)
clf = DecisionTreeClassifier(
    random_state=0,
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
).fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print the metrics of one data split, formatted to four decimals.

    Args:
        set_name: Text placed in front of " Classification Report:" in the header.
        labels: Ground-truth class labels.
        predictions: Predicted class labels for the same samples.

    Every entry of the report dictionary other than 'accuracy' is printed with
    the prefix "Class", whatever its key is.
    """
    report_rows = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for row_name, row in report_rows.items():
        if row_name != 'accuracy':
            print(f"  Class {row_name}: Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1-Score: {row['f1-score']:.4f}")
            continue
        # The accuracy entry is a single number rather than a metrics mapping
        print(f"  Accuracy: {row:.4f}")
    print()

# Score the fitted tree on the rows it was trained on (training + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {100 * train_accuracy:.4f}%")
print(
    f"Combined Training Confusion Matrix:\n",
    confusion_matrix(combined_labels, train_predictions),
)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Score the fitted tree on the held-out test rows
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {100 * test_accuracy:.4f}%")
print(
    f"Test Confusion Matrix:\n",
    confusion_matrix(test_labels, test_predictions),
)
print_classification_report("Test", test_labels, test_predictions)


Rulefit evaluation using 9 best features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\9 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "9_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "9_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "9_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Take the feature names from the frame loaded in this cell, built the same way
# as the feature matrix. The names must line up one-to-one with the columns of
# train_val_features; names taken from any other table would make the extracted
# rules appear under the wrong features.
feature_names = train_df.drop(columns=['label']).columns.tolist()

# Initialize RuleFit model
# NOTE(review): no rfmode is passed and the continuous output is rounded to a
# class id below; presumably the labels are small integers such as 0/1 - confirm,
# and consider whether rfmode='classify' is the intended setting.
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model, keeping only terms with a non-zero
# coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save the rules as comma-separated text (the file keeps its .txt extension)
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_9_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Selecting the 6 most important features based on mutual information

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Define the input directory that holds the filtered statistical feature CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Define the output directory for the feature subsets chosen by mutual information
output_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\6 best features'
os.makedirs(output_dir, exist_ok=True)  # create the folder on first run; no error if it already exists

# Function to save the already-selected feature columns together with the label
def apply_mic_and_save(data, selected_columns, output_name, out_dir=None):
    """Write the chosen feature columns plus the 'label' column to a CSV file.

    No selection is computed here: the columns passed in are simply projected
    out of `data` and written with the label as the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing every name in `selected_columns` and a 'label' column.
    selected_columns : sequence of str
        Feature columns to keep, in the order they are written.
    output_name : str
        File name of the CSV to create.
    out_dir : str, optional
        Destination directory. When omitted, the notebook-level `output_dir`
        is used, so existing three-argument calls behave as before.

    Raises
    ------
    KeyError
        If a selected column or 'label' is missing from `data`.
    """
    # Resolve the destination lazily so the global is only needed when no directory is given
    target_dir = output_dir if out_dir is None else out_dir

    # One column lookup keeps the features first and the label last
    selected_data = data[[*selected_columns, 'label']]

    # index=False: the row index must not be written as an extra column
    selected_data.to_csv(os.path.join(target_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set only
from functools import partial  # used below to pin the seed of the scoring function

train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels

# Rank the features by mutual information with the label and keep the top 6.
# mutual_info_classif adds a small random noise to continuous features, so an
# unseeded call can return a different subset on every run; the seed is fixed
# to keep the saved CSV files reproducible.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=0), k=6)
selector.fit(X_train, y_train)

# Get the selected feature names based on training set
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Apply the same selected features to the training, validation, and testing datasets

# Save the selected columns of the training set
apply_mic_and_save(train_data, selected_columns, "6_training_selected_features.csv")

# Validation set: reuse the columns chosen on the training set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "6_validation_selected_features.csv")

# Testing set: reuse the columns chosen on the training set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "6_testing_selected_features.csv")

print("Feature selection completed and saved.")


Evaluating decision tree on 6 important features

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Folder holding the CSV files written by the 6-feature selection step
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\6 best features'

# Read the training, validation and testing tables
train_df = pd.read_csv(os.path.join(stat_feature_dir, "6_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "6_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "6_testing_selected_features.csv"))

# Split every table into its label vector and its feature matrix
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# Stack training and validation rows: the final tree is fitted on both
combined_features = np.vstack((train_stat_features, val_stat_features))
combined_labels = np.hstack((train_labels, val_labels))

# Build the Decision Tree with the specified hyperparameters and fit it in one step
# (fit() returns the estimator itself, so clf is the fitted classifier)
clf = DecisionTreeClassifier(
    random_state=0,
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
).fit(combined_features, combined_labels)

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print the metrics of one data split, formatted to four decimals.

    Args:
        set_name: Text placed in front of " Classification Report:" in the header.
        labels: Ground-truth class labels.
        predictions: Predicted class labels for the same samples.

    Every entry of the report dictionary other than 'accuracy' is printed with
    the prefix "Class", whatever its key is.
    """
    report_rows = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for row_name, row in report_rows.items():
        if row_name != 'accuracy':
            print(f"  Class {row_name}: Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1-Score: {row['f1-score']:.4f}")
            continue
        # The accuracy entry is a single number rather than a metrics mapping
        print(f"  Accuracy: {row:.4f}")
    print()

# Score the fitted tree on the rows it was trained on (training + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {100 * train_accuracy:.4f}%")
print(
    f"Combined Training Confusion Matrix:\n",
    confusion_matrix(combined_labels, train_predictions),
)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Score the fitted tree on the held-out test rows
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {100 * test_accuracy:.4f}%")
print(
    f"Test Confusion Matrix:\n",
    confusion_matrix(test_labels, test_predictions),
)
print_classification_report("Test", test_labels, test_predictions)


Evaluating Rulefit on 6 important features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the directory containing the CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\6 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "6_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "6_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "6_testing_selected_features.csv"))

# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Build the feature names exactly as the feature matrix is built (drop 'label'),
# so the names line up with the columns wherever the label column sits
feature_names = train_df.drop(columns=['label']).columns.tolist()

# Initialize RuleFit model
# NOTE(review): no rfmode is passed and the continuous output is rounded to a
# class id below; presumably the labels are small integers such as 0/1 - confirm,
# and consider whether rfmode='classify' is the intended setting.
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Smallest and largest class id seen in training; rounded predictions are
# limited to this range
min_label = np.min(train_val_labels)
max_label = np.max(train_val_labels)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, min_label, max_label)

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Evaluate on the combined training set
train_predictions = rf.predict(train_val_features)
train_predictions_discrete = np.round(train_predictions).astype(int)
# Clip exactly as for the test predictions: a rounded value outside the label
# range would otherwise show up as an extra, non-existent class in the
# confusion matrix and the report
train_predictions_discrete = np.clip(train_predictions_discrete, min_label, max_label)
train_accuracy = accuracy_score(train_val_labels, train_predictions_discrete)

print(f"Combined Training Accuracy: {train_accuracy:.4f}")
train_conf_matrix = confusion_matrix(train_val_labels, train_predictions_discrete)
print("Combined Training Confusion Matrix:")
print(train_conf_matrix)
print("Combined Training Classification Report:")
print(classification_report(train_val_labels, train_predictions_discrete, digits=4))

# Extract rules from the RuleFit model, keeping only terms with a non-zero
# coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save the rules as comma-separated text (the file keeps its .txt extension)
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_6_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Selecting 3 most important features

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Directory from which the filtered statistical feature CSVs are read
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\filtered_statistical_features'

# Directory where the reduced feature CSVs are written. It is created up
# front (no error if it already exists) so the later to_csv calls can rely on it.
output_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\3 best features'
os.makedirs(output_dir, exist_ok=True)

# Function to save an already-selected subset of feature columns with labels
def apply_mic_and_save(data, selected_columns, output_name, out_dir=None):
    """Write the chosen feature columns plus the ``label`` column to a CSV file.

    No feature selection happens here: ``selected_columns`` must already
    contain the names of the columns to keep. The function only slices
    ``data`` and saves the result without the index.

    Args:
        data: DataFrame holding the feature columns and a ``label`` column.
        selected_columns: Iterable of feature column names to keep.
        output_name: File name of the CSV to create inside the output folder.
        out_dir: Destination folder. When omitted, the module-level
            ``output_dir`` is used, which preserves the original behaviour.

    Raises:
        KeyError: If ``label`` or any requested column is absent from ``data``.
    """
    target_dir = output_dir if out_dir is None else out_dir

    wanted = list(selected_columns) + ['label']
    missing = [name for name in wanted if name not in data.columns]
    if missing:
        # Fail with an explicit message instead of pandas' generic one; this
        # typically means a split was produced with a different feature set.
        raise KeyError(f"Columns missing from data: {missing}")

    # Features first, label last (same layout as the original concat)
    selected_data = data[wanted]

    os.makedirs(target_dir, exist_ok=True)
    selected_data.to_csv(os.path.join(target_dir, output_name), index=False)

# Step 1: Perform feature selection on the training set only, so that the
# validation and testing data do not influence which features are kept
train_data = pd.read_csv(os.path.join(input_dir, "filtered_train_stat_features.csv"))

# Separate features and labels for the training set
X_train = train_data.drop(columns=['label'])  # Features
y_train = train_data['label']  # Labels


def _seeded_mutual_info(X, y):
    """Mutual information scores with a fixed seed.

    mutual_info_classif draws random numbers internally, so without a fixed
    random_state the scores (and therefore the selected features written to
    disk for the later cells) can change from run to run.
    """
    return mutual_info_classif(X, y, random_state=0)


# Rank features by mutual information with the label and keep the top 3
selector = SelectKBest(score_func=_seeded_mutual_info, k=3)
selector.fit(X_train, y_train)

# Names of the columns kept by the selector
selected_columns = X_train.columns[selector.get_support()]
print(f"Selected features based on training set: {selected_columns}")

# Step 2: Save the same selected columns for the training, validation, and
# testing datasets (no re-selection is done on validation/testing data)

# Training set
apply_mic_and_save(train_data, selected_columns, "3_training_selected_features.csv")

# Validation set
val_data = pd.read_csv(os.path.join(input_dir, "filtered_val_stat_features.csv"))
apply_mic_and_save(val_data, selected_columns, "3_validation_selected_features.csv")

# Testing set
test_data = pd.read_csv(os.path.join(input_dir, "filtered_test_stat_features.csv"))
apply_mic_and_save(test_data, selected_columns, "3_testing_selected_features.csv")

print("Feature selection completed and saved.")


Decision tree evaluation on 3 most important features

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Folder that holds the three-feature CSV files
stat_feature_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\3 best features'

# Read the training, validation and testing splits
train_df = pd.read_csv(os.path.join(stat_feature_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "3_testing_selected_features.csv"))

# Label vectors (the 'label' column of each split)
train_labels = train_df['label'].values
val_labels = val_df['label'].values
test_labels = test_df['label'].values

# Feature matrices (every column except 'label')
train_stat_features = train_df.drop(columns=['label']).values
val_stat_features = val_df.drop(columns=['label']).values
test_stat_features = test_df.drop(columns=['label']).values

# The final model is trained on training + validation data stacked row-wise
combined_features = np.vstack((train_stat_features, val_stat_features))
combined_labels = np.hstack((train_labels, val_labels))

# Fixed hyperparameters for the decision tree
tree_params = {
    'criterion': 'entropy',
    'max_depth': 10,
    'min_samples_leaf': 4,
    'min_samples_split': 2,
    'random_state': 0,
}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(combined_features, combined_labels)

# Helper that prints precision / recall / F1 with four decimal places
def print_classification_report(set_name, labels, predictions):
    """Print a compact classification report for one data split.

    Args:
        set_name: Text placed in front of "Classification Report:".
        labels: Ground-truth labels.
        predictions: Predicted labels.

    The report is obtained from ``classification_report(..., output_dict=True)``.
    The ``accuracy`` entry is printed as a single number; every other entry
    is printed on a "Class <key>" line with its precision, recall and F1.
    """
    report_dict = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for key in report_dict:
        entry = report_dict[key]
        if key != 'accuracy':
            print(f"  Class {key}: Precision: {entry['precision']:.4f}, Recall: {entry['recall']:.4f}, F1-Score: {entry['f1-score']:.4f}")
            continue
        # 'accuracy' maps to a plain float rather than a metrics dict
        print(f"  Accuracy: {entry:.4f}")
    print()

# Score the tree on the data it was fitted on (training + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)
train_confusion = confusion_matrix(combined_labels, train_predictions)
print(f"Combined Training Accuracy: {train_accuracy * 100:.4f}%")
print("Combined Training Confusion Matrix:\n", train_confusion)
print_classification_report("Combined Training", combined_labels, train_predictions)

# Score the tree on the held-out test split
test_predictions = clf.predict(test_stat_features)
test_accuracy = accuracy_score(test_labels, test_predictions)
test_confusion = confusion_matrix(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.4f}%")
print("Test Confusion Matrix:\n", test_confusion)
print_classification_report("Test", test_labels, test_predictions)


RuleFit evaluation using the 3 most important features

In [None]:
import os
import pandas as pd
import numpy as np
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Define the directory containing the CSV files
input_dir = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\3 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(input_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "3_testing_selected_features.csv"))


# Separate features and labels for each dataset
train_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values
print(f"Training Features Shape: {train_features.shape}")

val_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values
print(f"Validation Features Shape: {val_features.shape}")

test_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values
print(f"Testing Features Shape: {test_features.shape}")

# Combine training and validation data for final training
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"Combined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# Feature names must come from the frame loaded in THIS cell (train_df) and
# match the columns of train_features exactly. The previous code read them
# from `train_data`, a variable left over from the feature-selection cell that
# holds a different (larger) set of columns and does not exist in a fresh
# kernel.
feature_names = train_df.drop(columns=['label']).columns.tolist()

# Initialize RuleFit model
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# Fit the RuleFit model to combined training and validation data
rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

# Convert continuous predictions to discrete class labels by rounding
test_predictions_discrete = np.round(test_predictions).astype(int)

# Ensure the predicted labels are within the valid range of classes
test_predictions_discrete = np.clip(test_predictions_discrete, np.min(train_val_labels), np.max(train_val_labels))

# Calculate testing accuracy
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report
report = classification_report(test_labels, test_predictions_discrete, digits=4)
print("Classification Report:")
print(report)

# Extract rules from the RuleFit model, keep only those with a non-zero
# coefficient and order them by decreasing importance
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

# Display the top rules
print("Top Rules from RuleFit Model:")
print(rules.head())

# Save rules to disk (the content is CSV-formatted even though the file
# extension is .txt)
output_file_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\rulefit_rules_on_3_statistical_features.txt'
rules.to_csv(output_file_path, index=False)
print(f"Rules have been saved to {output_file_path}.")


Feature selection using accumulated local effects (ALE)

In [None]:
# Install required libraries (if not already installed)
# pip install alibi pandas numpy scikit-learn

from alibi.explainers import ALE
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import os

# Folder with the full statistical-feature CSVs
combined_features_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'

# Read the three splits in one go
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(combined_features_path, csv_name))
    for csv_name in (
        "train_stat_features.csv",
        "val_stat_features.csv",
        "test_stat_features.csv",
    )
)

# NOTE: an alternative input that was tried is the '3 best features' folder
# (sibling of 'statistical_features') with the files
# 3_training_selected_features.csv, 3_validation_selected_features.csv and
# 3_testing_selected_features.csv.

# Training and validation rows are merged into one frame for fitting
train_val_df = pd.concat([train_df, val_df], ignore_index=True)
y_train_val = train_val_df["label"]
X_train_val = train_val_df.drop(columns=["label"])
y_test = test_df["label"]
X_test = test_df.drop(columns=["label"])

# Decision tree fitted on all available features
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train_val, y_train_val)

# Probability-returning wrapper that is handed to the ALE explainer
def predictor(X):
    """Return ``dt_model.predict_proba(X)`` for the module-level ``dt_model``."""
    class_probabilities = dt_model.predict_proba(X)
    return class_probabilities

# Build the ALE explainer and explain the training + validation data
ale = ALE(predictor, feature_names=X_train_val.columns.tolist())
ale_exp = ale.explain(X_train_val.to_numpy())  # explain() is given a NumPy array

# One importance score per feature: the variance of its ALE values
ale_variances = []
for col_idx in range(X_train_val.shape[1]):
    ale_variances.append(np.var(ale_exp.ale_values[col_idx]))

feature_importance = pd.DataFrame(
    {"feature": X_train_val.columns, "ale_variance": ale_variances}
)
feature_importance = feature_importance.sort_values(by="ale_variance", ascending=False)

# Keep the 3 features with the highest ALE variance
top_features = feature_importance["feature"].head(3)
X_train_val_selected = X_train_val[top_features]
X_test_selected = X_test[top_features]

# Fit a fresh tree on the reduced feature set
dt_model_selected = DecisionTreeClassifier(random_state=1)
dt_model_selected.fit(X_train_val_selected, y_train_val)

# Test-set accuracy of the reduced model
y_pred = dt_model_selected.predict(X_test_selected)
selected_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with selected features:", selected_accuracy)

# Show the ten highest-ranked features
print("\nTop features based on ALE variance:")
print(feature_importance.head(10))


Combined ALE variance and mutual information to ensure robust feature selection.

In [None]:
# Install required libraries (if not already installed)
# pip install alibi pandas numpy scikit-learn

from alibi.explainers import ALE
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
import os

# Folder with the full statistical-feature CSVs
combined_features_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'

# Read the three splits in one go
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(combined_features_path, csv_name))
    for csv_name in (
        "train_stat_features.csv",
        "val_stat_features.csv",
        "test_stat_features.csv",
    )
)

# Training and validation rows are merged into one frame for fitting
train_val_df = pd.concat([train_df, val_df], ignore_index=True)
y_train_val = train_val_df["label"]
X_train_val = train_val_df.drop(columns=["label"])
y_test = test_df["label"]
X_test = test_df.drop(columns=["label"])

# Standardise: the scaler is fitted on train+validation only and then
# applied unchanged to the test data; column names are kept
scaler = StandardScaler()
scaled_train_val = scaler.fit_transform(X_train_val)
X_train_val = pd.DataFrame(scaled_train_val, columns=X_train_val.columns)
scaled_test = scaler.transform(X_test)
X_test = pd.DataFrame(scaled_test, columns=X_test.columns)

# Decision tree whose predictions are explained with ALE
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train_val, y_train_val)

# Probability-returning wrapper that is handed to the ALE explainer
def predictor(X):
    """Return ``dt_model.predict_proba(X)`` for the module-level ``dt_model``."""
    class_probabilities = dt_model.predict_proba(X)
    return class_probabilities

# Apply ALE to the tree fitted on the standardised train+validation data
ale = ALE(predictor, feature_names=X_train_val.columns.tolist())
ale_exp = ale.explain(X_train_val.to_numpy())  # Ensure input is a NumPy array

# ALE-based importance: variance of each feature's ALE values
ale_variances = [np.var(feature_ale) for feature_ale in ale_exp.ale_values]
feature_importance_ale = pd.DataFrame({
    "feature": X_train_val.columns,
    "ale_variance": ale_variances
}).sort_values(by="ale_variance", ascending=False)

# Mutual information between each feature and the label.
# random_state is fixed because mutual_info_classif uses random numbers
# internally; without a seed the ranking can change between runs.
mi_scores = mutual_info_classif(X_train_val, y_train_val, random_state=1)
feature_importance_mi = pd.DataFrame({
    "feature": X_train_val.columns,
    "mutual_info": mi_scores
}).sort_values(by="mutual_info", ascending=False)

# Combine both criteria: a feature only scores high when BOTH its ALE
# variance and its mutual information are high (product of the two)
combined_scores = feature_importance_ale.merge(
    feature_importance_mi, on="feature"
).assign(combined_score=lambda df: df["ale_variance"] * df["mutual_info"])

# Rank once and reuse the ranking for selection and for the printout below
ranked_scores = combined_scores.sort_values(by="combined_score", ascending=False)

# Select the top 3 features based on the combined score
top_features = ranked_scores["feature"].iloc[:3]
X_train_val_selected = X_train_val[top_features]
X_test_selected = X_test[top_features]

# Hyperparameter tuning for Decision Tree (5-fold CV on train+validation)
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=1), param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_val_selected, y_train_val)

# Best Decision Tree model found by the grid search
best_dt_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_dt_model.predict(X_test_selected)
print("Accuracy with selected features and tuned Decision Tree:", accuracy_score(y_test, y_pred))

# Display the ten highest-ranked features based on the combined score
print("\nTop features based on ALE and Mutual Information:")
print(ranked_scores.head(10))


In [None]:
# Install required libraries (if not already installed)
# pip install alibi pandas numpy scikit-learn

from alibi.explainers import ALE
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os

# Folder with the full statistical-feature CSVs
combined_features_path = r'E:\Abroad period research\new idea implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\statistical_features'

# Read the three splits in one go
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(combined_features_path, csv_name))
    for csv_name in (
        "train_stat_features.csv",
        "val_stat_features.csv",
        "test_stat_features.csv",
    )
)

# Training and validation rows are merged into one frame for fitting
train_val_df = pd.concat([train_df, val_df], ignore_index=True)
y_train_val = train_val_df["label"]
X_train_val = train_val_df.drop(columns=["label"])
y_test = test_df["label"]
X_test = test_df.drop(columns=["label"])

# Standardise into separate *_scaled frames (the unscaled frames are kept);
# the scaler is fitted on train+validation only
scaler = StandardScaler()
scaled_train_val = scaler.fit_transform(X_train_val)
X_train_val_scaled = pd.DataFrame(scaled_train_val, columns=X_train_val.columns)
scaled_test = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(scaled_test, columns=X_test.columns)

# Decision tree whose predictions are explained with ALE
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train_val_scaled, y_train_val)

# Probability-returning wrapper that is handed to the ALE explainer
def predictor(X):
    """Return ``dt_model.predict_proba(X)`` for the module-level ``dt_model``."""
    class_probabilities = dt_model.predict_proba(X)
    return class_probabilities

# Apply ALE to the tree fitted on the standardised train+validation data
ale = ALE(predictor, feature_names=X_train_val.columns.tolist())
ale_exp = ale.explain(X_train_val_scaled.to_numpy())  # Ensure input is a NumPy array


def _mean_squared_ale_gradient(feature_ale):
    """Mean squared finite difference of one feature's ALE curve.

    The gradient is taken along axis 0 only (from one ALE grid point to the
    next). Calling np.gradient without ``axis`` on a 2-D array returns one
    array per axis, which cannot be squared directly. A curve with fewer than
    two points has no slope and scores 0.0.
    """
    feature_ale = np.asarray(feature_ale)
    if feature_ale.shape[0] < 2:
        return 0.0
    return float(np.mean(np.gradient(feature_ale, axis=0) ** 2))


# Two ALE-based scores per feature: the variance of the ALE values and the
# mean squared gradient of the ALE curve
ale_variances = [np.var(feature_ale) for feature_ale in ale_exp.ale_values]
ale_smooth_variances = [_mean_squared_ale_gradient(feature_ale) for feature_ale in ale_exp.ale_values]
feature_importance_ale = pd.DataFrame({
    "feature": X_train_val.columns,
    "ale_variance": ale_variances,
    "ale_smooth_variance": ale_smooth_variances
}).sort_values(by="ale_variance", ascending=False)

# Sum of both ALE scores
feature_importance_ale["combined_ale_score"] = feature_importance_ale["ale_variance"] + feature_importance_ale["ale_smooth_variance"]

# Mutual information between each feature and the label.
# random_state is fixed because mutual_info_classif uses random numbers
# internally; without a seed the ranking can change between runs.
mi_scores = mutual_info_classif(X_train_val_scaled, y_train_val, random_state=1)
feature_importance_mi = pd.DataFrame({
    "feature": X_train_val.columns,
    "mutual_info": mi_scores
}).sort_values(by="mutual_info", ascending=False)

# Combine ALE and Mutual Information scores (product of the two)
combined_scores = feature_importance_ale.merge(
    feature_importance_mi, on="feature"
).assign(combined_score=lambda df: df["combined_ale_score"] * df["mutual_info"])

# Rank once and reuse the ranking for selection and for the printout below
ranked_scores = combined_scores.sort_values(by="combined_score", ascending=False)

# Select the top 3 features based on the combined score
top_features = ranked_scores["feature"].iloc[:3]
X_train_val_selected = X_train_val_scaled[top_features]
X_test_selected = X_test_scaled[top_features]

# Hyperparameter tuning for Decision Tree with expanded parameter grid
# (10-fold CV on train+validation)
param_grid = {
    'max_depth': [3, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=1), param_grid=param_grid, cv=10, scoring='accuracy', verbose=1)
grid_search.fit(X_train_val_selected, y_train_val)

# Best Decision Tree model found by the grid search
best_dt_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_dt_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Accuracy with selected features and tuned Decision Tree: {accuracy:.4f}")

# Display the ten highest-ranked features based on the combined score
print("\nTop features based on ALE and Mutual Information:")
print(ranked_scores.head(10))

# Save combined feature importance for future analysis (written to the
# current working directory)
combined_scores.to_csv("feature_importance_analysis.csv", index=False)
