MobileNet model training computational cost

In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import tensorflow as tf
import psutil

# Report how much resident memory this process currently holds
def memory_usage():
    """Return the current process's resident set size (RSS) in megabytes."""
    rss_bytes = psutil.Process().memory_info().rss
    bytes_per_mb = 1024 * 1024
    return rss_bytes / bytes_per_mb

# Measure how long ago a timestamp was taken
def execution_time(start_time):
    """Return the seconds elapsed since ``start_time``, rounded to 2 decimals.

    ``start_time`` is expected to be a value previously obtained from
    ``time.time()``.
    """
    elapsed = time.time() - start_time
    return round(elapsed, 2)

# Define image directory and class labels
image_dir = r"E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\lung_colon_image_set\dataset"
# Only sub-directories count as classes: a stray file in the dataset root
# (e.g. Thumbs.db) would otherwise be treated as a class and make the inner
# os.listdir() call fail. Sorting makes the class order deterministic and
# alphabetical, which is the order Keras' flow_from_dataframe uses when it
# assigns class indices.
class_labels = sorted(
    entry
    for entry in os.listdir(image_dir)
    if os.path.isdir(os.path.join(image_dir, entry))
)

# Load dataset: one (path, label) pair per file found in each class folder.
# Files are visited in sorted order so the seeded train/test split below is
# reproducible regardless of filesystem enumeration order.
image_paths, labels = [], []
for label in class_labels:
    class_path = os.path.join(image_dir, label)
    for img in sorted(os.listdir(class_path)):
        image_paths.append(os.path.join(class_path, img))
        labels.append(label)

# Create DataFrame
image_df = pd.DataFrame({"image_path": image_paths, "label": labels})

# Ensure dataset is not empty
if image_df.empty:
    raise ValueError("Dataset is empty. Check image directory path!")


# Split dataset: hold out 20% for test, then 20% of the remainder for
# validation; both splits are stratified on the label column.
train_df, test_df = train_test_split(
    image_df,
    test_size=0.2,
    stratify=image_df['label'],
    random_state=42,
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['label'],
    random_state=42,
)

# Image Data Generators
# All three flows share the same column names, image size and batch size;
# only the test flow disables shuffling so predictions stay in dataframe order.
data_gen = ImageDataGenerator(rescale=1./255)
flow_options = dict(
    x_col='image_path',
    y_col='label',
    target_size=(224, 224),
    class_mode='categorical',
    batch_size=32,
)
train_gen = data_gen.flow_from_dataframe(train_df, **flow_options)
val_gen = data_gen.flow_from_dataframe(val_df, **flow_options)
test_gen = data_gen.flow_from_dataframe(test_df, shuffle=False, **flow_options)

# Load MobileNetV2 base model
# ImageNet weights, no classifier head; frozen so only the new head is trained.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)
base_model.trainable = False

# Add custom layers: pooled backbone output -> 128-unit ReLU -> softmax over classes
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(128, activation='relu')(x)
out = Dense(len(class_labels), activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=out)


# Display Model Parameters & FLOPs
def _count_params(weights):
    """Return the total number of scalar entries across ``weights``."""
    return np.sum([np.prod(w.shape) for w in weights])


trainable_params = _count_params(model.trainable_weights)
total_params = _count_params(model.weights)
# Hard-coded approximation, not measured from the model built above.
flops = 300 * 10**6

print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")
print(f"MobileNetV2 FLOPs (Approx): {flops:,}")

# Compile model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train model & track execution time
# The wall-clock timer and the first RSS snapshot are taken immediately before
# fit(), and the second snapshot/elapsed time immediately after, so the
# reported cost covers all 50 epochs together with the validation passes
# that fit() runs on val_gen.
start_time = time.time()
# Process RSS in MB before training (see memory_usage)
train_start_memory = memory_usage()

history = model.fit(train_gen, validation_data=val_gen, epochs=50)

# Process RSS in MB after training; the later summary reports end - start,
# which is a net change in RSS rather than a peak.
train_end_memory = memory_usage()
# Elapsed seconds since start_time, rounded to 2 decimals
train_time = execution_time(start_time)

# Evaluate model
# Ground truth and class order come from the generator itself rather than from
# class_labels: Keras assigns class indices alphabetically, while class_labels
# is in os.listdir() order, which is not guaranteed to be sorted. Using the
# generator's own mapping keeps y_true, y_pred and the printed names aligned.
eval_class_indices = test_gen.class_indices
eval_class_names = sorted(eval_class_indices, key=eval_class_indices.get)
eval_label_ids = list(range(len(eval_class_names)))

# test_gen was created with shuffle=False, so .classes is in prediction order
y_true = np.asarray(test_gen.classes)
y_pred_probs = model.predict(test_gen)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification report
# labels= pins the row order so a class absent from the predictions cannot
# shift the target names.
print("Classification Report:")
print(classification_report(y_true, y_pred, labels=eval_label_ids, target_names=eval_class_names))

# Confusion matrix (rows = actual, columns = predicted)
cm = confusion_matrix(y_true, y_pred, labels=eval_label_ids)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=eval_class_names, yticklabels=eval_class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Save model
model.save("mobilenetv2_trained_model.h5")

# Build the cost report once so the console output and the file always agree
cost_summary_lines = [
    f"Model Training Time: {train_time} seconds",
    f"Memory Used During Training: {train_end_memory - train_start_memory:.2f} MB",
    f"Total Model Parameters: {total_params:,}",
    f"Trainable Parameters: {trainable_params:,}",
    f"MobileNetV2 FLOPs (Approx): {flops:,}",
]

# Display Computational Cost Summary
print("\n===== COMPUTATIONAL COST SUMMARY =====")
for summary_line in cost_summary_lines:
    print(summary_line)

# Save computational cost to a file for paper (one entry per line)
with open("computational_cost_summary.txt", "w") as f:
    f.writelines(summary_line + "\n" for summary_line in cost_summary_lines)


Deep Feature Extraction

In [None]:
import time  # Track execution time
from tensorflow.keras.models import load_model, Model
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import os
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Start measuring total execution time
# Taken before the dataset scan and model load below, so the total printed at
# the end of this cell includes that setup work as well as feature extraction.
start_time = time.time()

# Function to print with Markdown (optional)
def printmd(string):
    """Render ``string`` as Markdown via IPython's display machinery."""
    rendered = Markdown(string)
    display(rendered)

# -----------------------------------
# Load and preprocess dataset
# -----------------------------------
image_dir = Path(r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\justchest_Unet_Segmented_Dataset')

# Fail fast when the dataset folder is missing
if not image_dir.exists():
    raise FileNotFoundError(f"The directory {image_dir} does not exist!")

# Collect every PNG below the dataset folder (recursive search)
filepaths = list(image_dir.glob('**/*.png'))

# An empty listing means the folder layout or the file extension is wrong
if not filepaths:
    raise ValueError(f"No .png files found in {image_dir}. Check the directory and files!")

print(f"Total images found: {len(filepaths)}")

# The label of an image is the name of the folder that directly contains it
labels = [png_path.parent.name for png_path in filepaths]

# Turn both lists into named Series so they become dataframe columns
filepaths = pd.Series(filepaths, name='Filepath').astype(str)
labels = pd.Series(labels, name='Label')

# Two-column dataframe, shuffled with a fixed seed and re-indexed from 0
image_df = (
    pd.concat([filepaths, labels], axis=1)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Dataframe head:\n{image_df.head()}")

# -----------------------------------
# Split data into training, validation, and test sets
# -----------------------------------
# 70% train; the remaining 30% is halved into validation and test
train_df, temp_df = train_test_split(
    image_df, train_size=0.7, shuffle=True, random_state=1
)
val_df, test_df = train_test_split(
    temp_df, train_size=0.5, shuffle=True, random_state=1
)

print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}, Test samples: {len(test_df)}")

# -----------------------------------
# Create data generators with augmentation
# -----------------------------------
def create_gen():
    """Build the train, validation and test image iterators.

    Reads the module-level ``train_df``, ``val_df`` and ``test_df``.
    The training iterator applies random augmentation and shuffles with
    ``seed=0``; the validation and test iterators only apply MobileNetV2
    preprocessing and keep dataframe order.

    Returns:
        Tuple ``(train_images, val_images, test_images)``.
    """
    preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

    augmenting_datagen = ImageDataGenerator(
        preprocessing_function=preprocess,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )
    plain_datagen = ImageDataGenerator(preprocessing_function=preprocess)

    # Settings shared by all three flows
    shared_flow_args = dict(
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        batch_size=32,
    )

    train_images = augmenting_datagen.flow_from_dataframe(
        dataframe=train_df, shuffle=True, seed=0, **shared_flow_args
    )
    val_images = plain_datagen.flow_from_dataframe(
        dataframe=val_df, shuffle=False, **shared_flow_args
    )
    test_images = plain_datagen.flow_from_dataframe(
        dataframe=test_df, shuffle=False, **shared_flow_args
    )

    return train_images, val_images, test_images

# -----------------------------------
# Load the pre-trained model
# -----------------------------------
model_path = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\Mobilenetv2_finetuned_with_CLR_and_GradientAccum.h5'

# Fail with a clear message before handing a bad path to load_model
if not os.path.exists(model_path):
    raise FileNotFoundError(f"The model file was not found at {model_path}")

loaded_model = load_model(model_path)
print("Model loaded successfully.")

# -----------------------------------
# Define the feature extraction model
# -----------------------------------
# Re-uses the loaded model's input and taps the output of its fourth-from-last
# layer. NOTE(review): which layer that is depends on the saved architecture,
# which is not visible here - confirm layers[-4] is the intended feature layer.
feature_extractor = Model(inputs=loaded_model.input, outputs=loaded_model.layers[-4].output)
print("Feature extractor model created.")

# -----------------------------------
# Directory to save extracted features
# -----------------------------------
feature_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\extracted_features_just_for_time_Calculation'
# Created up front so extract_features can write its pickle files into it
os.makedirs(feature_dir, exist_ok=True)

# -----------------------------------
# Feature extraction function
# -----------------------------------
def extract_features(data_gen, set_name):
    """Run every batch of ``data_gen`` through the feature extractor and pickle the result.

    Args:
        data_gen: Image iterator exposing ``samples`` and ``batch_size`` and
            yielding ``(images, labels)`` batches via ``next()``.
        set_name: Split name (e.g. ``"train"``); used in log messages and in
            the output file name ``<set_name>_features.pkl``.

    Side effects:
        Writes a pickled ``(features, labels)`` tuple of numpy arrays into the
        module-level ``feature_dir``.
    """
    features = []
    labels = []

    # Ceiling division: exactly enough batches to see every sample once.
    # The previous ``samples // batch_size + 1`` asked for one batch too many
    # whenever the sample count was an exact multiple of the batch size; the
    # iterator then wrapped around and a full batch was stored twice.
    steps = (data_gen.samples + data_gen.batch_size - 1) // data_gen.batch_size
    print(f"Extracting features for {set_name} set...")

    for _ in range(steps):
        batch_images, batch_labels = next(data_gen)
        batch_features = feature_extractor.predict(batch_images, verbose=0)
        features.extend(batch_features)
        labels.extend(batch_labels)

    # Convert to numpy arrays
    features = np.array(features)
    labels = np.array(labels)

    # Save features and labels
    feature_file = os.path.join(feature_dir, f"{set_name}_features.pkl")
    with open(feature_file, 'wb') as f:
        pickle.dump((features, labels), f)

    print(f"{set_name.capitalize()} features saved to {feature_file}")

# -----------------------------------
# Generate data
# -----------------------------------
train_images, val_images, test_images = create_gen()

# -----------------------------------
# Extract features for training, validation, and test sets
# -----------------------------------
for split_name, split_images in (
    ("train", train_images),
    ("val", val_images),
    ("test", test_images),
):
    extract_features(split_images, split_name)

# -----------------------------------
# Total execution time
# -----------------------------------
# Measured from the start_time taken at the top of this cell
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\n Total Execution Time: {elapsed_time:.2f} seconds")


Statistical Feature Calculation

In [None]:
import numpy as np
import pickle
import pandas as pd
from scipy.stats import skew, entropy, kurtosis, variation, iqr
import os
import time  # Added for timing

# Load extracted features
feature_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\extracted_features_just_for_time_Calculation'
stat_feature_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\statistical_features_just_for_time_Calculation'
os.makedirs(stat_feature_dir, exist_ok=True)


def _load_split(split_name):
    """Unpickle the ``(features, labels)`` tuple stored for ``split_name``."""
    with open(os.path.join(feature_dir, f"{split_name}_features.pkl"), 'rb') as handle:
        return pickle.load(handle)


def _as_class_indices(label_array):
    """Collapse one-hot rows to class indices; pass other label arrays through unchanged."""
    if len(label_array.shape) > 1 and label_array.shape[1] > 1:
        return np.argmax(label_array, axis=1)
    return label_array


train_features, train_labels = _load_split("train")
val_features, val_labels = _load_split("val")
test_features, test_labels = _load_split("test")

# Convert one-hot encoded labels to single class labels, if needed
train_labels = _as_class_indices(train_labels)
val_labels = _as_class_indices(val_labels)
test_labels = _as_class_indices(test_labels)

# Function to calculate signal-to-noise ratio
def signal_to_noise(f):
    """Return ``mean(f) / (std(f) + 1e-6)``.

    The small constant keeps the ratio finite when ``f`` has zero spread.
    """
    denominator = np.std(f) + 1e-6
    return np.mean(f) / denominator

# Function to calculate more advanced statistical features from deep features
def calculate_statistical_features(features):
    """Summarise each feature vector in ``features`` as a dict of 26 statistics.

    Args:
        features: Iterable of 1-D numeric arrays, one per sample.

    Returns:
        List of dicts (one per input vector) whose keys, in insertion order,
        become the CSV columns written by the caller.
    """
    stats_features = []
    for vec in features:
        # Quantities that several entries below share
        n_values = len(vec)
        mean_val = np.mean(vec)
        std_val = np.std(vec)
        median_val = np.median(vec)

        # Lag-1 autocorrelation needs at least two values
        if n_values > 1:
            lag1_autocorr = np.corrcoef(vec[:-1], vec[1:])[0, 1]
        else:
            lag1_autocorr = 0

        stats_features.append({
            'mean': mean_val,
            'std_dev': std_val,
            'variance': np.var(vec),
            'median': median_val,
            'range': np.ptp(vec),  # Peak-to-peak range
            'skewness': skew(vec),
            'kurtosis': kurtosis(vec),
            'entropy': entropy(np.abs(vec) + 1e-6),  # Offset avoids log(0)
            'energy': np.sum(vec ** 2),  # Sum of squared elements
            'contrast': std_val ** 2,  # Squared standard deviation
            'mean_abs_dev': np.mean(np.abs(vec - mean_val)),
            'min_value': np.min(vec),
            'max_value': np.max(vec),
            'iqr': iqr(vec),  # Interquartile range
            'percentile_25': np.percentile(vec, 25),
            'percentile_50': np.percentile(vec, 50),  # Same quantity as the median
            'percentile_75': np.percentile(vec, 75),
            'signal_to_noise': signal_to_noise(vec),
            'coef_of_var': variation(vec),  # Coefficient of variation
            'autocorrelation': lag1_autocorr,
            'shannon_entropy': -np.sum(vec * np.log2(vec + 1e-6)),
            'root_mean_square': np.sqrt(np.mean(vec ** 2)),
            'harmonic_mean': n_values / np.sum(1.0 / (vec + 1e-6)),
            'geometric_mean': np.exp(np.mean(np.log(vec + 1e-6))),
            'std_error_mean': std_val / np.sqrt(n_values),  # Standard error of the mean
            'median_abs_dev': np.median(np.abs(vec - median_val)),  # Median absolute deviation
        })
    return stats_features

# Function to save features and labels as CSV
def save_statistical_features_as_csv(features, labels, set_name):
    """Write ``features`` plus a trailing ``label`` column to ``<set_name>_stat_features.csv``.

    Args:
        features: List of per-sample statistic dicts (one CSV row each).
        labels: 1-D sequence of labels, one per row.
        set_name: Split name used as the file-name prefix.

    The file is written into the module-level ``stat_feature_dir``.
    """
    table = pd.DataFrame(features)
    table['label'] = labels
    csv_path = os.path.join(stat_feature_dir, f"{set_name}_stat_features.csv")
    table.to_csv(csv_path, index=False)

# Start timing (covers computation and CSV writing for all three splits)
start_time = time.time()

# Calculate and save statistical features for each dataset
splits_to_process = [
    ("train", train_features, train_labels),
    ("val", val_features, val_labels),
    ("test", test_features, test_labels),
]
for set_name, features, labels in splits_to_process:
    print(f"Processing {set_name} features...")
    stats_features = calculate_statistical_features(features)
    save_statistical_features_as_csv(stats_features, labels, set_name)
    print(f"{set_name} features done.")

# End timing
end_time = time.time()

# Total elapsed time, in seconds and in minutes
total_time_seconds = end_time - start_time
total_time_minutes = total_time_seconds / 60

print("\nStatistical features calculated and saved in CSV format.")
print(f"\nTotal time taken: {total_time_seconds:.2f} seconds ({total_time_minutes:.2f} minutes)")


ZFMIS Feature Selection 

In [None]:
import os
import pandas as pd
import time  # Import the time module to measure execution time

# Start measuring execution time (includes the CSV reads below)
start_time = time.time()

# Define directories
input_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\statistical_features_just_for_time_Calculation'
output_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\filtered_statistical_features_just_for_time_Calculation'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load datasets: one CSV per split, read in train/val/test order
train_data, val_data, test_data = (
    pd.read_csv(os.path.join(input_dir, f"{split}_stat_features.csv"))
    for split in ("train", "val", "test")
)

# Function to find features with more than 50% zeros, excluding the label column
def find_zero_features(df):
    """Return the non-label columns of ``df`` in which over half the values equal 0.

    Columns are returned in dataframe order; a column that is exactly 50%
    zeros is not included.
    """
    mostly_zero = []
    for col in df.columns:
        if col == 'label':
            continue
        zero_fraction = (df[col] == 0).mean()
        if zero_fraction > 0.5:
            mostly_zero.append(col)
    return mostly_zero

# Identify features with more than 50% zeros in each dataset
train_zero_features = find_zero_features(train_data)
val_zero_features = find_zero_features(val_data)
test_zero_features = find_zero_features(test_data)

# Union of features with more than 50% zeros across all datasets: a feature is
# dropped everywhere if it is mostly zero in ANY split, so all three output
# files keep the same columns.
all_zero_features = set(train_zero_features) | set(val_zero_features) | set(test_zero_features)

# Sorted so the printed list (and the drop order) is the same on every run;
# plain set iteration order varies between interpreter sessions.
dropped_features = sorted(all_zero_features)

# Display features with more than 50% zeros in any dataset
print(f"Features with more than 50% zeros in any dataset: {dropped_features}")

# Drop these features from all datasets, keeping the label column
train_filtered = train_data.drop(columns=dropped_features)
val_filtered = val_data.drop(columns=dropped_features)
test_filtered = test_data.drop(columns=dropped_features)

# Save the filtered datasets
train_filtered.to_csv(os.path.join(output_dir, "filtered_train_stat_features.csv"), index=False)
val_filtered.to_csv(os.path.join(output_dir, "filtered_val_stat_features.csv"), index=False)
test_filtered.to_csv(os.path.join(output_dir, "filtered_test_stat_features.csv"), index=False)

# Report the directory actually written to (the previous message named a
# different, hard-coded folder).
print(f"Filtered datasets with selected features and labels have been saved in: {output_dir}")

# End measuring execution time
end_time = time.time()
total_time = end_time - start_time

# Display the total execution time in seconds and minutes
print(f"\nTotal Execution Time: {total_time:.2f} seconds")
print(f"Total Execution Time: {total_time / 60:.2f} minutes")


Decision Tree Training Time on All Statistical Feature Set and
Decision Tree Testing Time on All Statistical Feature Set


In [None]:
import os
import numpy as np
import pandas as pd
import time  # Import time module for measuring execution time
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the directory containing the CSV files
stat_feature_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\statistical_features_just_for_time_Calculation'

# Load the CSV files (train, val, test - in that order)
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(stat_feature_dir, f"{split}_stat_features.csv"))
    for split in ("train", "val", "test")
)


def _split_features_labels(frame):
    """Return ``(feature_matrix, label_vector)`` as numpy arrays for one split."""
    return frame.drop(columns=['label']).values, frame['label'].values


# Separate features and labels
train_stat_features, train_labels = _split_features_labels(train_df)
val_stat_features, val_labels = _split_features_labels(val_df)
test_stat_features, test_labels = _split_features_labels(test_df)

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Initialize the Decision Tree Classifier with specified hyperparameters
tree_params = dict(
    criterion='entropy',      # Use entropy for splitting nodes
    max_depth=5,              # Limit the tree depth to 5 levels
    min_samples_leaf=4,       # Minimum samples required for a leaf node
    min_samples_split=2,      # Minimum samples required to split an internal node
    random_state=0,
)
clf = DecisionTreeClassifier(**tree_params)

# ====================== TRAINING PHASE ======================
print("Training the classifier...")

# Start training timer
# perf_counter() is the high-resolution clock intended for measuring short
# durations; a depth-5 tree can fit in less time than time.time() is able to
# resolve on some platforms. This also matches the RuleFit timing cell.
train_start_time = time.perf_counter()

# Train the classifier
clf.fit(combined_features, combined_labels)

# End training timer
train_end_time = time.perf_counter()

# Calculate total training time (fit() only; evaluation below is not included)
total_train_time = train_end_time - train_start_time

print(f"\nTotal Training Time: {total_train_time:.4f} seconds ({total_train_time / 60:.4f} minutes)")

# ====================== EVALUATE ON COMBINED TRAINING SET ======================
# Accuracy on the data the tree was fitted on (train + validation)
train_predictions = clf.predict(combined_features)
train_accuracy = accuracy_score(combined_labels, train_predictions)

print(f"\nCombined Training Accuracy: {train_accuracy * 100:.4f}%")
print(f"Combined Training Confusion Matrix:\n", confusion_matrix(combined_labels, train_predictions))

# Function to print classification report with four decimal points
def print_classification_report(set_name, labels, predictions):
    """Print precision, recall and F1 for every report entry with 4 decimals.

    Args:
        set_name: Text used in the heading line.
        labels: True labels.
        predictions: Predicted labels.

    The ``accuracy`` entry of the report is a single number and gets its own
    line; every other entry (per-class rows and the averaged rows) is printed
    as a ``Class <name>`` line.
    """
    report = classification_report(labels, predictions, output_dict=True)
    print(f"{set_name} Classification Report:")
    for label, metrics in report.items():
        if label != 'accuracy':
            print(f"  Class {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
            continue
        print(f"  Accuracy: {metrics:.4f}")
    print()

print_classification_report("Combined Training", combined_labels, train_predictions)

# ====================== TESTING PHASE ======================
print("Testing the classifier...")

# Start testing timer
# perf_counter() is used because a single predict() call on a small tree can
# finish faster than time.time() is able to resolve on some platforms, which
# would report a testing time of 0. This also matches the RuleFit timing cell.
test_start_time = time.perf_counter()

# Evaluate on the test set
test_predictions = clf.predict(test_stat_features)

# End testing timer
test_end_time = time.perf_counter()

# Calculate total testing time (predict() only; metric computation excluded)
total_test_time = test_end_time - test_start_time

test_accuracy = accuracy_score(test_labels, test_predictions)

print(f"\nTest Accuracy: {test_accuracy * 100:.4f}%")
print(f"Test Confusion Matrix:\n", confusion_matrix(test_labels, test_predictions))

print_classification_report("Test", test_labels, test_predictions)

print(f"\nTotal Testing Time: {total_test_time:.4f} seconds ({total_test_time / 60:.4f} minutes)")

# ====================== SUMMARY ======================
print("\n========== SUMMARY ==========")
print(f"Training Time   : {total_train_time:.4f} seconds ({total_train_time / 60:.4f} minutes)")
print(f"Testing Time    : {total_test_time:.4f} seconds ({total_test_time / 60:.4f} minutes)")
print("================================\n")


RuleFit Training Time All Statistical Feature Set  and
RuleFit Testing Time All Statistical Feature Set  


In [None]:
import os
import pandas as pd
import numpy as np
import time
from rulefit import RuleFit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ==========================
# Load Dataset CSVs
# ==========================
input_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\statistical_features_just_for_time_Calculation'

train_data = pd.read_csv(os.path.join(input_dir, "train_stat_features.csv"))
val_data = pd.read_csv(os.path.join(input_dir, "val_stat_features.csv"))
test_data = pd.read_csv(os.path.join(input_dir, "test_stat_features.csv"))

# ==========================
# Separate Features and Labels
# ==========================
# The training feature frame is kept so the feature names below are taken
# from exactly the columns that make up the feature matrix.
train_feature_frame = train_data.drop(columns=['label'])
train_features = train_feature_frame.values
train_labels = train_data['label'].values

val_features = val_data.drop(columns=['label']).values
val_labels = val_data['label'].values

test_features = test_data.drop(columns=['label']).values
test_labels = test_data['label'].values

# ==========================
# Combine Training + Validation Data
# ==========================
train_val_features = np.vstack([train_features, val_features])
train_val_labels = np.hstack([train_labels, val_labels])

print(f"\nCombined Training + Validation Features Shape: {train_val_features.shape}")
print(f"Combined Training + Validation Labels Shape: {train_val_labels.shape}")

# ==========================
# Define Feature Names Dynamically
# ==========================
# Derived from the same frame as the feature matrix instead of columns[:-1],
# which silently mislabels features if 'label' is ever not the last column.
feature_names = train_feature_frame.columns.tolist()

# ==========================
# Initialize RuleFit Model
# ==========================
rf = RuleFit(tree_size=3, sample_fract=0.7, max_rules=2000, random_state=42)

# ==========================
# Train the RuleFit Model & Measure Training Time
# ==========================
# Only the fit() call sits between the two perf_counter() readings, so the
# training time excludes CSV loading and model construction.
train_start_time = time.perf_counter()

rf.fit(train_val_features, train_val_labels, feature_names=feature_names)

train_end_time = time.perf_counter()
total_train_time = train_end_time - train_start_time

# ==========================
# Test the RuleFit Model & Measure Testing Time
# ==========================
# Only predict() is timed; rounding, clipping and metric computation happen
# after the second reading.
test_start_time = time.perf_counter()

# Predict on the test set (continuous values)
test_predictions = rf.predict(test_features)

test_end_time = time.perf_counter()
total_test_time = test_end_time - test_start_time

# ==========================
# Convert Predictions to Discrete Class Labels
# ==========================
# Round the continuous outputs to the nearest integer, then clamp them into
# the range of class ids seen in the training + validation labels.
lowest_class = np.min(train_val_labels)
highest_class = np.max(train_val_labels)
test_predictions_discrete = np.clip(
    np.round(test_predictions).astype(int),
    lowest_class,
    highest_class,
)

# ==========================
# Calculate Accuracy and Metrics
# ==========================
test_accuracy = accuracy_score(test_labels, test_predictions_discrete)
print(f"\nTesting Accuracy: {test_accuracy:.4f}")

conf_matrix = confusion_matrix(test_labels, test_predictions_discrete)
print("\nConfusion Matrix:")
print(conf_matrix)

report = classification_report(
    test_labels,
    test_predictions_discrete,
    digits=4,
)
print("\nClassification Report:")
print(report)

# ==========================
# Extract and Save Rules
# ==========================
# Keep only rules with a non-zero coefficient, most important first
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("importance", ascending=False)

print("\nTop Rules from RuleFit Model:")
print(rules.head())

output_file_path = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\lung_colon_image_set\26 features results\rulefit_rules_on_statistical_features_just_for_Time_Calculation.txt'
# Make sure the target folder exists first: nothing else in this cell creates
# it, and a missing folder would make to_csv fail only after the model has
# already been trained and evaluated.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
# The file has a .txt extension but its content is comma-separated
rules.to_csv(output_file_path, index=False)
print(f"\nRules have been saved to: {output_file_path}")

# ==========================
# Display Training and Testing Time
# ==========================
print("\n============================================")
print(f"Total Training Time: {total_train_time:.8f} seconds ({total_train_time / 60:.8f} minutes)")
print(f"Total Testing Time : {total_test_time:.8f} seconds ({total_test_time / 60:.8f} minutes)")
print("============================================\n")


SFMOV Time

In [None]:
import numpy as np
import os
import cv2
import time
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from scipy.stats import skew
from skimage.measure import shannon_entropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
from pathlib import Path

# --------------------- Step 1: Load the MobileNetV2 Model ---------------------
# NOTE(review): this path names a model fine-tuned on a breast-cancer
# ultrasound dataset, while image_dir below and the class names used later
# (COVID, Lung_Opacity, Normal, Viral Pneumonia) are chest X-ray data -
# confirm this is the intended checkpoint.
model_path = r"E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\Mobilenetv2_finetuned_on_breastcancer_dataset.h5"

print("Loading model...")
loaded_model = load_model(model_path)
print("Model loaded successfully.")

# Define convolutional and dense layers for feature extraction
# Both names must exist in the loaded model; get_layer() is called with them
# just below.
last_conv_layer_name = 'Conv_1_bn'
dense_layer_name = 'dense_1'

# Two sub-models sharing the loaded model's input, each exposing one
# intermediate layer's output
conv_layer_model = Model(inputs=loaded_model.input, outputs=loaded_model.get_layer(last_conv_layer_name).output)
dense_layer_model = Model(inputs=loaded_model.input, outputs=loaded_model.get_layer(dense_layer_name).output)

# --------------------- Step 2: Define dataset and output directories ---------------------
image_dir = r"E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\justchest_Unet_Segmented_Dataset"
output_dir = r"E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\eye Confusion Matrix visualization just for TC"

# Created up front so the confusion-matrix plot can be saved into it later
os.makedirs(output_dir, exist_ok=True)

# --------------------- Step 3: Load and preprocess dataset ---------------------

# Find all image paths (recursive search for PNG files)
print("Loading images from directory:", image_dir)
filepaths = list(Path(image_dir).glob('**/*.png'))

# Check if images are found.
# Raise instead of calling exit(): exit() is meant for the interactive shell
# and ends the whole session, and the feature-extraction cell already raises
# in the same situation.
if not filepaths:
    raise ValueError(f"No images found in {image_dir}. Please check the folder path.")

# Extract labels from folder names (parent folder of each file)
labels = [Path(fp).parent.name for fp in filepaths]

# Convert to pandas DataFrame
filepaths = pd.Series([str(fp) for fp in filepaths], name='Filepath')
labels = pd.Series(labels, name='Label')

# Combine into a DataFrame
image_df = pd.concat([filepaths, labels], axis=1)

# Print dataset info
print(f"Total images found: {len(image_df)}")
print(image_df['Label'].value_counts())

# Shuffle dataset.
# Seeded so the split below is reproducible: without a seed the seeded
# train_test_split calls receive a differently ordered frame on every run and
# the test set changes each time. 42 is the seed the feature-extraction cell
# uses for the same shuffle.
image_df = image_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check minimum sample size for splitting
if len(image_df) < 3:
    raise ValueError("Not enough data to split. You need at least 3 images.")

# --------------------- Step 4: Split the dataset ---------------------
# 70% train; the remaining 30% is halved into validation and test
train_df, temp_df = train_test_split(image_df, train_size=0.7, shuffle=True, random_state=1)
val_df, test_df = train_test_split(temp_df, train_size=0.5, shuffle=True, random_state=1)

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

# --------------------- Step 5: Create test data generator ---------------------
def create_test_data_generator(image_df, batch_size=32):
    """Return an unlabeled, unshuffled image iterator over ``image_df``.

    Args:
        image_df: DataFrame with a ``Filepath`` column of image paths.
        batch_size: Number of images per batch (default 32).

    Images are resized to 224x224 and pixel values are multiplied by 1/255.
    No labels are yielded (``class_mode=None``) and the dataframe order is
    kept (``shuffle=False``).
    """
    rescaler = ImageDataGenerator(rescale=1.0 / 255.0)
    flow_settings = dict(
        dataframe=image_df,
        x_col='Filepath',
        y_col=None,
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False,
    )
    return rescaler.flow_from_dataframe(**flow_settings)

# Unshuffled iterator over the test split, so predictions line up with test_df rows
test_generator = create_test_data_generator(test_df, batch_size=32)

# --------------------- Step 6: Make predictions on the test set ---------------------
# One pass over the generator: steps equals its number of batches
predictions = loaded_model.predict(test_generator, steps=len(test_generator), verbose=1)
# Index of the highest-scoring output per image
predicted_classes = np.argmax(predictions, axis=1)

# --------------------- Step 7: Create and save confusion matrix ---------------------
# Folder mapping for class indices
class_folders = {
    0: "COVID",
    1: "Lung_Opacity",
    2: "Normal",
    3: "Viral Pneumonia"
}

# Map true labels to class indices.
# The name -> position lookup is built once instead of rebuilding a list and
# searching it for every row.
class_names = list(class_folders.values())
name_to_index = {name: position for position, name in enumerate(class_names)}

# Keep failing loudly on a folder name that is not in class_folders
unknown_labels = set(test_df['Label']) - set(name_to_index)
if unknown_labels:
    raise ValueError(f"Labels not present in class_folders: {sorted(unknown_labels)}")

true_classes = test_df['Label'].map(name_to_index).values

# Compute confusion matrix.
# labels= fixes the matrix to one row/column per known class even when a class
# is missing from the test split or never predicted; otherwise the matrix
# shrinks and no longer lines up with the tick labels below.
class_ids = list(range(len(class_names)))
cm = confusion_matrix(true_classes, predicted_classes, labels=class_ids)

plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()

tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

# Row-normalise (each row sums to 1). A class with no true samples has a row
# total of 0; dividing by 1 instead leaves that row as zeros rather than NaN.
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm_normalized = cm.astype('float') / row_totals
thresh = cm_normalized.max() / 2.

# Annotate each cell; white text on dark cells, black on light ones
for i, j in np.ndindex(cm_normalized.shape):
    plt.text(j, i, f"{cm_normalized[i, j]:.2f}",
             horizontalalignment="center",
             color="white" if cm_normalized[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()

# Save the confusion matrix plot
plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
plt.close()

print("Confusion matrix saved to:", output_dir)

# --------------------- Step 8: Process and visualize a single image with timing ---------------------
def process_image_with_timing(image_path, output_name):
    """Render statistical heatmaps for one image and print the elapsed time.

    The image is loaded at 224x224, scaled by 1/255 and passed through the
    module-level ``conv_layer_model`` and ``dense_layer_model``. Three maps
    (mean, skewness-weighted, entropy-weighted) are accumulated over the
    channels of the convolutional output, min-max normalized, and blended
    using the mean, skewness and entropy of the dense output as weights.
    A five-panel figure is written to ``output_dir/output_name``.

    Args:
        image_path: Path of the image to visualize.
        output_name: File name of the saved figure inside ``output_dir``.
    """
    started_at = time.time()

    def _min_max(values):
        # Rescale to [0, 1]; the epsilon keeps the division defined for flat maps.
        return (values - values.min()) / (values.max() - values.min() + 1e-8)

    input_size = (224, 224)
    original_image = load_img(image_path, target_size=input_size)
    image_array = np.expand_dims(img_to_array(original_image) / 255.0, axis=0)

    # Extract features
    conv_features = np.squeeze(conv_layer_model.predict(image_array))
    dense_features = dense_layer_model.predict(image_array)

    # Scalar statistics of the dense output, used as blending weights below
    dense_mean = np.mean(dense_features)
    dense_skewness = skew(dense_features.ravel())
    dense_entropy = shannon_entropy(dense_features)

    # Statistical maps, accumulated channel by channel
    spatial_shape = conv_features.shape[:2]
    n_channels = conv_features.shape[-1]
    mean_map = np.zeros(spatial_shape)
    skewness_map = np.zeros(spatial_shape)
    entropy_map = np.zeros(spatial_shape)

    for channel in range(n_channels):
        feature_map = conv_features[:, :, channel]
        mean_map += feature_map / n_channels
        skewness_map += skew(feature_map.ravel()) * feature_map
        entropy_map += shannon_entropy(feature_map) * feature_map

    # Normalize maps
    mean_map = _min_max(mean_map)
    skewness_map = _min_max(skewness_map)
    entropy_map = _min_max(entropy_map)

    # Combine maps: weighted sum divided by the sum of the weights
    combined_map = (dense_mean * mean_map + dense_skewness * skewness_map + dense_entropy * entropy_map)
    combined_map /= (dense_mean + dense_skewness + dense_entropy)
    combined_map = _min_max(combined_map)

    # Plot and save: original image first, then one overlay per map
    overlays = (
        ('Mean Heatmap', mean_map),
        ('Skewness Heatmap', skewness_map),
        ('Entropy Heatmap', entropy_map),
        ('Combined Heatmap', combined_map),
    )

    _, axes = plt.subplots(1, 5, figsize=(22, 6))

    axes[0].imshow(original_image)
    axes[0].set_title('Original Image')

    for panel, (title, heatmap) in zip(axes[1:], overlays):
        panel.imshow(original_image)
        # Resize the map to the input size so it overlays the image
        panel.imshow(cv2.resize(heatmap, input_size), cmap='jet', alpha=0.5)
        panel.set_title(title)

    for panel in axes:
        panel.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, output_name))
    plt.close()

    time_taken = time.time() - started_at
    print(f"Time taken to process and visualize the image '{output_name}': {time_taken:.2f} seconds")

# --------------------- Step 9: Run visualization on one image ---------------------
# Only the first row of the test set is visualized; an empty test set is reported instead.
if len(test_df) == 0:
    print("No images found in the test set to process.")
else:
    sample_image_path = test_df['Filepath'].iloc[0]
    print(f"Processing image: {sample_image_path}")
    process_image_with_timing(sample_image_path, 'sample_image_visualization.png')

print("Visualization and processing completed successfully!")


Deep learning model Grid search optimization code 

In [None]:
# import numpy as np
# import pandas as pd
# from pathlib import Path
# import os
# import tensorflow as tf
# from tensorflow.keras import layers, Model
# from tensorflow.keras.preprocessing.image import ImageDataGenerator
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# import time  # <-- Added for timing the process

# # Load and preprocess dataset
# image_dir = Path(r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\testing on Generated Eye Dataset for Glaucoma Detection\Acrima')
# filepaths = list(image_dir.glob(r'**/*.png'))
# labels = list(map(lambda x: os.path.split(os.path.split(x)[0])[1], filepaths))

# # Convert file paths and labels to a DataFrame
# filepaths = pd.Series(filepaths, name='Filepath').astype(str)
# labels = pd.Series(labels, name='Label')
# image_df = pd.concat([filepaths, labels], axis=1).sample(frac=1).reset_index(drop=True)

# # Split data into train, validation, and test sets
# train_df, temp_df = train_test_split(image_df, train_size=0.7, shuffle=True, random_state=1)
# val_df, test_df = train_test_split(temp_df, train_size=0.5, shuffle=True, random_state=1)

# # Create data generators
# def create_data_generators(train_df, val_df, test_df, batch_size):
#     train_gen = ImageDataGenerator(
#         preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input,
#         rotation_range=20, width_shift_range=0.2, height_shift_range=0.2,
#         shear_range=0.2, zoom_range=0.2, horizontal_flip=True, fill_mode='nearest'
#     )
#     val_test_gen = ImageDataGenerator(
#         preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
#     )

#     train_images = train_gen.flow_from_dataframe(train_df, x_col='Filepath', y_col='Label',
#                                                  target_size=(224, 224), color_mode='rgb', class_mode='categorical',
#                                                  batch_size=batch_size, shuffle=True, seed=0)
#     val_images = val_test_gen.flow_from_dataframe(val_df, x_col='Filepath', y_col='Label',
#                                                   target_size=(224, 224), color_mode='rgb', class_mode='categorical',
#                                                   batch_size=batch_size, shuffle=False)
#     test_images = val_test_gen.flow_from_dataframe(test_df, x_col='Filepath', y_col='Label',
#                                                    target_size=(224, 224), color_mode='rgb', class_mode='categorical',
#                                                    batch_size=batch_size, shuffle=False)
#     return train_images, val_images, test_images

# # Model-building function
# def build_model(learning_rate=0.001):
#     pretrained_model = tf.keras.applications.MobileNetV2(input_shape=(224, 224, 3), include_top=False, pooling='avg', weights='imagenet')
#     pretrained_model.trainable = True
#     for layer in pretrained_model.layers[:-50]:  # Fine-tune last 50 layers
#         layer.trainable = False

#     model = tf.keras.Sequential([
#         pretrained_model,
#         layers.Dense(256, activation='relu'),
#         layers.BatchNormalization(),
#         layers.Dropout(0.4),
#         layers.Dense(128, activation='relu'),
#         layers.BatchNormalization(),
#         layers.Dropout(0.3),
#         layers.Dense(len(train_df['Label'].unique()), activation='softmax')
#     ])

#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#                   loss='categorical_crossentropy',
#                   metrics=['accuracy'])
#     return model

# # Hyperparameter grid
# param_grid = {
#     'learning_rate': [0.001, 0.005, 0.01, 0.02],
#     'epochs': [10, 30, 50],
#     'batch_size': [16, 32, 64, 128]
# }

# # Grid search
# best_accuracy = 0
# best_params = None

# # Generate data once outside the loop for consistent train/val/test splits
# train_images, val_images, test_images = create_data_generators(train_df, val_df, test_df, batch_size=32)

# # Start timer
# start_time = time.time()

# # Hyperparameter tuning loop
# for lr in param_grid['learning_rate']:
#     for epochs in param_grid['epochs']:
#         for batch_size in param_grid['batch_size']:
#             print(f"Training with lr={lr}, epochs={epochs}, batch_size={batch_size}")

#             # Re-create data generators for current batch_size
#             train_images, val_images, test_images = create_data_generators(train_df, val_df, test_df, batch_size=batch_size)

#             # Build and train model
#             model = build_model(learning_rate=lr)
#             history = model.fit(train_images, epochs=epochs, validation_data=val_images, verbose=0)

#             # Evaluate on validation set
#             val_loss, val_accuracy = model.evaluate(val_images, verbose=0)
#             print(f"Validation accuracy: {val_accuracy:.4f}")

#             # Track best parameters
#             if val_accuracy > best_accuracy:
#                 best_accuracy = val_accuracy
#                 best_params = {'learning_rate': lr, 'epochs': epochs, 'batch_size': batch_size}

# # End timer
# end_time = time.time()
# total_time = end_time - start_time

# # Convert time to hours, minutes, seconds
# hours, rem = divmod(total_time, 3600)
# minutes, seconds = divmod(rem, 60)

# print(f"\nGrid Search Completed!")
# print(f"Total Time: {total_time:.2f} seconds ({int(hours)}h {int(minutes)}m {int(seconds)}s)\n")
# print(f"Best Hyperparameters: {best_params}")
# print(f"Best Validation Accuracy: {best_accuracy:.4f}")

# # Final evaluation on test set
# best_model = build_model(learning_rate=best_params['learning_rate'])
# best_model.fit(train_images, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)
# test_loss, test_acc = best_model.evaluate(test_images, verbose=0)
# print(f"Test Loss: {test_loss:.5f}, Test Accuracy: {test_acc:.2f}")


Grid search for the Decision Tree (DT) model

In [None]:
import os
import numpy as np
import pandas as pd
import time  # Import the time module

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter values explored for the Decision Tree
dt_param_grid = dict(
    max_depth=[5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Directory that holds the statistical-feature CSV files
stat_feature_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\statistical_features_just_for_time_Calculation'

# Read the training table and split it into labels and feature matrix
train_df = pd.read_csv(os.path.join(stat_feature_dir, "train_stat_features.csv"))
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

# Base estimator plus an exhaustive 3-fold, accuracy-scored search over the grid
dt_clf = DecisionTreeClassifier(random_state=0)
dt_grid_search = GridSearchCV(dt_clf, dt_param_grid, scoring='accuracy', n_jobs=-1, cv=3)

# Only the fit call sits between the two timestamps
start_time = time.time()
dt_grid_search.fit(train_stat_features, train_labels)
end_time = time.time()

# Elapsed wall-clock time in seconds
total_time = end_time - start_time

# Best parameter combination and its mean cross-validation accuracy
best_dt_params = dt_grid_search.best_params_
best_dt_score = dt_grid_search.best_score_

# Report the outcome
print(f"\nBest Decision Tree Parameters: {best_dt_params}")
print(f"Best Decision Tree Cross-Validation Accuracy: {best_dt_score:.4f}")
print(f"\nTotal time taken by Grid Search: {total_time:.2f} seconds")

# Same duration expressed as whole minutes and whole seconds
minutes, seconds = (int(part) for part in divmod(total_time, 60))
print(f"Total time taken: {minutes} minute(s) and {seconds} second(s)")


Grid Search Time for RuleFit

In [None]:
import os
import time  # <-- Added to track execution time
from rulefit import RuleFit
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Directory that holds the statistical-feature CSV files
stat_feature_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Results on just chest region segmented dataset\statistical_features_just_for_time_Calculation'

# Read the train / validation / test feature tables
train_df = pd.read_csv(os.path.join(stat_feature_dir, "train_stat_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "val_stat_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "test_stat_features.csv"))

# Feature names are every column except 'label'
feature_names = train_df.columns.drop('label').tolist()

# Split each table into its label vector and feature matrix
train_labels = train_df['label'].values
train_stat_features = train_df.drop(columns=['label']).values

val_labels = val_df['label'].values
val_stat_features = val_df.drop(columns=['label']).values

test_labels = test_df['label'].values
test_stat_features = test_df.drop(columns=['label']).values

# Hyperparameter values explored for RuleFit
rulefit_param_grid = dict(
    tree_size=[3, 5, 7],
    sample_fract=[0.7, 0.8, 1.0],
    max_rules=[25, 100, 200, 500, 1000, 2000],
)

# Best configuration seen so far and its validation accuracy
best_rf_params = None
best_rf_score = 0

# Start timing the grid search
start_time = time.time()

# Every (tree_size, sample_fract, max_rules) combination, listed in the order
# three nested loops over the grid would visit them.
param_combinations = [
    (size, fraction, rule_cap)
    for size in rulefit_param_grid['tree_size']
    for fraction in rulefit_param_grid['sample_fract']
    for rule_cap in rulefit_param_grid['max_rules']
]

# Custom grid search: fit on the training set, score on the validation set
for tree_size, sample_fract, max_rules in param_combinations:
    rf = RuleFit(tree_size=tree_size, sample_fract=sample_fract, max_rules=max_rules, random_state=42)
    rf.fit(train_stat_features, train_labels, feature_names=feature_names)

    # Predictions are continuous: round to integers, then clip to the range
    # of labels observed in the training set.
    val_predictions = rf.predict(val_stat_features)
    val_predictions_discrete = np.clip(
        np.round(val_predictions).astype(int),
        np.min(train_labels),
        np.max(train_labels),
    )

    accuracy = accuracy_score(val_labels, val_predictions_discrete)
    print(f"Tree Size: {tree_size}, Sample Fract: {sample_fract}, Max Rules: {max_rules}, Validation Accuracy: {accuracy:.4f}")

    # Keep the first configuration that strictly improves on the best score
    if accuracy > best_rf_score:
        best_rf_score = accuracy
        best_rf_params = dict(tree_size=tree_size, sample_fract=sample_fract, max_rules=max_rules)

# End timing the grid search
end_time = time.time()

# Elapsed wall-clock time in seconds
total_time = end_time - start_time

# Report the winning configuration and its accuracy
print("\nBest RuleFit Parameters:", best_rf_params)
print(f"Best RuleFit Validation Accuracy: {best_rf_score:.4f}")

# Report the duration of the whole search
print(f"\nTotal time taken by Grid Search: {total_time:.2f} seconds")

# Same duration expressed as whole minutes and whole seconds
minutes, seconds = (int(part) for part in divmod(total_time, 60))
print(f"Total time taken: {minutes} minute(s) and {seconds} second(s)")


RuleFit rule extraction code

In [None]:
import os
import time  # Added for timing
import pandas as pd
import numpy as np
from rulefit import RuleFit

def simplify_rule(rule):
    """
    Simplify a rule string by removing logically redundant conditions.

    The rule is expected to be a conjunction such as
    ``"feat_a > 1.0 & feat_a > 2.5 & feat_b <= 3"``: conditions joined by
    " & ", each consisting of a feature, an operator and a numeric threshold
    separated by whitespace.

    Bounds are tracked per (feature, operator) pair, so a lower bound (">")
    and an upper bound ("<=") on the same feature are both preserved. For ">"
    the largest threshold is kept, for "<=" the smallest; for any other
    operator the first threshold seen is kept.

    Args:
        rule: The rule string to simplify.

    Returns:
        The simplified rule, with conditions grouped by feature in order of
        first appearance. The input is returned unchanged when it cannot be
        parsed (an error is printed) or when it contains no parseable
        condition at all (e.g. a bare feature name).
    """
    try:
        # feature -> {operator -> tightest threshold seen so far}
        bounds_by_feature = {}
        for cond in rule.split(" & "):
            parts = cond.split()
            if len(parts) < 3:
                # Not a "feature operator value" triple; ignore this fragment.
                continue
            feature, operator, raw_value = parts[:3]
            value = float(raw_value)

            bounds = bounds_by_feature.setdefault(feature, {})
            if operator not in bounds:
                bounds[operator] = value
            elif operator == ">" and value > bounds[operator]:
                bounds[operator] = value
            elif operator == "<=" and value < bounds[operator]:
                bounds[operator] = value
    except (AttributeError, TypeError, ValueError) as e:
        print(f"Error simplifying rule: {rule}. Error: {e}")
        return rule

    if not bounds_by_feature:
        # Nothing could be parsed: keep the original text rather than
        # collapsing the rule into an empty string.
        return rule

    return " & ".join(
        f"{feature} {operator} {value}"
        for feature, bounds in bounds_by_feature.items()
        for operator, value in bounds.items()
    )

# Start timer here!
start_time = time.time()

# Load Data
input_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\26 features results\3 best features'
train_df = pd.read_csv(os.path.join(input_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(input_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "3_testing_selected_features.csv"))

# Combine Training and Validation Data
train_val_df = pd.concat([train_df, val_df], ignore_index=True)
X_train_val = train_val_df.drop(columns=["label"]).values
y_train_val = train_val_df["label"].values
# Derive the names from the same column selection as X_train_val so they stay
# aligned with the feature matrix even if "label" is not the last column.
feature_names = train_val_df.columns.drop("label").tolist()

# Train RuleFit Model
rulefit_model = RuleFit(tree_size=4, sample_fract=0.7, max_rules=200, random_state=42)
rulefit_model.fit(X_train_val, y_train_val, feature_names=feature_names)

# Extract Rules
rules = rulefit_model.get_rules()
# Keep rules with non-zero coefficients. The explicit copy lets the "rule"
# column be reassigned below without writing into a slice of the original frame.
rules = rules[rules.coef != 0].copy()

if rules.empty:
    print("No rules were generated. Check your data or model configuration.")
else:
    # Simplify Rules
    rules["rule"] = rules["rule"].apply(simplify_rule)

    # Get Unique Classes
    unique_classes = sorted(set(y_train_val))

    # Display and Save Rules
    print("\nSimplified Top Rules in If-Then Format for All Classes:")
    if_then_rules = []

    for label in unique_classes:
        # Array-valued coefficients: the rule belongs to the class at the argmax.
        # NOTE(review): for scalar coefficients the test is just `coef > 0` and
        # does not depend on `label`, so the same rules are listed for every class.
        class_rules = rules[rules.apply(lambda x: np.argmax(x['coef']) == label if isinstance(x['coef'], np.ndarray) else x['coef'] > 0, axis=1)]
        class_rules = class_rules.sort_values(by="importance", ascending=False).head(20)  # Top N rules per class

        print(f"\nClass {label} Rules:")
        for _, row in class_rules.iterrows():
            rule_str = f"If ({row['rule']}) then Class = {label} (Importance: {row['importance']:.4f})"
            print(rule_str)
            if_then_rules.append(rule_str)

    # Save Rules to File (explicit encoding so the output does not depend on
    # the platform's default text encoding)
    output_file_path = "simplified_rulefit_top_if_then_rules_for_all_classes.txt"
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.writelines(rule + "\n" for rule in if_then_rules)
    print(f"\nSimplified If-Then rules for all classes have been saved to {output_file_path}.")

# End timer here!
end_time = time.time()
total_time_seconds = end_time - start_time
total_time_minutes = total_time_seconds / 60

print(f"\n✅ Total time taken for RuleFit training, rule extraction, simplification, and saving: {total_time_seconds:.2f} seconds ({total_time_minutes:.2f} minutes).")


Decision tree rules 

In [None]:
import os
import numpy as np
import pandas as pd
import time  # Added for measuring time
from sklearn.tree import DecisionTreeClassifier

# Define the directory containing the CSV files
stat_feature_dir = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\26 features results\3 best features'

# Load the CSV files
train_df = pd.read_csv(os.path.join(stat_feature_dir, "3_training_selected_features.csv"))
val_df = pd.read_csv(os.path.join(stat_feature_dir, "3_validation_selected_features.csv"))
test_df = pd.read_csv(os.path.join(stat_feature_dir, "3_testing_selected_features.csv"))

# Separate features and labels
train_stat_features = train_df.drop(columns=['label']).values
train_labels = train_df['label'].values

val_stat_features = val_df.drop(columns=['label']).values
val_labels = val_df['label'].values

test_stat_features = test_df.drop(columns=['label']).values
test_labels = test_df['label'].values

# Combine training and validation data for final training
combined_features = np.vstack([train_stat_features, val_stat_features])
combined_labels = np.hstack([train_labels, val_labels])

# Start timing for decision tree training and rule extraction
start_time = time.time()

# Train Decision Tree Classifier with specified hyperparameters
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
    random_state=0
)
clf.fit(combined_features, combined_labels)

# Extract rules in the form of "If ... then ..." with importance.
# The names come from the same column selection as the feature matrices above,
# so they stay aligned with the tree's feature indices even if 'label' is not
# the last column of the CSV.
feature_names = train_df.columns.drop('label').tolist()
# Filled in place by traverse_tree, one string per leaf
rules = []

def traverse_tree(tree, feature_names, feature_importances, node=0, conditions="", path_importance=0):
    """
    Walk the decision tree depth-first and append one rule string per leaf
    to the module-level ``rules`` list.

    Each rule has the form
    ``If (<conditions>) then Class = <index> (Importance: <score>)`` where the
    class is the argmax of the leaf's value row and the score is the sum of
    ``feature_importances`` over the split features on the path to the leaf.
    Left branches (``<=``) are emitted before right branches (``>``).

    Args:
        tree: Tree structure exposing ``children_left``, ``children_right``,
            ``feature``, ``threshold`` and ``value``.
        feature_names: Feature names indexed by the tree's feature indices.
        feature_importances: Importance score per feature index.
        node: Index of the node to start from.
        conditions: Condition prefix accumulated before ``node``.
        path_importance: Importance accumulated before ``node``.
    """
    # Explicit stack instead of recursion; entries are (node, prefix, importance).
    pending = [(node, conditions, path_importance)]

    while pending:
        current, prefix, importance = pending.pop()
        left_child = tree.children_left[current]
        right_child = tree.children_right[current]

        if left_child == -1 and right_child == -1:
            # Leaf: emit the rule for the path that led here.
            predicted = np.argmax(tree.value[current][0])
            rules.append(
                f"If ({prefix.rstrip(' & ')}) then Class = {predicted} (Importance: {importance:.4f})"
            )
            continue

        split_feature = tree.feature[current]
        if split_feature < 0:
            # No valid split feature: nothing to descend into.
            continue

        cutoff = tree.threshold[current]
        importance = importance + feature_importances[split_feature]
        name = feature_names[split_feature]

        # Push the right branch first so the left branch is processed first.
        pending.append((right_child, prefix + f"{name} > {cutoff:.3f} & ", importance))
        pending.append((left_child, prefix + f"{name} <= {cutoff:.3f} & ", importance))

# Walk the fitted tree from its root; a failure is printed and the script continues
try:
    traverse_tree(
        tree=clf.tree_,
        feature_names=feature_names,
        feature_importances=clf.feature_importances_,
    )
except Exception as traversal_error:
    print(f"Error during tree traversal: {traversal_error}")

# Stop the clock once rule extraction is done
end_time = time.time()

# Elapsed wall-clock time since start_time, in seconds
total_time = end_time - start_time
print(f"\nTotal time taken for Decision Tree training and rule extraction: {total_time:.4f} seconds\n")

# Simplify a single rule by merging redundant conditions
def simplify_rule(rule):
    """
    Simplify a single "If (...) then ..." rule by merging redundant conditions.

    For every feature only the most restrictive bounds are kept: the largest
    ">" threshold and the smallest "<=" threshold. Conditions using any other
    operator are dropped. Features are emitted in order of first appearance,
    with the ">" bound before the "<=" bound.

    Args:
        rule: Rule text of the form
            ``If (<feature> <op> <threshold> & ...) then <result>``.

    Returns:
        The rule with its condition list simplified and the text after
        ") then" left untouched. A rule with an empty condition list
        (``If () then ...``) is returned with an empty condition list.

    Raises:
        ValueError: If the rule has no ") then" separator, or a condition is
            not a "feature operator threshold" triple with a numeric threshold.
    """
    conditions_part, result_part = rule.split(") then", 1)
    conditions = conditions_part.replace("If (", "", 1).split(" & ")
    simplified_conditions = {}

    # Extract feature and threshold for each condition
    for condition in conditions:
        condition = condition.strip()
        if not condition:
            # Empty condition list, e.g. a tree that consists of a single leaf.
            continue
        # Split from the right so feature names may themselves contain spaces.
        feature, operator, threshold = condition.rsplit(" ", 2)
        threshold = float(threshold)

        # Simplify by keeping the most restrictive range
        bounds = simplified_conditions.setdefault(
            feature, {"<=": float('inf'), ">": float('-inf')}
        )
        if operator == "<=":
            bounds["<="] = min(bounds["<="], threshold)
        elif operator == ">":
            bounds[">"] = max(bounds[">"], threshold)

    # Reconstruct simplified conditions; infinite bounds mean "never set"
    final_conditions = []
    for feature, thresholds in simplified_conditions.items():
        if thresholds[">"] != float('-inf'):
            final_conditions.append(f"{feature} > {thresholds['>']:.3f}")
        if thresholds["<="] != float('inf'):
            final_conditions.append(f"{feature} <= {thresholds['<=']:.3f}")

    return f"If ({' & '.join(final_conditions)}) then{result_part}"

# Run every extracted rule through simplify_rule, keeping the original order
simplified_rules = list(map(simplify_rule, rules))

# Extract importance scores from simplified rules and sort them in descending order
def extract_importance(rule):
    """
    Return the importance score embedded in a rule string.

    The score is the number that follows the first "(Importance: " marker,
    up to the next ")" (or to the end of the string if no ")" follows).

    Args:
        rule: Rule text containing "(Importance: <number>)".

    Returns:
        The importance as a float.

    Raises:
        ValueError: If the marker is missing or the text after it is not a
            valid number.
    """
    _, marker, tail = rule.partition("(Importance: ")
    if not marker:
        # Without the marker there is no reliable position to parse from.
        raise ValueError(f"No importance annotation found in rule: {rule!r}")
    number_text, _, _ = tail.partition(")")
    return float(number_text)

# Order the simplified rules from highest to lowest importance (stable sort)
sorted_rules = list(simplified_rules)
sorted_rules.sort(key=extract_importance, reverse=True)

# Show the ordered rules, one per line
print("Simplified and Sorted Rules:")
for ranked_rule in sorted_rules:
    print(ranked_rule)

# Write the same rules, one per line, to the output file
output_file_path = r'E:\Abroad period research\Medical images analysis paper implementation codes\Second part of the paper\Ultrasound Breast Images for Breast Cancer\26 features results\3 best features\simplified_sorted_rules.txt'
with open(output_file_path, 'w') as f:
    f.writelines(ranked_rule + "\n" for ranked_rule in sorted_rules)

print(f"\nSimplified and sorted rules have been saved to {output_file_path}.")
