In [None]:
import os
import pandas as pd

# Folders holding the per-descriptor feature CSVs that are combined below.
# NOTE(review): this comment used to say HOG features were excluded, yet the
# HOG folder is present in the list and will be combined with the others --
# confirm which of the two is intended.
feature_folders = [
    r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features\Frequency",
    r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features\Gabor",
    r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features\HOG",
    r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features\Statistical",
    r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features\LBP_Features"
  ]

# Output folder for the concatenated features. It is the parent "Features"
# directory of the folders listed above and is created if it does not exist.
output_folder = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features"
os.makedirs(output_folder, exist_ok=True)

# Names of the files to concatenate; each name is looked up in every feature folder
file_names = ["train.csv", "val.csv", "test.csv"]

def concatenate_features(file_name, feature_folders, output_folder):
    """Fuse one split's feature CSVs column-wise and save the result.

    ``file_name`` is looked up in every folder of ``feature_folders``. The
    columns of all files that exist are placed side by side in folder order,
    and a single ``label`` column -- taken from the first file that provides
    one -- is written as the last column. The fused table is saved to
    ``output_folder/file_name`` without the index.

    Args:
        file_name: CSV file name to read from each folder (e.g. "train.csv").
        feature_folders: Iterable of folder paths containing the feature CSVs.
        output_folder: Folder the fused CSV is written to.

    Returns:
        None. If no folder contains ``file_name`` a message is printed and
        nothing is written.

    Raises:
        ValueError: If two feature files for the same split have a different
            number of rows, because their rows could then not be paired up.
    """
    feature_frames = []   # label-free feature tables, in folder order
    label = None          # label column of the first file that has one
    expected_rows = None  # row count of the first file that was found

    for folder_path in feature_folders:
        file_path = os.path.join(folder_path, file_name)

        if not os.path.exists(file_path):
            # Best effort: a missing folder/file is reported and skipped.
            print(f"File not found: {file_path}")
            continue

        df = pd.read_csv(file_path)

        # pd.concat(axis=1) aligns on the index and pads with NaN, so files of
        # different length would silently produce a corrupted feature matrix.
        if expected_rows is None:
            expected_rows = len(df)
        elif len(df) != expected_rows:
            raise ValueError(
                f"Row count mismatch in {file_path}: "
                f"expected {expected_rows} rows, found {len(df)}"
            )

        if 'label' in df.columns:
            # Keep the first label column we meet (not only the one from the
            # first file) and drop every copy from the feature tables.
            if label is None:
                label = df['label']
            df = df.drop(columns=['label'])

        feature_frames.append(df)

    if not feature_frames:
        print(f"No feature files found for {file_name}; nothing was saved.")
        return

    # Concatenate once; appending the label last keeps it as the final column.
    parts = feature_frames if label is None else feature_frames + [label]
    combined_df = pd.concat(parts, axis=1)

    # Save the concatenated features to the output folder
    output_path = os.path.join(output_folder, file_name)
    combined_df.to_csv(output_path, index=False)
    print(f"Concatenated {file_name} saved to: {output_path}")

# Build one fused CSV per dataset split (train, val, test)
for file_name in file_names:
    concatenate_features(
        file_name=file_name,
        feature_folders=feature_folders,
        output_folder=output_folder,
    )

print("Feature concatenation complete.")


Classification using decision tree

In [None]:


import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Folder holding the combined train/val/test feature files
combined_features_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features"

# Load train, val, and test datasets
try:
    train_df = pd.read_csv(os.path.join(combined_features_path, "train.csv"))
    val_df = pd.read_csv(os.path.join(combined_features_path, "val.csv"))
    test_df = pd.read_csv(os.path.join(combined_features_path, "test.csv"))
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    raise

# Verify the label column in every split *before* merging train and val:
# pd.concat fills a column that is absent from one frame with NaN, so checking
# only the merged frame would hide a missing 'label' in train or in val.
splits_missing_label = [
    split_name
    for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df))
    if 'label' not in split_df.columns
]
if splits_missing_label:
    raise ValueError(
        "The 'label' column is missing in one of the datasets. "
        f"Missing in: {', '.join(splits_missing_label)}"
    )

# Combine the train and val datasets (the grid search below cross-validates on both)
combined_train_val_df = pd.concat([train_df, val_df], ignore_index=True)

# Split the features (X) and labels (y) for the training-validation set
X_train_val = combined_train_val_df.drop(columns=['label'])  # Features
y_train_val = combined_train_val_df['label']  # Labels

# Split the features (X) and labels (y) for the test set
X_test = test_df.drop(columns=['label'])  # Features
y_test = test_df['label']  # Labels

# Check for any missing values in the datasets (warning only; nothing is imputed here)
if X_train_val.isnull().values.any() or X_test.isnull().values.any():
    print("Warning: Missing values found in features. Please handle missing data before training.")

# ---- Grid Search for Hyperparameter Optimization ----
# Candidate values tried for each decision-tree setting
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Base estimator with a fixed random_state
dt_classifier = DecisionTreeClassifier(random_state=1)

# Search every combination, scored by accuracy with cv=5, using all cores
grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_val, y_train_val)

# Report the winning combination
best_params = grid_search.best_params_
print("Optimized Hyperparameters:", best_params, sep="\n")

# Estimator built with the winning combination, used for all evaluation below
best_dt_classifier = grid_search.best_estimator_

def _evaluate_split(split_name, classifier, features, true_labels, cmap):
    """Predict on one data split and report its confusion matrix and scores.

    Prints the confusion matrix, shows it as a heatmap, then prints the
    classification report and the accuracy.

    Args:
        split_name: Word used in the printed headings and plot title
            (e.g. "Training" or "Testing").
        classifier: Fitted estimator exposing ``predict``.
        features: Feature table passed to ``classifier.predict``.
        true_labels: Ground-truth labels for ``features``.
        cmap: Colormap name passed to ``sns.heatmap``.

    Returns:
        Tuple ``(predictions, confusion_matrix, accuracy)``.
    """
    predictions = classifier.predict(features)

    # Fix the class order explicitly so the matrix rows/columns and the tick
    # labels always describe the same classes, even when the labels are not
    # 0..n-1 or a predicted class never occurs in the true labels.
    class_labels = sorted(set(true_labels) | set(predictions))
    cm = confusion_matrix(true_labels, predictions, labels=class_labels)
    print(f"{split_name} Confusion Matrix:")
    print(cm)

    # Plot the confusion matrix
    tick_labels = [f'Class {label}' for label in class_labels]
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{split_name} Confusion Matrix')
    plt.show()

    # Classification report
    print(f"\n{split_name} Classification Report:")
    print(classification_report(true_labels, predictions, digits=4))

    # Accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print(f"{split_name} Accuracy: {accuracy:.4f}")

    return predictions, cm, accuracy


# ---- Training Evaluation ----
# Scores on the same train+val data the model was fitted on
y_train_pred, train_cm, train_accuracy = _evaluate_split(
    "Training", best_dt_classifier, X_train_val, y_train_val, 'Greens'
)

# ---- Testing Evaluation ----
# Scores on the held-out test data
y_test_pred, test_cm, test_accuracy = _evaluate_split(
    "Testing", best_dt_classifier, X_test, y_test, 'Blues'
)


