# In [None]:
#  Step 0: Imports
import os
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

#  Step 1: Load Feature Files
def load_feature_files(base_path, feature_sets=None):
    """Load train, validation, and test CSV files from each feature folder.

    Args:
        base_path: Directory that contains one sub-folder per feature set.
        feature_sets: Optional iterable of sub-folder names to load. When
            omitted, the four default sets ("Frequency", "Gabor", "HOG",
            "Statistical") are loaded.

    Returns:
        A dict mapping each feature-set name to a dict with the keys
        "train", "val" and "test", each holding the DataFrame read from
        ``<base_path>/<feature_set>/<split>.csv``.

    Errors raised by ``pd.read_csv`` (for example when a file is missing)
    propagate to the caller unchanged.
    """
    if feature_sets is None:
        feature_sets = ("Frequency", "Gabor", "HOG", "Statistical")

    splits = ("train", "val", "test")

    # One CSV per (feature set, split); files are read in the listed order.
    return {
        feature: {
            split: pd.read_csv(os.path.join(base_path, feature, f"{split}.csv"))
            for split in splits
        }
        for feature in feature_sets
    }

# Root folder that holds one sub-folder per feature set.
# NOTE: this is a machine-specific absolute Windows path; change it before
# running the script on another machine.
base_path = r"E:\Abroad period research\Feature Fusion paper\Ultrasound Breast Cancer\Features"
# Read every CSV under base_path as soon as this statement runs.
data = load_feature_files(base_path)

#  Step 2: Split Features and Labels
def split_features_and_labels(data):
    """Separate the train and validation tables into inputs and labels.

    For every feature set in ``data`` the last column of the "train" and
    "val" DataFrames is treated as the label and all preceding columns as
    the features. Any other split present in ``data`` is ignored.

    Args:
        data: Dict mapping feature-set name to a dict that contains at
            least the keys "train" and "val" (DataFrames).

    Returns:
        A 4-tuple ``(X_train, y_train, X_val, y_val)`` of dicts, each keyed
        by feature-set name.
    """

    def _inputs(frame):
        # Everything except the trailing label column.
        return frame.iloc[:, :-1]

    def _targets(frame):
        # The trailing column holds the label.
        return frame.iloc[:, -1]

    train_frames = {name: splits["train"] for name, splits in data.items()}
    val_frames = {name: splits["val"] for name, splits in data.items()}

    train_inputs = {name: _inputs(frame) for name, frame in train_frames.items()}
    train_targets = {name: _targets(frame) for name, frame in train_frames.items()}
    val_inputs = {name: _inputs(frame) for name, frame in val_frames.items()}
    val_targets = {name: _targets(frame) for name, frame in val_frames.items()}

    return train_inputs, train_targets, val_inputs, val_targets

# Build per-feature-set train/validation inputs and labels from the loaded
# tables. The "test" split stored in `data` is not used by this step.
X_train, y_train, X_val, y_val = split_features_and_labels(data)

#  Step 3: Class-Specific Feature Selection and Evaluation
def perform_class_specific_feature_selection(
    X_train, y_train, X_val, y_val, num_features=5, random_state=42
):
    """
    Perform one-vs-all feature selection and train a Decision Tree for each class.

    All feature sets are concatenated column-wise, missing values are filled
    with the training column means, and then, for every class found in the
    training labels, the top ``num_features`` columns by mutual information
    with the binary "is this class" target are selected. A depth-limited
    Decision Tree is trained on those columns and evaluated on the
    validation data. Progress and results are printed.

    Args:
        X_train: Dict mapping feature-set name to the training feature DataFrame.
        y_train: Dict mapping feature-set name to the training label Series.
        X_val: Dict mapping feature-set name to the validation feature DataFrame.
        y_val: Dict mapping feature-set name to the validation label Series.
        num_features: Number of features to keep per class (or "all"). Values
            larger than the number of available columns are clamped.
        random_state: Seed used for both the mutual-information estimate and
            the Decision Tree so repeated runs give the same result.

    Returns:
        A 3-tuple of dicts, each keyed by class label:
        ``class_specific_features`` (pandas Index of selected column names),
        ``class_classifiers`` (fitted DecisionTreeClassifier) and
        ``evaluation_results`` (dict with "accuracy" and
        "classification_report").
    """
    class_specific_features = {}
    class_classifiers = {}
    evaluation_results = {}

    # Use labels from the first feature set (assuming they're the same for all).
    train_labels = y_train[next(iter(X_train))]
    val_labels = y_val[next(iter(X_val))]
    classes = np.unique(train_labels)

    # Merging and imputing do not depend on the class, so do them once
    # instead of repeating the work on every loop iteration.
    # NOTE(review): if different feature sets reuse the same column names,
    # the reported feature names will be ambiguous; confirm CSV headers.
    X_train_combined = pd.concat([X_train[feature] for feature in X_train], axis=1)
    X_val_combined = pd.concat([X_val[feature] for feature in X_val], axis=1)

    # Handle missing values (NaNs) using SimpleImputer
    imputer = SimpleImputer(strategy="mean")
    X_train_imputed = imputer.fit_transform(X_train_combined)
    X_val_imputed = imputer.transform(X_val_combined)

    # SimpleImputer discards columns that are entirely NaN in the training
    # data (their fitted statistic is NaN), which shifts column positions in
    # the imputed array. Track the surviving names so that selector indices
    # map back to the correct column labels.
    kept_columns = X_train_combined.columns[~np.isnan(imputer.statistics_)]

    # Never ask for more features than exist after imputation.
    n_available = X_train_imputed.shape[1]
    if num_features == "all":
        k = num_features
    else:
        k = min(num_features, n_available)

    def seeded_mutual_info(X, y):
        # mutual_info_classif adds random noise internally; seed it so the
        # selected features are reproducible.
        return mutual_info_classif(X, y, random_state=random_state)

    for class_label in classes:
        print(f"\n Performing feature selection and training for class {class_label}...")

        # One-vs-all labels
        y_binary_train = (train_labels == class_label).astype(int)
        y_binary_val = (val_labels == class_label).astype(int)

        # Select top-k features
        selector = SelectKBest(score_func=seeded_mutual_info, k=k)
        selector.fit(X_train_imputed, y_binary_train)

        selected_indices = selector.get_support(indices=True)
        selected_features = kept_columns[selected_indices]

        class_specific_features[class_label] = selected_features
        print(f" Selected features for class {class_label}: {list(selected_features)}")

        # Train model
        clf = DecisionTreeClassifier(max_depth=10, random_state=random_state)
        clf.fit(X_train_imputed[:, selected_indices], y_binary_train)

        # Evaluate model
        y_pred_val = clf.predict(X_val_imputed[:, selected_indices])
        accuracy = accuracy_score(y_binary_val, y_pred_val)
        report = classification_report(y_binary_val, y_pred_val, digits=4)

        class_classifiers[class_label] = clf
        evaluation_results[class_label] = {
            "accuracy": accuracy,
            "classification_report": report
        }

        print(f" Accuracy for class {class_label}: {accuracy:.4f}")
        print(f" Classification Report:\n{report}")

    return class_specific_features, class_classifiers, evaluation_results

#  Step 4: Run Feature Selection + Training + Evaluation
# Run the one-vs-all pipeline, keeping 5 features per class. The three results
# are dicts keyed by class label: selected feature names, fitted classifiers,
# and validation metrics.
class_specific_features, class_classifiers, evaluation_results = perform_class_specific_feature_selection(
    X_train, y_train, X_val, y_val, num_features=5
)
