In [None]:
# ==============================================================================
# A Unique Framework for Autism Spectrum Disorder Detection
# Based on Improved BiSectional kMedoids Clustering and Ensemble Learning
#
# This notebook reproduces the experiments described in the research paper.
# ==============================================================================

# Section 4: Experimental Implementation - Library Imports
# ==============================================================================
import pandas as pd
import numpy as np
import os
from scipy.io import arff
from warnings import filterwarnings

# Pre-processing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Clustering Algorithms from scikit-learn and scikit-learn-extra
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import BisectingKMeans # CORRECTED IMPORT

# Classification Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import tensorflow as tf

# Ensemble Learning
from sklearn.ensemble import VotingClassifier

# Evaluation Metrics
from sklearn.metrics import accuracy_score, precision_score, mean_squared_error

# Silence every library warning so the cell outputs below stay readable
filterwarnings('ignore')

# Confirm the environment is ready and report the TensorFlow build in use
print("All libraries imported successfully.")
print(f"TensorFlow Version: {tf.__version__}")

In [None]:
# ==============================================================================
# Section 3 & 4: Data Loading and Pre-processing
# ==============================================================================
def load_and_preprocess_data(arff_file_path, target_col='Class/ASD'):
    """
    Loads data from an ARFF file and applies the pre-processing steps
    described in the paper: missing value imputation, label encoding, and scaling.

    Parameters
    ----------
    arff_file_path : str or path-like
        Location of the ARFF file to read.
    target_col : str, default 'Class/ASD'
        Name of the column holding the class label. It is imputed and
        label-encoded like every other column but is NOT scaled.

    Returns
    -------
    (X_scaled, y)
        X_scaled is a DataFrame of standardised features (all columns except
        `target_col`); y is the encoded target Series.

    Raises
    ------
    KeyError
        If `target_col` is not a column of the loaded file.

    Notes
    -----
    Imputation and scaling statistics are computed on every row of the file.
    If the result is split into train/test afterwards, the test rows have
    already influenced those statistics.
    """
    data, _meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)

    # Fail early with a clear message instead of after all the processing work
    if target_col not in df.columns:
        raise KeyError(
            f"Target column '{target_col}' not found in '{arff_file_path}'. "
            f"Available columns: {list(df.columns)}"
        )

    # Object columns are decoded from bytes to text before any comparison
    for col in df.select_dtypes(['object']).columns:
        df[col] = df[col].str.decode('utf-8')

    # '?' marks a missing value in this dataset
    df = df.replace('?', np.nan)

    # Impute column by column: mean for numeric data, mode for everything else.
    # ravel() turns the imputer's (n, 1) output into the 1-D shape a column expects.
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            imputer = SimpleImputer(strategy='mean')
        else:
            imputer = SimpleImputer(strategy='most_frequent')
        df[col] = imputer.fit_transform(df[[col]]).ravel()

    # Encode every remaining non-numeric column as integer codes. Checking for
    # "not numeric" (rather than dtype == object) also covers string dtypes.
    categorical_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    X = df.drop(columns=[target_col])
    y = df[target_col]

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return X_scaled, y

print("Data pre-processing function defined.")

In [None]:
# Dynamically find the ARFF file in the current directory.
# os.listdir returns entries in arbitrary order, so sort them to always pick the
# same file when several .arff files are present.
arff_candidates = sorted(f for f in os.listdir('.') if f.endswith('.arff'))

# Check for the "no dataset" case explicitly instead of catching IndexError around
# the whole cell: an IndexError raised while pre-processing or splitting is a real
# bug and must not be reported as a missing file.
if not arff_candidates:
    print("ERROR: Could not find the .arff dataset file in this directory.")
    print("Please make sure the dataset file (e.g., 'Autism-Child-Data.arff') is in the same folder as this notebook.")
else:
    arff_file = arff_candidates[0]
    print(f"Loading and processing dataset: {arff_file}")
    X, y = load_and_preprocess_data(arff_file)

    # Split data for training and testing (70/30, stratified on the class label)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    print(f"\nData shapes:\nX_train: {X_train.shape}\ny_train: {y_train.shape}\nX_test: {X_test.shape}\ny_test: {y_test.shape}")
    print("\nSample of pre-processed data:")
    display(X.head())

In [None]:
# ==============================================================================
# Section 4: Xie-Beni Index Implementation
# ==============================================================================
def xie_beni_index(X, labels, centers):
    """
    Calculates the Xie-Beni index for a given clustering result.

    The index is the mean squared distance of every sample to its assigned
    center (compactness) divided by the smallest squared distance between any
    two centers (separation). Lower values indicate a better clustering.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The clustered samples.
    labels : array-like of shape (n_samples,)
        Cluster index of each sample; label i refers to centers[i].
    centers : array-like of shape (n_clusters, n_features)
        Cluster centers.

    Returns
    -------
    float
        The Xie-Beni index, or np.inf when it is undefined (fewer than two
        clusters, no samples, or two coinciding centers).
    """
    # Coerce to arrays so lists work too: with a plain list, `labels == i`
    # would be the scalar False and every cluster mask would silently be empty.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    centers = np.asarray(centers, dtype=float)

    n_samples = X.shape[0]
    n_clusters = len(centers)

    # Separation needs at least two centers; compactness needs at least one sample
    if n_clusters < 2 or n_samples == 0:
        return np.inf

    compactness = np.sum([np.sum((X[labels == i] - centers[i])**2) for i in range(n_clusters)])
    compactness /= n_samples

    # Smallest squared distance over all unordered pairs of centers
    min_separation = min(
        np.linalg.norm(centers[i] - centers[j])**2
        for i in range(n_clusters)
        for j in range(i + 1, n_clusters)
    )

    # Coinciding centers would cause a division by zero
    if min_separation == 0:
        return np.inf

    return compactness / min_separation

def find_optimal_k(X, clusterer_class, max_k=10):
    """
    Scans k = 2..max_k and returns the k whose clustering has the lowest
    Xie-Beni index (k = 2 if no candidate scores below infinity).

    X must offer `.to_numpy()`; clusterer_class is instantiated once per k with
    random_state=42 and must expose `fit_predict` and `cluster_centers_`.
    """
    algo_name = clusterer_class.__name__
    # KMedoids is configured with method='pam'; KMeans / BisectingKMeans with n_init=10
    extra_kwargs = {'method': 'pam'} if 'KMedoids' in algo_name else {'n_init': 10}

    best_k, min_xb_index = 2, np.inf
    print(f"\nFinding optimal 'k' for {algo_name} using Xie-Beni Index...")
    for k in range(2, max_k + 1):
        clusterer = clusterer_class(n_clusters=k, random_state=42, **extra_kwargs)
        labels = clusterer.fit_predict(X)
        xb_index = xie_beni_index(X.to_numpy(), labels, clusterer.cluster_centers_)
        print(f"  k={k}, Xie-Beni Index = {xb_index:.4f}")

        # Only a strictly smaller index replaces the current best
        if not xb_index < min_xb_index:
            continue
        min_xb_index, best_k = xb_index, k

    print(f"Optimal 'k' found: {best_k} with Xie-Beni Index: {min_xb_index:.4f}")
    return best_k

print("Xie-Beni Index helper functions defined.")

In [None]:
# ==============================================================================
# Section 4: Defining Classification Models
# ==============================================================================
def create_ann_model(input_dim):
    """
    Builds and compiles the TensorFlow ANN: two ReLU hidden layers (128 and 64
    units) followed by a single sigmoid output unit for binary classification.

    input_dim is the number of input features; the compiled model is returned.
    """
    keras = tf.keras
    layer_stack = [
        keras.layers.Input(shape=(input_dim,)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ]
    network = keras.Sequential(layer_stack)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.02),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Classifier blueprints used by the experiment, in the order they appear in the
# result tables. 'ANN' maps to None because the ANN is handled separately in
# the pipeline.
models = dict([
    ('DT', DecisionTreeClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Naïve Bayes', GaussianNB()),
    ('SVM', SVC(kernel='rbf', gamma=0.15, probability=True, random_state=42)),
    ('ANN', None),
])

print("Classifier models defined.")

In [None]:
# ==============================================================================
# Main Experiment Pipeline
# ==============================================================================
from sklearn.base import clone # Import the clone function

def _dataset_label(dataset_name):
    """Derives the RMSE table row label from a dataset file name (e.g. 'Autism-Child-Data.arff' -> 'Child')."""
    if not dataset_name:
        return 'Dataset'
    stem = os.path.splitext(os.path.basename(str(dataset_name)))[0]
    parts = stem.split('-')
    # Fall back to the whole stem when the name has no second hyphen-separated part
    return parts[1].capitalize() if len(parts) > 1 and parts[1] else stem


def _fit_and_predict(name, model_blueprint, X_fit, y_fit, X_eval):
    """Trains one classifier on (X_fit, y_fit) and returns (fitted_model, 1-D predictions for X_eval)."""
    if name == 'ANN':
        # The TensorFlow ANN is built per call because its input width depends on the features
        fitted = create_ann_model(X_fit.shape[1])
        fitted.fit(X_fit, y_fit, epochs=80, batch_size=32, verbose=0)
        y_pred = (fitted.predict(X_eval) > 0.5).astype(int).ravel()
    else:
        # Use a fresh clone so the shared blueprint is never fitted itself
        fitted = clone(model_blueprint)
        fitted.fit(X_fit, y_fit)
        y_pred = fitted.predict(X_eval)
    return fitted, y_pred


def _add_cluster_feature(cluster_model, X_fit, X_eval):
    """Fits the clusterer on X_fit and returns copies of both frames with a 'cluster' label column added."""
    X_fit_c = X_fit.copy()
    X_fit_c['cluster'] = cluster_model.fit_predict(X_fit)
    X_eval_c = X_eval.copy()
    X_eval_c['cluster'] = cluster_model.predict(X_eval)
    return X_fit_c, X_eval_c


def run_experiments(X_train, y_train, X_test, y_test, dataset_name=None):
    """
    Executes the full experimental pipeline as described in the paper.
    This includes:
    1. Evaluating baseline classifiers on un-clustered data.
    2. Cascading clustering with classification to enhance performance.
    3. Applying the proposed ensemble learning approach for final prediction.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices; a 'cluster' column is added to copies of them.
    y_train, y_test : array-like
        Binary class labels encoded as 0/1.
    dataset_name : str, optional
        File name used to label the RMSE table row. When omitted, the
        notebook-level variable `arff_file` is used if it exists.

    Returns
    -------
    (results_accuracy, results_precision, rmse_results)
        Accuracy and precision tables indexed by classifier name, and a
        one-row RMSE comparison table.

    Notes
    -----
    The 'Bisectional kMedoids' stage is implemented with scikit-learn's
    BisectingKMeans; the key is kept for consistency with the paper's tables.
    The ensemble reuses the Stage 2 predictions of the models trained on the
    bisecting-clustered features instead of clustering and predicting again.
    """
    model_names = list(models)
    results_accuracy = pd.DataFrame(index=model_names)
    results_precision = pd.DataFrame(index=model_names)

    # --- Stage 1: Classification Without Clustering ---
    print("\n--- Running Stage 1: Baseline Classifier Performance ---")
    acc_scores, prec_scores = {}, {}
    baseline_preds = {}
    for name, model_blueprint in models.items():
        _, y_pred = _fit_and_predict(name, model_blueprint, X_train, y_train, X_test)
        baseline_preds[name] = y_pred
        acc_scores[name] = accuracy_score(y_test, y_pred) * 100
        prec_scores[name] = precision_score(y_test, y_pred, zero_division=0) * 100
    # Assign as Series so values are matched to rows by classifier name, not by position
    results_accuracy['Accuracy without Clustering (%)'] = pd.Series(acc_scores)
    results_precision['Precision without Clustering (%)'] = pd.Series(prec_scores)

    # --- Stage 2: Cascading Clustering with Classification ---
    # The dictionary key 'Bisectional kMedoids' is kept for table consistency with the paper's terminology
    clustering_methods = {'kMeans': KMeans, 'kMedoids': KMedoids, 'Bisectional kMedoids': BisectingKMeans}
    proposed_method = 'Bisectional kMedoids'
    proposed_preds = {}  # Test-set predictions of the models trained on the proposed clustering

    for cluster_name, cluster_class in clustering_methods.items():
        print(f"\n--- Running Stage 2: {cluster_name} + Classifiers ---")
        acc_scores, prec_scores = {}, {}
        k = find_optimal_k(X_train, cluster_class)
        if cluster_name == 'kMedoids':
            cluster_model = cluster_class(n_clusters=k, random_state=42, method='pam')
        else:
            cluster_model = cluster_class(n_clusters=k, random_state=42, n_init=10)

        # Add cluster labels as a new feature
        X_train_c, X_test_c = _add_cluster_feature(cluster_model, X_train, X_test)

        for name, model_blueprint in models.items():
            _, y_pred = _fit_and_predict(name, model_blueprint, X_train_c, y_train, X_test_c)
            if cluster_name == proposed_method:
                proposed_preds[name] = y_pred
            acc_scores[name] = accuracy_score(y_test, y_pred) * 100
            prec_scores[name] = precision_score(y_test, y_pred, zero_division=0) * 100

        if cluster_name == proposed_method:
            col_name = 'Accuracy with Xie Benie Bisectional kMedoids (%)'
            prec_col_name = 'Precision with Xie Benie Bisectional kMedoids (%)'
        else:
            col_name = f'Accuracy with {cluster_name} Clustering (%)'
            prec_col_name = f'Precision with {cluster_name} Clustering (%)'
        results_accuracy[col_name] = pd.Series(acc_scores)
        results_precision[prec_col_name] = pd.Series(prec_scores)

    # --- Stage 3: Proposed Ensemble Model and Final RMSE Calculation ---
    print("\n--- Running Stage 3: Proposed Ensemble Model and RMSE Calculation ---")

    # Baseline RMSE uses the un-clustered SVM predictions from Stage 1
    rmse_without_clustering = np.sqrt(mean_squared_error(y_test, baseline_preds['SVM']))

    # Manual Max Voting / Hard Voting over every classifier: one column per model,
    # one row per test sample. Cast to int because np.bincount rejects floats.
    stacked_preds = np.column_stack(
        [np.asarray(proposed_preds[name]).astype(int).ravel() for name in model_names]
    )
    # bincount(...).argmax() picks the most frequent label (lowest label on a tie)
    y_pred_ensemble = np.asarray([np.bincount(row).argmax() for row in stacked_preds])

    # Calculate final metrics for the proposed model
    ensemble_acc = accuracy_score(y_test, y_pred_ensemble) * 100
    ensemble_prec = precision_score(y_test, y_pred_ensemble, zero_division=0) * 100
    rmse_proposed_model = np.sqrt(mean_squared_error(y_test, y_pred_ensemble))

    # Populate the results tables; the single ensemble score is placed in the last row
    acc_col_ensemble = 'Accuracy with Ensemble Learning(Proposed Model)'
    prec_col_ensemble = 'Precision with Ensemble Learning(Proposed Model)'
    results_accuracy[acc_col_ensemble] = [np.nan] * (len(models)-1) + [ensemble_acc]
    results_precision[prec_col_ensemble] = [np.nan] * (len(models)-1) + [ensemble_prec]

    # Prefer the explicit argument; otherwise fall back to the notebook variable if defined
    if dataset_name is None:
        dataset_name = globals().get('arff_file')

    rmse_results = pd.DataFrame({
        'RMSE without Clustering': [rmse_without_clustering],
        'RMSE with Ensemble Learning(Proposed Model)': [rmse_proposed_model]
    }, index=[_dataset_label(dataset_name)])

    return results_accuracy, results_precision, rmse_results

print("Main experiment pipeline function defined.")

In [None]:
# ==============================================================================
# Section 5: Results & Analysis
# ==============================================================================
# Check explicitly that the data-loading cell has run. A blanket `except NameError`
# would also hide genuine NameErrors raised inside the pipeline behind a misleading
# "data was not loaded" message.
required_names = ('X_train', 'y_train', 'X_test', 'y_test', 'arff_file')
missing_names = [name for name in required_names if name not in globals()]

if missing_names:
    print("ERROR: Could not run experiments because the data was not loaded.")
    print("Please make sure the previous cells, especially the data loading cell, have run successfully.")
else:
    final_accuracy, final_precision, final_rmse = run_experiments(X_train, y_train, X_test, y_test)

    # Reorder columns to match paper's table structure
    acc_col_order = [
        'Accuracy without Clustering (%)', 'Accuracy with kMeans Clustering (%)',
        'Accuracy with kMedoids Clustering (%)', 'Accuracy with Xie Benie Bisectional kMedoids (%)',
        'Accuracy with Ensemble Learning(Proposed Model)'
    ]
    prec_col_order = [
        'Precision without Clustering (%)', 'Precision with kMeans Clustering (%)',
        'Precision with kMedoids Clustering (%)', 'Precision with Xie Benie Bisectional kMedoids (%)',
        'Precision with Ensemble Learning(Proposed Model)'
    ]
    final_accuracy = final_accuracy.reindex(columns=acc_col_order)
    final_precision = final_precision.reindex(columns=prec_col_order)

    print("\n\n" + "="*60)
    print(f"      FINAL RESULTS FOR: {arff_file.split('.')[0].upper()} ")
    print("="*60)

    print("\n--- Accuracy Performance (Similar to Tables 2, 4, 6) ---")
    display(final_accuracy.style.format("{:.1f}").background_gradient(cmap='Greens', axis=1))

    print("\n--- Precision Performance (Similar to Tables 3, 5, 7) ---")
    display(final_precision.style.format("{:.1f}").background_gradient(cmap='Blues', axis=1))

    print("\n--- RMSE Value Comparison (Similar to Table 8) ---")
    display(final_rmse.style.format("{:.3f}").background_gradient(cmap='Reds', axis=1, subset=['RMSE with Ensemble Learning(Proposed Model)']))