In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                               StackingClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import lime
import lime.lime_tabular
import warnings
warnings.filterwarnings('ignore')

# ==================== DATA LOADING AND PREPROCESSING ====================
def load_and_preprocess_data(filepath, target_column='Call_Type'):
    """
    Load dataset and preprocess for classification
    
    Parameters:
    filepath: path to CSV file
    target_column: target variable column name
    
    Returns:
    X: feature matrix
    y: target vector
    feature_names: list of feature names
    class_names: list of class labels
    """
    df = pd.read_csv(filepath)
    
    # Select spectral features (V1-V74) and formant features (F1-F4, M1-M4)
    spectral_cols = [f'V{i}' for i in range(1, 75)]
    formant_cols = ['F1', 'F2', 'F3', 'F4', 'M1', 'M2', 'M3', 'M4']
    additional_cols = ['sprsMed', 'sprsMbw', 'sprsEqbw', 'sprsMc']
    
    feature_cols = spectral_cols + formant_cols + additional_cols
    
    # Remove rows with missing target
    df = df.dropna(subset=[target_column])
    
    # Select features that exist and have no missing values
    available_features = [col for col in feature_cols if col in df.columns]
    X = df[available_features].fillna(df[available_features].median())
    
    # Encode target variable
    le = LabelEncoder()
    y = le.fit_transform(df[target_column])
    
    return X, y, available_features, le.classes_

# ==================== A1: STACKING CLASSIFIER ====================
def create_stacking_classifier(base_models_dict, meta_model):
    """
    Create a stacking classifier with specified base models and meta model
    
    Parameters:
    base_models_dict: dictionary of (name, model) pairs for base estimators
    meta_model: final estimator (meta-model)
    
    Returns:
    stacking_clf: configured StackingClassifier
    """
    base_estimators = list(base_models_dict.items())
    
    stacking_clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=meta_model,
        cv=5,
        n_jobs=-1
    )
    
    return stacking_clf

def experiment_with_metamodels(X_train, y_train, X_test, y_test, base_models):
    """
    Experiment with different meta-models for stacking
    
    Parameters:
    X_train, y_train: training data
    X_test, y_test: test data
    base_models: dictionary of base models
    
    Returns:
    results_dict: dictionary containing results for each meta-model
    """
    meta_models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=50, random_state=42),
        'SVM': SVC(kernel='rbf', probability=True, random_state=42)
    }
    
    results_dict = {}
    
    for meta_name, meta_model in meta_models.items():
        stacking_clf = create_stacking_classifier(base_models, meta_model)
        stacking_clf.fit(X_train, y_train)
        
        y_pred = stacking_clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Cross-validation score
        cv_scores = cross_val_score(stacking_clf, X_train, y_train, cv=5, n_jobs=-1)
        
        results_dict[meta_name] = {
            'model': stacking_clf,
            'accuracy': accuracy,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': y_pred
        }
    
    return results_dict

# ==================== A2: PIPELINE IMPLEMENTATION ====================
def create_classification_pipeline(classifier):
    """
    Create a pipeline with preprocessing and classification steps
    
    Parameters:
    classifier: classification model
    
    Returns:
    pipeline: sklearn Pipeline object
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', classifier)
    ])
    
    return pipeline

def execute_pipeline_experiments(X_train, y_train, X_test, y_test, base_models):
    """
    Execute pipeline with multiple classifiers
    
    Parameters:
    X_train, y_train: training data
    X_test, y_test: test data
    base_models: dictionary of base models
    
    Returns:
    pipeline_results: dictionary containing results for each pipeline
    """
    pipeline_results = {}
    
    for model_name, model in base_models.items():
        pipeline = create_classification_pipeline(model)
        pipeline.fit(X_train, y_train)
        
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        pipeline_results[model_name] = {
            'pipeline': pipeline,
            'accuracy': accuracy,
            'predictions': y_pred
        }
    
    return pipeline_results

def create_stacking_pipeline(base_models, meta_model):
    """
    Create a pipeline with stacking classifier
    
    Parameters:
    base_models: dictionary of base models
    meta_model: meta-model for stacking
    
    Returns:
    pipeline: Pipeline with StandardScaler and StackingClassifier
    """
    stacking_clf = create_stacking_classifier(base_models, meta_model)
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('stacking', stacking_clf)
    ])
    
    return pipeline

# ==================== A3: LIME EXPLAINER ====================
def explain_with_lime(pipeline, X_train, X_test, feature_names, class_names, num_samples=5):
    """
    Use LIME to explain pipeline predictions
    
    Parameters:
    pipeline: trained pipeline
    X_train: training data for LIME
    X_test: test data to explain
    feature_names: list of feature names
    class_names: list of class names
    num_samples: number of test samples to explain
    
    Returns:
    explanations: list of LIME explanation objects
    """
    # Create LIME explainer
    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data=X_train.values,
        feature_names=feature_names,
        class_names=[str(c) for c in class_names],
        mode='classification',
        random_state=42
    )
    
    explanations = []
    
    for i in range(min(num_samples, len(X_test))):
        exp = explainer.explain_instance(
            data_row=X_test.iloc[i].values,
            predict_fn=pipeline.predict_proba,
            num_features=10
        )
        explanations.append(exp)
    
    return explanations, explainer

def get_lime_explanation_summary(explanations, X_test, y_test, pipeline):
    """
    Get summary of LIME explanations
    
    Parameters:
    explanations: list of LIME explanation objects
    X_test: test data
    y_test: true labels
    pipeline: trained pipeline
    
    Returns:
    summary_list: list of dictionaries containing explanation summaries
    """
    summary_list = []
    
    for idx, exp in enumerate(explanations):
        prediction = pipeline.predict(X_test.iloc[idx:idx+1])[0]
        true_label = y_test.iloc[idx] if isinstance(y_test, pd.Series) else y_test[idx]
        
        # Get top features
        exp_list = exp.as_list()
        
        summary = {
            'sample_index': idx,
            'true_label': true_label,
            'predicted_label': prediction,
            'top_features': exp_list[:5]  # Top 5 features
        }
        
        summary_list.append(summary)
    
    return summary_list


# ==================== MAIN PROGRAM ====================
if __name__ == "__main__":
    
    # Load and preprocess data
    print("=" * 70)
    print("LOADING AND PREPROCESSING DATA")
    print("=" * 70)
    
    filepath = '20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv'
    X, y, feature_names, class_names = load_and_preprocess_data(filepath, target_column='Call_Type')
    
    print(f"Dataset shape: {X.shape}")
    print(f"Number of classes: {len(class_names)}")
    print(f"Class distribution: {np.bincount(y)}")
    print(f"Classes: {class_names}")
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    print(f"\nTraining set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")
    
    # Define base models
    base_models = {
        'rf': RandomForestClassifier(n_estimators=100, random_state=42),
        'gb': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'dt': DecisionTreeClassifier(max_depth=10, random_state=42),
        'knn': KNeighborsClassifier(n_neighbors=5)
    }
    
    # ==================== A1: STACKING CLASSIFIER ====================
    print("\n" + "=" * 70)
    print("A1: STACKING CLASSIFIER WITH DIFFERENT META-MODELS")
    print("=" * 70)
    
    # Scale data for stacking
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    stacking_results = experiment_with_metamodels(
        X_train_scaled, y_train, X_test_scaled, y_test, base_models
    )
    
    print("\nStacking Classifier Results:")
    print("-" * 70)
    for meta_name, results in stacking_results.items():
        print(f"\nMeta-Model: {meta_name}")
        print(f"  Test Accuracy: {results['accuracy']:.4f}")
        print(f"  CV Mean Accuracy: {results['cv_mean']:.4f} (+/- {results['cv_std']:.4f})")
    
    # Find best meta-model
    best_meta = max(stacking_results.items(), key=lambda x: x[1]['accuracy'])
    print(f"\nBest Meta-Model: {best_meta[0]} with accuracy {best_meta[1]['accuracy']:.4f}")
    
    # ==================== A2: PIPELINE IMPLEMENTATION ====================
    print("\n" + "=" * 70)
    print("A2: PIPELINE WITH PREPROCESSING AND CLASSIFICATION")
    print("=" * 70)
    
    # Execute individual model pipelines
    pipeline_results = execute_pipeline_experiments(X_train, y_train, X_test, y_test, base_models)
    
    print("\nIndividual Model Pipeline Results:")
    print("-" * 70)
    for model_name, results in pipeline_results.items():
        print(f"{model_name}: Accuracy = {results['accuracy']:.4f}")
    
    # Create stacking pipeline
    print("\nCreating Stacking Pipeline...")
    best_meta_model = LogisticRegression(max_iter=1000, random_state=42)
    stacking_pipeline = create_stacking_pipeline(base_models, best_meta_model)
    stacking_pipeline.fit(X_train, y_train)
    
    y_pred_pipeline = stacking_pipeline.predict(X_test)
    pipeline_accuracy = accuracy_score(y_test, y_pred_pipeline)
    
    print(f"\nStacking Pipeline Accuracy: {pipeline_accuracy:.4f}")
    
    print("\nClassification Report for Stacking Pipeline:")
    print("-" * 70)
    print(classification_report(y_test, y_pred_pipeline, target_names=class_names))
    
    # ==================== A3: LIME EXPLAINER ====================
    print("\n" + "=" * 70)
    print("A3: LIME EXPLAINER FOR PIPELINE OUTCOMES")
    print("=" * 70)
    
    print("\nGenerating LIME explanations for test samples...")
    
    # Convert to DataFrame for LIME
    X_test_df = pd.DataFrame(X_test, columns=feature_names)
    X_train_df = pd.DataFrame(X_train, columns=feature_names)
    
    explanations, explainer = explain_with_lime(
        stacking_pipeline, X_train_df, X_test_df, feature_names, class_names, num_samples=5
    )
    
    lime_summary = get_lime_explanation_summary(
        explanations, X_test_df, y_test, stacking_pipeline
    )
    
    print("\nLIME Explanation Summary (First 5 Test Samples):")
    print("-" * 70)
    for summary in lime_summary:
        print(f"\nSample {summary['sample_index']}:")
        print(f"  True Label: {class_names[summary['true_label']]}")
        print(f"  Predicted Label: {class_names[summary['predicted_label']]}")
        print(f"  Top Contributing Features:")
        for feat, weight in summary['top_features']:
            print(f"    {feat}: {weight:.4f}")
    
    print("\n" + "=" * 70)
    print("ANALYSIS COMPLETE")
    print("=" * 70)
    
    # Summary statistics
    print("\nSummary Statistics:")
    print(f"  Total samples: {len(X)}")
    print(f"  Number of features: {len(feature_names)}")
    print(f"  Best individual pipeline: {max(pipeline_results.items(), key=lambda x: x[1]['accuracy'])[0]}")
    print(f"  Stacking pipeline improvement: {pipeline_accuracy - max([r['accuracy'] for r in pipeline_results.values()]):.4f}")

LOADING AND PREPROCESSING DATA
Dataset shape: (469, 86)
Number of classes: 1
Class distribution: [469]
Classes: ['RUM']

Training set size: 375
Test set size: 94

A1: STACKING CLASSIFIER WITH DIFFERENT META-MODELS


Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /dev/shm/joblib_memmapping_folder_19445_e47b4dd5e7444a078184bdd98933c4a0_1ad2fc69bfd041c3ba6c972a5d970df1 for automatic cleanup: unknown resource type folder[0m
Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /loky-19445-gfqj5ywe for automatic cleanup: unknown resource type semlock[0m
Traceback (most recent call last):
  F

ValueError: y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.