In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lime
import lime.lime_tabular
import warnings
warnings.filterwarnings('ignore')

# Fix for Python 3.13 multiprocessing issues
os.environ['LOKY_MAX_CPU_COUNT'] = '1'

# ==================== A1: STACKING CLASSIFIER ====================
def create_stacking_classifier(best_rf):
    """
    Create a stacking classifier with multiple base models and meta-model.
    
    Parameters:
    best_rf: Pre-tuned Random Forest classifier
    
    Returns:
    stacking_clf: Configured StackingClassifier
    """
    # Define base estimators
    base_estimators = [
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('rf', best_rf),
        ('svm', SVC(probability=True, random_state=42)),
        ('nb', GaussianNB()),
        ('mlp', MLPClassifier(random_state=42, max_iter=500)),
        ('xgb', XGBClassifier(eval_metric='mlogloss', random_state=42)),
        ('catboost', CatBoostClassifier(verbose=0, random_state=42))
    ]
    
    # Create stacking classifier with Logistic Regression as meta-model
    # Use StratifiedKFold to ensure all classes are in each fold
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    stacking_clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=LogisticRegression(max_iter=1000, random_state=42),
        cv=cv,
        stack_method='auto',
        n_jobs=1  # Avoid multiprocessing issues
    )
    
    return stacking_clf


def experiment_metamodels(X_train, y_train, X_test, y_test, best_rf):
    """
    Experiment with different meta-models for stacking.
    
    Parameters:
    X_train, y_train: Training data
    X_test, y_test: Test data
    best_rf: Pre-tuned Random Forest classifier
    
    Returns:
    results_dict: Dictionary containing performance of different meta-models
    best_stacking: Best performing stacking classifier
    """
    # Define base estimators
    base_estimators = [
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('rf', best_rf),
        ('svm', SVC(probability=True, random_state=42)),
        ('nb', GaussianNB()),
        ('mlp', MLPClassifier(random_state=42, max_iter=500)),
        ('xgb', XGBClassifier(eval_metric='mlogloss', random_state=42, verbosity=0)),
        ('catboost', CatBoostClassifier(verbose=0, random_state=42))
    ]
    
    # Different meta-models to experiment
    meta_models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42, verbosity=0),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }
    
    results_dict = {}
    best_score = 0
    best_stacking = None
    
    # Determine optimal number of splits based on smallest class
    unique, counts = np.unique(y_train, return_counts=True)
    min_class_samples = counts.min()
    n_splits = min(3, min_class_samples)  # Use at most 3 splits, or fewer if needed
    
    if n_splits < 2:
        print(f"Warning: Smallest class has only {min_class_samples} samples.")
        print("Using simple train-test split instead of cross-validation.")
        use_cv = False
    else:
        use_cv = True
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    for name, meta_model in meta_models.items():
        try:
            if use_cv:
                stacking_clf = StackingClassifier(
                    estimators=base_estimators,
                    final_estimator=meta_model,
                    cv=cv,
                    stack_method='auto',
                    n_jobs=1
                )
            else:
                # Without CV, train base estimators on full training set
                stacking_clf = StackingClassifier(
                    estimators=base_estimators,
                    final_estimator=meta_model,
                    cv='prefit',  # Assumes base estimators are already fitted
                    stack_method='auto',
                    n_jobs=1
                )
            
            # Train stacking classifier
            stacking_clf.fit(X_train, y_train)
            
            # Predictions
            y_pred = stacking_clf.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            
            # Cross-validation score (only if CV is possible)
            if use_cv:
                cv_scores = cross_val_score(stacking_clf, X_train, y_train, cv=cv, n_jobs=1)
                cv_mean = cv_scores.mean()
                cv_std = cv_scores.std()
            else:
                cv_mean = accuracy
                cv_std = 0.0
            
            results_dict[name] = {
                'accuracy': accuracy,
                'cv_mean': cv_mean,
                'cv_std': cv_std,
                'model': stacking_clf
            }
            
            # Track best model
            if accuracy > best_score:
                best_score = accuracy
                best_stacking = stacking_clf
        
        except Exception as e:
            print(f"Error with {name}: {str(e)}")
            continue
    
    return results_dict, best_stacking


# ==================== A2: PIPELINE IMPLEMENTATION ====================
def create_ml_pipeline(best_rf, y_train):
    """
    Create a pipeline with preprocessing and stacking classifier.
    
    Parameters:
    best_rf: Pre-tuned Random Forest classifier
    y_train: Training labels for CV strategy
    
    Returns:
    pipeline: Complete ML pipeline
    """
    # Define base estimators
    base_estimators = [
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('rf', best_rf),
        ('svm', SVC(probability=True, random_state=42)),
        ('nb', GaussianNB()),
        ('mlp', MLPClassifier(random_state=42, max_iter=500)),
        ('xgb', XGBClassifier(eval_metric='mlogloss', random_state=42, verbosity=0)),
        ('catboost', CatBoostClassifier(verbose=0, random_state=42))
    ]
    
    # Determine optimal CV splits
    unique, counts = np.unique(y_train, return_counts=True)
    min_class_samples = counts.min()
    n_splits = min(3, min_class_samples)
    
    if n_splits >= 2:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    else:
        cv = 2  # Fallback to 2-fold if needed
    
    # Create stacking classifier
    stacking_clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=LogisticRegression(max_iter=1000, random_state=42),
        cv=cv,
        stack_method='auto',
        n_jobs=1  # Avoid multiprocessing issues
    )
    
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('stacking', stacking_clf)
    ])
    
    return pipeline


def evaluate_pipeline(pipeline, X_train, y_train, X_test, y_test):
    """
    Train and evaluate the pipeline.
    
    Parameters:
    pipeline: ML pipeline
    X_train, y_train: Training data
    X_test, y_test: Test data
    
    Returns:
    results: Dictionary containing evaluation metrics
    """
    # Fit pipeline
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    
    # Calculate metrics
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    
    # Determine CV strategy
    unique, counts = np.unique(y_train, return_counts=True)
    min_class_samples = counts.min()
    n_splits = min(3, min_class_samples)
    
    if n_splits >= 2:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        try:
            cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, n_jobs=1)
            cv_mean = cv_scores.mean()
            cv_std = cv_scores.std()
        except Exception as e:
            print(f"Warning: Cross-validation failed: {e}")
            cv_mean = test_accuracy
            cv_std = 0.0
    else:
        print("Warning: Not enough samples per class for cross-validation")
        cv_mean = test_accuracy
        cv_std = 0.0
    
    results = {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'classification_report': classification_report(y_test, y_pred_test),
        'confusion_matrix': confusion_matrix(y_test, y_pred_test)
    }
    
    return results


# ==================== A3: LIME EXPLAINER ====================
def explain_with_lime(pipeline, X_train, X_test, y_test, feature_names, class_names, num_samples=5):
    """
    Use LIME to explain pipeline predictions.
    
    Parameters:
    pipeline: Trained ML pipeline
    X_train: Training data for LIME
    X_test: Test data to explain
    y_test: True labels
    feature_names: List of feature names
    class_names: List of class names
    num_samples: Number of samples to explain
    
    Returns:
    explanations: List of LIME explanations
    """
    # Create LIME explainer
    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data=X_train,
        feature_names=feature_names,
        class_names=class_names,
        mode='classification',
        random_state=42
    )
    
    explanations = []
    
    # Get random samples to explain
    sample_indices = np.random.choice(len(X_test), size=min(num_samples, len(X_test)), replace=False)
    
    for idx in sample_indices:
        # Get instance
        instance = X_test[idx]
        
        # Generate explanation
        exp = explainer.explain_instance(
            data_row=instance,
            predict_fn=pipeline.predict_proba,
            num_features=10
        )
        
        # Get prediction
        prediction = pipeline.predict([instance])[0]
        true_label = y_test[idx] if hasattr(y_test, '__getitem__') else y_test.iloc[idx]
        
        explanation_data = {
            'index': idx,
            'true_label': true_label,
            'predicted_label': prediction,
            'explanation': exp,
            'top_features': exp.as_list()
        }
        
        explanations.append(explanation_data)
    
    return explanations


def get_feature_importance_from_lime(explanations):
    """
    Aggregate feature importance from multiple LIME explanations.
    
    Parameters:
    explanations: List of LIME explanations
    
    Returns:
    feature_importance: Dictionary of aggregated feature importance
    """
    feature_importance = {}
    
    for exp_data in explanations:
        for feature, weight in exp_data['top_features']:
            if feature not in feature_importance:
                feature_importance[feature] = []
            feature_importance[feature].append(abs(weight))
    
    # Calculate average importance
    avg_importance = {
        feature: np.mean(weights) 
        for feature, weights in feature_importance.items()
    }
    
    # Sort by importance
    sorted_importance = dict(sorted(avg_importance.items(), key=lambda x: x[1], reverse=True))
    
    return sorted_importance


# ==================== MAIN PROGRAM ====================
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv('20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv')
    
    # Prepare data - adjust these based on your target variable
    # Example: predicting Call_Type or Context
    target_column = 'Call_Type'  # Change this to your actual target
    
    # Select features (spectral features)
    feature_cols = [col for col in df.columns if col.startswith('V') or 
                    col.startswith('F') or col.startswith('M') or 
                    col in ['sprsMed', 'sprsMbw', 'sprsEqbw', 'sprsMc']]
    
    # Drop rows with missing values in target
    df_clean = df.dropna(subset=[target_column])
    
    # Filter out classes with very few samples (less than 20 for safer CV)
    class_counts = df_clean[target_column].value_counts()
    print(f"\nOriginal class distribution:\n{class_counts}")
    
    valid_classes = class_counts[class_counts >= 20].index
    df_clean = df_clean[df_clean[target_column].isin(valid_classes)]
    
    print(f"\nFiltered to classes with >= 20 samples")
    print(f"Number of samples: {len(df_clean)}")
    print(f"Number of classes: {df_clean[target_column].nunique()}")
    print(f"Class distribution:\n{df_clean[target_column].value_counts()}")
    
    X = df_clean[feature_cols].fillna(0)
    y = df_clean[target_column]
    
    # Encode target if it's string
    if y.dtype == 'object':
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)
        class_names = label_encoder.classes_.tolist()
    else:
        class_names = sorted(y.unique().tolist())
    
    # Get feature names
    feature_names = X.columns.tolist()
    
    # Split data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Convert to numpy arrays
    X_train_np = X_train.values
    X_test_np = X_test.values
    
    print("="*70)
    print("A1: STACKING CLASSIFIER WITH MULTIPLE META-MODELS")
    print("="*70)
    
    # Assume best_rf is already trained from previous work
    best_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Experiment with different meta-models
    meta_results, best_stacking = experiment_metamodels(
        X_train_np, y_train, X_test_np, y_test, best_rf
    )
    
    print("\nMeta-Model Comparison:")
    print("-" * 70)
    for name, metrics in meta_results.items():
        print(f"\n{name}:")
        print(f"  Test Accuracy: {metrics['accuracy']:.4f}")
        print(f"  CV Mean: {metrics['cv_mean']:.4f} (+/- {metrics['cv_std']:.4f})")
    
    print("\n" + "="*70)
    print("A2: PIPELINE IMPLEMENTATION")
    print("="*70)
    
    # Create and evaluate pipeline
    pipeline = create_ml_pipeline(best_rf, y_train)
    pipeline_results = evaluate_pipeline(pipeline, X_train_np, y_train, X_test_np, y_test)
    
    print(f"\nPipeline Performance:")
    print(f"  Training Accuracy: {pipeline_results['train_accuracy']:.4f}")
    print(f"  Test Accuracy: {pipeline_results['test_accuracy']:.4f}")
    print(f"  CV Mean: {pipeline_results['cv_mean']:.4f} (+/- {pipeline_results['cv_std']:.4f})")
    
    print("\nClassification Report:")
    print(pipeline_results['classification_report'])
    
    print("\nConfusion Matrix:")
    print(pipeline_results['confusion_matrix'])
    
    print("\n" + "="*70)
    print("A3: LIME EXPLANATIONS")
    print("="*70)
    
    # Generate LIME explanations
    explanations = explain_with_lime(
        pipeline, X_train_np, X_test_np, y_test, 
        feature_names, class_names, num_samples=5
    )
    
    print(f"\nGenerated {len(explanations)} LIME explanations")
    
    for i, exp_data in enumerate(explanations, 1):
        print(f"\n--- Sample {i} (Index: {exp_data['index']}) ---")
        print(f"True Label: {exp_data['true_label']}")
        print(f"Predicted Label: {exp_data['predicted_label']}")
        print(f"Top Contributing Features:")
        for feature, weight in exp_data['top_features'][:5]:
            print(f"  {feature}: {weight:.4f}")
    
    # Aggregate feature importance
    feature_importance = get_feature_importance_from_lime(explanations)
    
    print("\n" + "="*70)
    print("AGGREGATED FEATURE IMPORTANCE (Top 10)")
    print("="*70)
    for feature, importance in list(feature_importance.items())[:10]:
        print(f"{feature}: {importance:.4f}")
    
    print("\n" + "="*70)
    print("EXPERIMENT COMPLETE")
    print("="*70)


Original class distribution:
Call_Type
RUM    469
Name: count, dtype: int64

Filtered to classes with >= 20 samples
Number of samples: 469
Number of classes: 1
Class distribution:
Call_Type
RUM    469
Name: count, dtype: int64
A1: STACKING CLASSIFIER WITH MULTIPLE META-MODELS
Error with Logistic Regression: The number of classes has to be greater than one; got 1 class
Error with Random Forest: The number of classes has to be greater than one; got 1 class
Error with XGBoost: The number of classes has to be greater than one; got 1 class
Error with Decision Tree: The number of classes has to be greater than one; got 1 class

Meta-Model Comparison:
----------------------------------------------------------------------

A2: PIPELINE IMPLEMENTATION


ValueError: The number of classes has to be greater than one; got 1 class