# DistAwareAug vs SMOTE Benchmark

This notebook compares DistAwareAug with SMOTE (Synthetic Minority Oversampling Technique) on various datasets to demonstrate the advantages of distribution-aware augmentation.

## Comparison Metrics:
- **Sample Quality**: Distribution similarity, diversity scores
- **Classification Performance**: Accuracy, F1-score, AUC-ROC
- **Computational Efficiency**: Runtime comparisons
- **Visual Analysis**: Sample distribution plots

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_blobs, load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from collections import Counter
import time
import sys
import os

# Install imbalanced-learn if not available
try:
    from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
    print("✅ imbalanced-learn found")
except ImportError:
    print("❌ Installing imbalanced-learn...")
    import importlib
    import subprocess
    # Use `sys.executable -m pip` so the package lands in the interpreter that
    # runs this kernel rather than whichever `pip` happens to be on PATH.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "imbalanced-learn"])
    # A package installed while the interpreter is running can be hidden by the
    # import system's cached directory listings; clear them before retrying.
    importlib.invalidate_caches()
    from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
    print("✅ imbalanced-learn installed")

# Make a local (non-installed) distawareaug checkout importable.
# abspath('.') is the working directory itself, so one dirname() yields its
# parent and two yield its grandparent. Both are added because which of them
# holds the package depends on where the notebook sits in the repository.
# NOTE(review): confirm the repository layout and drop the unused entry.
_working_dir = os.path.abspath('.')
_parent_dir = os.path.dirname(_working_dir)
for _candidate_dir in (_parent_dir, os.path.dirname(_parent_dir)):
    # Skip entries already present so re-running the cell does not grow sys.path.
    if _candidate_dir not in sys.path:
        sys.path.append(_candidate_dir)

# Import DistAwareAug
from distawareaug import DistAwareAugmentor
from distawareaug.distance import DistanceMetrics
from distawareaug.utils import check_class_balance

plt.style.use('seaborn-v0_8')
print("✅ All libraries imported successfully!")

In [None]:
# Define comparison framework
def evaluate_oversampling_method(X_train, y_train, X_test, y_test, method_name, oversampler, classifier):
    """Resample the training set, fit a classifier on it and score it on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; passed unchanged to
        ``oversampler.fit_resample``.
    X_test, y_test
        Held-out features and labels used only for scoring.
    method_name
        Label stored under the ``'method'`` key of the result.
    oversampler
        Object exposing ``fit_resample(X, y)`` that returns the pair
        ``(X_resampled, y_resampled)``.
    classifier
        Object exposing ``fit`` and ``predict``; ``predict_proba`` is optional.
        It is fitted in place on the resampled data.

    Returns
    -------
    dict
        Keys: ``'method'``, ``'accuracy'``, ``'f1_score'`` (weighted average),
        ``'auc_roc'``, ``'oversample_time'``, ``'train_time'``, ``'total_time'``
        (all three in seconds), ``'n_synthetic'``, ``'original_counts'``,
        ``'resampled_counts'``, ``'X_resampled'`` and ``'y_resampled'``.
        ``'auc_roc'`` is ``None`` unless ``y_test`` contains exactly two
        distinct labels and the classifier returns a two-column probability
        matrix.
    """
    # perf_counter() is monotonic and high-resolution. time.time() follows the
    # wall clock, which can be adjusted mid-run and may be too coarse to
    # separate fast oversamplers.
    started = time.perf_counter()
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    oversample_time = time.perf_counter() - started

    started = time.perf_counter()
    classifier.fit(X_resampled, y_resampled)
    train_time = time.perf_counter() - started

    # Hard predictions drive accuracy and F1.
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # AUC-ROC is reported for binary problems only. Probabilities are requested
    # only when they will be used, and column 1 is scored only when there are
    # exactly two columns; otherwise it is not the positive-class score (or
    # does not exist at all).
    auc = None
    if len(np.unique(y_test)) == 2 and hasattr(classifier, 'predict_proba'):
        probabilities = np.asarray(classifier.predict_proba(X_test))
        if probabilities.ndim == 2 and probabilities.shape[1] == 2:
            auc = roc_auc_score(y_test, probabilities[:, 1])

    # Class distribution before and after resampling.
    original_counts = Counter(y_train)
    resampled_counts = Counter(y_resampled)
    # Counts rows added by the oversampler (assumes it only appends samples).
    n_synthetic = len(y_resampled) - len(y_train)

    return {
        'method': method_name,
        'accuracy': accuracy,
        'f1_score': f1,
        'auc_roc': auc,
        'oversample_time': oversample_time,
        'train_time': train_time,
        'total_time': oversample_time + train_time,
        'n_synthetic': n_synthetic,
        'original_counts': original_counts,
        'resampled_counts': resampled_counts,
        'X_resampled': X_resampled,
        'y_resampled': y_resampled
    }

print("✅ Evaluation framework defined!")