# SMS Spam Detection with Explainable AI
## Training Models with LIME and SHAP Integration

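This notebook trains several SMS spam classifiers on a small synthetic dataset, picks the best one, and explains its predictions with LIME and SHAP. The sample data is only a placeholder: swap in a real SMS corpus before drawing any conclusions from the reported metrics.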

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

# Explainable AI libraries
try:
    import lime
    import lime.lime_text
    LIME_AVAILABLE = True
    print("✅ LIME available")
except ImportError:
    LIME_AVAILABLE = False
    print("❌ LIME not available - install with: pip install lime")

try:
    import shap
    SHAP_AVAILABLE = True
    print("✅ SHAP available")
except ImportError:
    SHAP_AVAILABLE = False
    print("❌ SHAP not available - install with: pip install shap")

print(f"\n📊 Libraries loaded successfully!")

In [None]:
# Create sample SMS dataset (you can replace this with real data)
def create_sample_dataset(n_repeats=10):
    """Build a small synthetic SMS corpus for demonstration purposes.

    Ten hand-written spam templates and ten ham templates are each repeated
    ``n_repeats`` times, so the returned frame holds ``20 * n_repeats`` rows.

    NOTE: every template is duplicated verbatim when ``n_repeats > 1``, so a
    random train/test split will place identical messages on both sides.
    Metrics computed on this data are therefore optimistic and only useful
    as a smoke test of the pipeline.

    Args:
        n_repeats: Number of copies of each template to include. The default
            of 10 yields 200 rows. Must be at least 1.

    Returns:
        pandas.DataFrame with two columns: ``message`` (raw text) and
        ``label`` (``'spam'`` or ``'ham'``). All spam rows precede the ham rows.

    Raises:
        ValueError: If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    # Sample spam messages
    spam_templates = [
        "FREE! Win a $1000 gift card! Click here now!",
        "URGENT! Your account will be suspended. Call 555-SCAM immediately!",
        "Congratulations! You've won a lottery! Send your details to claim prize!",
        "Limited time offer! Get rich quick! Click this link now!",
        "WINNER! You are selected for cash prize! Reply with your bank details!",
        "Free money! No strings attached! Call now to claim your reward!",
        "ALERT: Suspicious activity detected. Verify your account immediately!",
        "You have been chosen! Win big money! Text STOP to opt out!",
        "Exclusive offer! Make money from home! Limited time only!",
        "URGENT: Your payment is overdue. Pay now to avoid penalties!"
    ]

    # Sample legitimate messages
    ham_templates = [
        "Hi! Are we still meeting for lunch tomorrow at 12pm?",
        "Thanks for the meeting today. I'll send the report by Friday.",
        "Can you pick up milk on your way home? Thanks!",
        "The conference call is scheduled for 3pm. Dial-in details attached.",
        "Happy birthday! Hope you have a wonderful day!",
        "Reminder: Doctor appointment tomorrow at 2pm.",
        "Great job on the presentation! The client was impressed.",
        "Movie starts at 7pm. See you at the theater!",
        "Flight delayed by 30 minutes. New arrival time is 8:45pm.",
        "Package delivered successfully. Thank you for your order!"
    ]

    # Repeat the templates to get more samples
    spam_messages = spam_templates * n_repeats
    ham_messages = ham_templates * n_repeats

    # Create DataFrame (spam block first, then ham block)
    messages = spam_messages + ham_messages
    labels = ['spam'] * len(spam_messages) + ['ham'] * len(ham_messages)

    df = pd.DataFrame({
        'message': messages,
        'label': labels
    })

    return df

# Build the demo corpus, then report its size and class balance
df = create_sample_dataset()
label_distribution = df['label'].value_counts().to_dict()
print(f"📊 Dataset created with {len(df)} messages")
print(f"📈 Distribution: {label_distribution}")
df.head()

In [None]:
# Data preprocessing and feature engineering
def preprocess_text(text):
    """Normalize a raw SMS message before vectorization.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and runs of whitespace are
    collapsed to single spaces (leading/trailing whitespace is dropped).

    Args:
        text: The raw message. ``None`` and float NaN (as pandas uses for
            missing cells) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned message as a ``str`` (possibly empty).
    """
    import re

    # Missing cells from real-world data must not crash .apply(); treat them
    # as empty documents. `text != text` is the NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove special characters but keep spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Remove extra whitespace
    return ' '.join(text.split())

# Preprocess messages
df['processed_message'] = df['message'].apply(preprocess_text)

# Convert labels to binary. Series.map() silently produces NaN for any value
# missing from the mapping, which would only surface later as an obscure
# error in the stratified split, so reject unexpected labels here.
label_binary = df['label'].map({'ham': 0, 'spam': 1})
unmapped_rows = label_binary.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(df.loc[unmapped_rows, 'label'].astype(str).unique())
    raise ValueError(
        f"Unexpected label values (expected 'ham' or 'spam'): {unexpected_labels}"
    )
df['label_binary'] = label_binary

print("✅ Text preprocessing completed")
print("\nSample processed messages:")
# Show up to three examples; min() keeps this safe for very small datasets
for i in range(min(3, len(df))):
    print(f"Original: {df.iloc[i]['message']}")
    print(f"Processed: {df.iloc[i]['processed_message']}")
    print(f"Label: {df.iloc[i]['label']}\n")

In [None]:
# Inputs are the cleaned texts, targets are the 0/1 spam flags
X, y = df['processed_message'], df['label_binary']

# TF-IDF settings: unigrams + bigrams, English stop words removed, terms
# must occur in at least 2 documents and in at most 95% of them
tfidf_settings = dict(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"📊 Training set: {len(X_train)} messages")
print(f"📊 Test set: {len(X_test)} messages")

# Learn the vocabulary on the training texts only, then reuse it for the test texts
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"✅ TF-IDF vectorization completed")
print(f"📊 Feature matrix shape: {X_train_tfidf.shape}")
print(f"📊 Vocabulary size: {len(vectorizer.vocabulary_)}")

In [None]:
# Train multiple models for comparison
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(probability=True, random_state=42)  # probability=True enables predict_proba
}

# Train and evaluate models; one result record per model name
model_results = {}

for name, model in models.items():
    print(f"\n🔄 Training {name}...")
    
    # Train model on the full training split
    model.fit(X_train_tfidf, y_train)
    
    # Make predictions on the held-out test split
    y_pred = model.predict(X_test_tfidf)
    y_pred_proba = model.predict_proba(X_test_tfidf)
    
    # Calculate hold-out accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # 5-fold cross-validation score on the training split only
    cv_scores = cross_val_score(model, X_train_tfidf, y_train, cv=5)
    
    model_results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }
    
    print(f"✅ {name} - Accuracy: {accuracy:.3f}, CV: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")

# Select best model by mean cross-validated accuracy (ties broken by the
# smaller fold-to-fold spread, then by insertion order). Picking the winner
# on test accuracy would turn the test set into a tuning set and make the
# accuracy reported below optimistically biased.
best_model_name = max(
    model_results,
    key=lambda k: (model_results[k]['cv_mean'], -model_results[k]['cv_std'])
)
best_model = model_results[best_model_name]['model']

print(f"\n🏆 Best model: {best_model_name} with accuracy {model_results[best_model_name]['accuracy']:.3f}")

In [None]:
# Model interpretability - Feature importance for different model types
def get_feature_importance(model, vectorizer, model_name, top_n=20):
    """Rank vocabulary terms by how strongly a fitted model relies on them.

    Args:
        model: A fitted estimator. Models exposing ``coef_`` (e.g. logistic
            regression) are ranked by absolute coefficient of the first row;
            models exposing ``feature_importances_`` (tree ensembles) are
            ranked by importance. Anything else is unsupported.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``;
            names are paired positionally with the model's weights.
        model_name: Display name of the model. Currently unused; kept so
            existing callers keep working.
        top_n: Maximum number of (feature, weight) pairs to return
            (default 20).

    Returns:
        A list of ``(feature_name, weight)`` tuples sorted from most to least
        important, or ``None`` when the model exposes neither attribute.
        Coefficient weights keep their sign (the ranking uses the absolute value).
    """
    feature_names = vectorizer.get_feature_names_out()

    if hasattr(model, 'coef_'):  # Linear models
        coef = model.coef_
        # Some estimators expose a scipy sparse coef_ when fitted on sparse
        # input; zip() over that would iterate matrix rows, not weights.
        if hasattr(coef, 'toarray'):
            coef = coef.toarray()
        # For binary classification, coef_ shape is (1, n_features)
        coefficients = np.asarray(coef)[0]

        # Sort by absolute importance, keeping the signed value
        ranked = sorted(
            zip(feature_names, coefficients),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        return ranked[:top_n]

    if hasattr(model, 'feature_importances_'):  # Tree-based models
        ranked = sorted(
            zip(feature_names, model.feature_importances_),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return ranked[:top_n]

    # No recognised importance attribute on this model type
    return None

# Rank the vocabulary for the selected model (None/empty if unsupported)
feature_importance = get_feature_importance(best_model, vectorizer, best_model_name)

if not feature_importance:
    print(f"\n⚠️  Feature importance not available for {best_model_name}")
else:
    # Text listing of the ten strongest terms
    print(f"\n📊 Top features for {best_model_name}:")
    for rank, (term, weight) in enumerate(feature_importance[:10], 1):
        print(f"{rank:2d}. {term:15s} : {weight:8.4f}")

    # Horizontal bar chart of the fifteen strongest terms, strongest on top
    top_terms, top_weights = zip(*feature_importance[:15])
    bar_positions = range(len(top_terms))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(bar_positions, top_weights)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_terms)
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'Top 15 Features - {best_model_name}')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

In [None]:
# LIME Explanations
if LIME_AVAILABLE:
    print("\n🔍 Setting up LIME explainer...")
    
    # Create LIME explainer. class_names order must match the 0/1 label
    # encoding (0 = ham, 1 = spam). A fixed random_state makes the sampled
    # perturbations, and therefore the explanations, reproducible.
    lime_explainer = lime.lime_text.LimeTextExplainer(
        class_names=['ham', 'spam'],
        feature_selection='auto',
        verbose=False,
        random_state=42
    )
    
    # Create prediction function for LIME
    def predict_fn(texts):
        """Return class probabilities for raw texts.

        LIME passes perturbed versions of the *raw* message, so the same
        cleaning used for the training data is applied before vectorizing;
        otherwise the model would be explained on inputs it never saw in
        that form.

        Args:
            texts: Iterable of raw message strings.

        Returns:
            Whatever ``best_model.predict_proba`` returns for the vectorized
            texts (one row per text, columns ordered ham, spam).
        """
        cleaned = [preprocess_text(text) for text in texts]
        vectors = vectorizer.transform(cleaned)
        return best_model.predict_proba(vectors)
    
    # Test LIME on sample messages
    test_messages = [
        "FREE! Win a $1000 gift card! Click here now!",
        "Hi! Are we still meeting for lunch tomorrow?"
    ]
    
    for i, message in enumerate(test_messages):
        print(f"\n📝 LIME Analysis {i+1}: {message[:50]}...")
        
        # Generate LIME explanation for both classes
        explanation = lime_explainer.explain_instance(
            message,
            predict_fn,
            num_features=10,
            labels=[0, 1]
        )
        
        # Get prediction
        prediction_proba = predict_fn([message])[0]
        predicted_class = 'spam' if prediction_proba[1] > 0.5 else 'ham'
        confidence = max(prediction_proba)
        
        print(f"🎯 Prediction: {predicted_class} (confidence: {confidence:.3f})")
        print(f"📊 Probabilities: ham={prediction_proba[0]:.3f}, spam={prediction_proba[1]:.3f}")
        
        # Show top features. Weights are taken for label 1 (spam), so a
        # positive weight pushes the prediction towards spam.
        print("🔍 LIME Feature Analysis:")
        for feature, importance in explanation.as_list(label=1):
            direction = "→ SPAM" if importance > 0 else "→ HAM"
            print(f"   {feature:15s}: {importance:7.3f} {direction}")
        
        # Save explanation as HTML (optional); written to the working directory
        explanation.save_to_file(f'lime_explanation_{i+1}.html')
        print(f"💾 Saved explanation to lime_explanation_{i+1}.html")
        
else:
    print("\n⚠️  LIME not available - install with: pip install lime")

In [None]:
# SHAP Explanations
if SHAP_AVAILABLE:
    print("\n🔍 Setting up SHAP explainer...")
    
    try:
        def spam_class_values(raw_values):
            """Normalize explainer output to a dense (n_samples, n_features) array.

            Handles the formats this cell can receive: a per-class list
            (index 1 = spam), a 3-D array whose last axis is the class, a
            2-D array that already refers to a single output, a 1-D array
            for a single sample, or a sparse matrix.
            """
            if isinstance(raw_values, list):
                raw_values = raw_values[1]  # Spam class
            if hasattr(raw_values, 'toarray'):
                raw_values = raw_values.toarray()
            raw_values = np.asarray(raw_values)
            if raw_values.ndim == 3:
                raw_values = raw_values[:, :, 1]  # Spam class
            # A 2-D array is (n_samples, n_features) and must NOT be sliced
            # by column: that would select a feature, not a class.
            return np.atleast_2d(raw_values)

        # Create appropriate SHAP explainer based on model type
        if hasattr(best_model, 'coef_'):  # Linear models
            # For linear models, use LinearExplainer
            shap_explainer = shap.LinearExplainer(
                best_model, 
                X_train_tfidf,
                feature_perturbation="interventional"
            )
            explainer_type = "Linear"
            
        elif hasattr(best_model, 'feature_importances_'):  # Tree-based models
            # For tree models, use TreeExplainer
            shap_explainer = shap.TreeExplainer(best_model)
            explainer_type = "Tree"
            
        else:
            # For other models, use KernelExplainer (slower but works with any model)
            def model_predict(X):
                return best_model.predict_proba(X)[:, 1]  # Return spam probability
            
            # Use a subset of training data as background
            background = shap.sample(X_train_tfidf, 100)
            shap_explainer = shap.KernelExplainer(model_predict, background)
            explainer_type = "Kernel"
        
        print(f"✅ SHAP {explainer_type} explainer initialized")

        def explainer_input(matrix):
            """Return the rows in the form the chosen explainer accepts.

            TreeExplainer is given a dense ndarray; the other explainers
            receive the sparse TF-IDF matrix unchanged.
            """
            return matrix.toarray() if explainer_type == "Tree" else matrix
        
        # Feature names are positional labels for the SHAP value columns
        feature_names = vectorizer.get_feature_names_out()

        # Test SHAP on sample messages
        test_indices = [0, 1]  # First two test samples
        
        for idx in test_indices:
            message = X_test.iloc[idx]  # already preprocessed text
            true_label = 'spam' if y_test.iloc[idx] == 1 else 'ham'
            
            print(f"\n📝 SHAP Analysis: {message[:50]}...")
            print(f"🏷️  True label: {true_label}")
            
            # Transform message
            message_tfidf = vectorizer.transform([message])
            
            # Get prediction
            prediction_proba = best_model.predict_proba(message_tfidf)[0]
            predicted_class = 'spam' if prediction_proba[1] > 0.5 else 'ham'
            confidence = max(prediction_proba)
            
            print(f"🎯 Prediction: {predicted_class} (confidence: {confidence:.3f})")
            print(f"📊 Probabilities: ham={prediction_proba[0]:.3f}, spam={prediction_proba[1]:.3f}")
            
            # Generate SHAP values
            message_input = explainer_input(message_tfidf)
            if explainer_type == "Kernel":
                shap_values = shap_explainer.shap_values(message_input, nsamples=100)
            else:
                shap_values = shap_explainer.shap_values(message_input)
                
            # Reduce to the spam-class contributions, shape (1, n_features)
            shap_values = spam_class_values(shap_values)
            
            # Pair every feature with its contribution for this one message
            feature_contributions = list(zip(feature_names, shap_values[0]))
                
            # Sort by absolute contribution
            feature_contributions.sort(key=lambda x: abs(x[1]), reverse=True)
            
            print("🔍 SHAP Feature Analysis (Top 10):")
            for feature, contribution in feature_contributions[:10]:
                if abs(contribution) > 0.001:  # Only show meaningful contributions
                    direction = "→ SPAM" if contribution > 0 else "→ HAM"
                    print(f"   {feature:15s}: {contribution:7.3f} {direction}")
        
        # Create SHAP summary plot
        print("\n📊 Creating SHAP summary plot...")
        
        # Get SHAP values for a subset of test data
        test_subset = X_test_tfidf[:10]  # First 10 test samples
        subset_input = explainer_input(test_subset)
        
        if explainer_type == "Kernel":
            shap_values_subset = shap_explainer.shap_values(subset_input, nsamples=50)
        else:
            shap_values_subset = shap_explainer.shap_values(subset_input)
            
        # Handle different SHAP value formats
        shap_values_subset = spam_class_values(shap_values_subset)
            
        # Create summary plot. The feature values are passed as a dense
        # array because the plot indexes and colours them column by column.
        plt.figure(figsize=(12, 8))
        shap.summary_plot(
            shap_values_subset, 
            test_subset.toarray(), 
            feature_names=feature_names,
            max_display=20,
            show=False
        )
        plt.title('SHAP Feature Importance Summary')
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        # Best-effort section: report the problem and let the notebook continue
        print(f"⚠️  SHAP analysis error: {e}")
        print("This might be due to model compatibility or data format issues.")
        
else:
    print("\n⚠️  SHAP not available - install with: pip install shap")

In [None]:
# Save the trained model and vectorizer
import os

# Create models directory if it doesn't exist
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Artifact locations; the model file name is derived from the model's display name
model_path = f'{models_dir}/spam_model_{best_model_name.lower().replace(" ", "_")}.joblib'
vectorizer_path = f'{models_dir}/tfidf_vectorizer.joblib'
metadata_path = f'{models_dir}/model_metadata.json'

# Save model
joblib.dump(best_model, model_path)
print(f"💾 Model saved to: {model_path}")

# Save vectorizer (needed at inference time to reproduce the feature space)
joblib.dump(vectorizer, vectorizer_path)
print(f"💾 Vectorizer saved to: {vectorizer_path}")

# Save metadata. Numpy scalars are cast to float so json can serialize them.
metadata = {
    'model_name': best_model_name,
    'model_type': type(best_model).__name__,
    'accuracy': float(model_results[best_model_name]['accuracy']),
    'cv_mean': float(model_results[best_model_name]['cv_mean']),
    'cv_std': float(model_results[best_model_name]['cv_std']),
    'feature_count': len(vectorizer.vocabulary_),
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'lime_available': LIME_AVAILABLE,
    'shap_available': SHAP_AVAILABLE,
    # Only claim explainability when at least one explainer library is usable
    'explainable_ai': bool(LIME_AVAILABLE or SHAP_AVAILABLE)
}

with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)
    
print(f"💾 Metadata saved to: {metadata_path}")
print(f"\n✅ Model training and explainable AI setup complete!")
print(f"🎯 Best model: {best_model_name} with {metadata['accuracy']:.3f} accuracy")
print(f"🔍 Explainable AI: LIME={LIME_AVAILABLE}, SHAP={SHAP_AVAILABLE}")