# 📧 Spam Email Detection Project

This notebook implements a spam email detection system using TF-IDF vectorization and multiple machine learning models.

## Models Used:
- RandomForest
- GradientBoosting 
- NaiveBayes

## Evaluation Metrics:
- Jaccard Score
- Accuracy Score

## 1. Import Required Libraries

In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

# Sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, 
    jaccard_score, 
    classification_report, 
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)

# Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Make sure every NLTK resource used later is available locally,
# downloading only the ones that cannot be found.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # nltk.data.find signals a missing resource with LookupError
        nltk.download(package_name)

print("✅ All libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read the labelled email corpus from disk
df = pd.read_csv('email_classification_dataset.csv')

# Structural overview: size, column names, class balance
print("📊 Dataset Information:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

print("\n📈 Label Distribution:")
label_counts = df['label'].value_counts()
print(label_counts)

first_email_text = df.iloc[0]['email']
print(f"\n📧 Sample email length: {len(first_email_text)} characters")

# Preview the first rows (rendered as the cell's output)
df.head()

In [None]:
# Bar chart showing how many emails fall into each class
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='label', palette='viridis', ax=ax)
ax.set_title('Distribution of Email Labels (Ham vs Spam)', fontsize=14, fontweight='bold')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

# Per-column count of missing values
print("\n🔍 Missing Values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

## 3. Text Preprocessing Pipeline

In [None]:
# Lazily-built NLTK resources shared by every preprocess_text call
_STOP_WORDS = None
_STEMMER = None


def _get_text_resources():
    """Return the cached (stopword set, stemmer) pair, building it on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess_text(text):
    """
    Clean a raw email body and reduce it to a string of stemmed tokens.

    Steps:
    1. Convert to lowercase
    2. Remove email addresses and URLs
    3. Remove every character that is not a letter or whitespace
    4. Collapse repeated whitespace
    5. Tokenize
    6. Drop English stopwords and tokens of two characters or fewer
    7. Apply Porter stemming

    Args:
        text: Raw email text. Non-string values (e.g. None, or the NaN that
            pandas produces for a missing CSV cell) are treated as an empty
            email instead of raising.

    Returns:
        str: Space-separated stemmed tokens; '' when nothing survives cleaning.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); treat as empty
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove emails, URLs, and special patterns
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Nothing left to tokenize (same result the full pipeline would give)
    if not text:
        return ''

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword list and stemmer are built once and reused: loading the
    # stopword corpus for every email is pure overhead
    stop_words, stemmer = _get_text_resources()

    # Remove stopwords and very short tokens, then stem what remains
    stemmed_tokens = [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]

    return ' '.join(stemmed_tokens)

# Run the pipeline on the first email and show it before and after cleaning
sample_text = df.iloc[0]['email']
preprocessed = preprocess_text(sample_text)

print("📧 Original text (first 200 chars):")
print(sample_text[:200])

print("\n🔧 Preprocessed text:")
print(preprocessed[:200])

In [None]:
# Apply preprocessing to all emails
print("🔄 Preprocessing all emails...")
# Missing email bodies load as NaN (a float). Feed them through as empty
# strings so the text pipeline does not fail on them; the length filter
# below then removes those rows.
df['cleaned_email'] = df['email'].fillna('').astype(str).apply(preprocess_text)

# Check the results
print("✅ Preprocessing completed!")
print(f"\n📊 Average email length before preprocessing: {df['email'].str.len().mean():.2f} characters")
print(f"📊 Average email length after preprocessing: {df['cleaned_email'].str.len().mean():.2f} characters")

# Remove empty emails after preprocessing.
# .copy() makes the result an independent frame, so re-running this cell
# (which assigns a column on df) never writes onto a slice of another frame.
df = df[df['cleaned_email'].str.len() > 0].copy()
print(f"\n📈 Final dataset shape: {df.shape}")

## 4. TF-IDF Vectorization

In [None]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Keep top 5000 features
    ngram_range=(1, 2),  # Use unigrams and bigrams
    min_df=2,  # Ignore terms that appear in less than 2 documents
    max_df=0.95,  # Ignore terms that appear in more than 95% of documents
    sublinear_tf=True  # Apply log scaling
)

# Fit and transform the cleaned emails
# NOTE(review): fitting on the full corpus lets every document, including
# ones later held out for testing, influence the vocabulary and IDF weights.
# For a leakage-free evaluation the vectorizer should be fitted on training
# text only.
print("🔄 Applying TF-IDF vectorization...")
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_email'])
y = df['label']

print(f"✅ TF-IDF vectorization completed!")
print(f"📊 Feature matrix shape: {X_tfidf.shape}")
print(f"📊 Number of features: {len(tfidf_vectorizer.get_feature_names_out())}")

# Sparsity = share of cells that are NOT stored. On a SciPy sparse matrix
# `.size` is the number of stored values (the same as nnz), so the total
# cell count has to come from the shape instead.
n_documents, n_terms = X_tfidf.shape
density = X_tfidf.nnz / (n_documents * n_terms)
print(f"📊 Sparsity: {(1 - density) * 100:.2f}%")

## 5. Train-Test Split

In [None]:
# Split the data
# The split is done on the cleaned text rather than on an already-vectorized
# matrix, so the held-out emails stay unseen while the TF-IDF vocabulary and
# IDF weights are learned.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_email'], y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Maintain class distribution
)

# Refit the vectorizer on the training text only, then project the test text
# onto that vocabulary. From here on tfidf_vectorizer matches the feature
# space the models are trained in.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

print("📊 Data Split Information:")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"\n📈 Training set label distribution:")
print(y_train.value_counts())
print(f"\n📈 Test set label distribution:")
print(y_test.value_counts())

## 6. Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by the name used in reports and plots
models = {
    'RandomForest': RandomForestClassifier(
        n_estimators=100, max_depth=20, random_state=42, n_jobs=-1
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=100, max_depth=6, random_state=42
    ),
    'NaiveBayes': MultinomialNB(alpha=1.0),
}

# model name -> metric values plus the fitted estimator
results = {}

print("🚀 Training models...\n")

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Fit on the training split, predict on the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # All label-dependent metrics treat 'spam' as the positive class
    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'jaccard': jaccard_score(y_test, y_pred, pos_label='spam'),
        'f1': f1_score(y_test, y_pred, pos_label='spam'),
        'precision': precision_score(y_test, y_pred, pos_label='spam'),
        'recall': recall_score(y_test, y_pred, pos_label='spam'),
    }

    # Keep the fitted estimator next to its scores for later analysis
    results[model_name] = {**scores, 'model': model}

    print(f"✅ {model_name} Results:")
    print(f"   Accuracy: {scores['accuracy']:.4f}")
    print(f"   Jaccard Score: {scores['jaccard']:.4f}")
    print(f"   F1 Score: {scores['f1']:.4f}")
    print(f"   Precision: {scores['precision']:.4f}")
    print(f"   Recall: {scores['recall']:.4f}")
    print("-" * 50)

print("🎉 All models trained successfully!")

## 7. Model Comparison and Best Model Selection

In [None]:
# Display column -> key under which the metric is stored in `results`
metric_columns = {
    'Accuracy': 'accuracy',
    'Jaccard Score': 'jaccard',
    'F1 Score': 'f1',
    'Precision': 'precision',
    'Recall': 'recall',
}

# One row per model, one column per metric
comparison_rows = [
    {'Model': name, **{column: scores[key] for column, key in metric_columns.items()}}
    for name, scores in results.items()
]

# Rank the models by Jaccard Score, best first
comparison_df = pd.DataFrame(
    comparison_rows, columns=['Model', *metric_columns]
).sort_values('Jaccard Score', ascending=False)

print("📊 Model Comparison Results (Sorted by Jaccard Score):")
print(comparison_df.round(4))

# The top row after sorting is the winner
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']
best_jaccard_score = top_row['Jaccard Score']

print(f"\n🏆 Best Model: {best_model_name}")
print(f"🎯 Best Jaccard Score: {best_jaccard_score:.4f}")

In [None]:
# 2x2 grid: three single-metric bar charts plus one grouped chart
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

model_labels = comparison_df['Model']

# (axes, metric column, panel title, bar colour); the metric column name
# doubles as the y-axis label
single_metric_panels = [
    (axes[0, 0], 'Accuracy', 'Accuracy Scores', 'skyblue'),
    (axes[0, 1], 'Jaccard Score', 'Jaccard Scores', 'lightgreen'),
    (axes[1, 0], 'F1 Score', 'F1 Scores', 'orange'),
]

for ax, column, title, bar_color in single_metric_panels:
    ax.bar(model_labels, comparison_df[column], color=bar_color)
    ax.set_title(title)
    ax.set_ylabel(column)
    ax.tick_params(axis='x', rotation=45)
    # Print each value just above its bar
    for position, value in enumerate(comparison_df[column]):
        ax.text(position, value + 0.01, f'{value:.3f}', ha='center')

# Grouped bars: precision / recall / F1 side by side for every model
grouped_ax = axes[1, 1]
positions = range(len(comparison_df))
bar_width = 0.25
grouped_ax.bar([p - bar_width for p in positions], comparison_df['Precision'], bar_width, label='Precision', alpha=0.8)
grouped_ax.bar(positions, comparison_df['Recall'], bar_width, label='Recall', alpha=0.8)
grouped_ax.bar([p + bar_width for p in positions], comparison_df['F1 Score'], bar_width, label='F1 Score', alpha=0.8)
grouped_ax.set_title('Precision, Recall, and F1 Score')
grouped_ax.set_ylabel('Score')
grouped_ax.set_xticks(positions)
grouped_ax.set_xticklabels(model_labels, rotation=45)
grouped_ax.legend()

plt.tight_layout()
plt.show()

## 8. Detailed Analysis of Best Model

In [None]:
# Get best model
best_model = results[best_model_name]['model']
y_pred_best = best_model.predict(X_test)

# Detailed classification report
print(f"📋 Detailed Classification Report for {best_model_name}:")
print(classification_report(y_test, y_pred_best))

# Confusion Matrix
# Pin the row/column order to the classifier's own class order and derive the
# tick labels from it, so the axis labels cannot drift out of step with the
# matrix if the dataset's label values or their sort order differ.
class_labels = list(best_model.classes_)
tick_labels = [str(label).capitalize() for label in class_labels]

cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 9. Feature Importance Analysis (for tree-based models)

In [None]:
# Only estimators that expose feature_importances_ (the tree-based ones) can be analysed here
if not hasattr(best_model, 'feature_importances_'):
    print(f"⚠️ Feature importance not available for {best_model_name}")
else:
    # Pair every TF-IDF term with its importance and rank, highest first
    feature_names = tfidf_vectorizer.get_feature_names_out()
    importances = best_model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {'feature': feature_names, 'importance': importances}
    ).sort_values('importance', ascending=False)

    # Text listing of the 20 strongest terms
    print(f"🎯 Top 20 Most Important Features for {best_model_name}:")
    print(feature_importance_df.head(20))

    # Horizontal bar chart of the 15 strongest terms
    top_features = feature_importance_df.head(15)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(data=top_features, y='feature', x='importance', palette='viridis', ax=ax)
    ax.set_title(f'Top 15 Feature Importances - {best_model_name}')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Features')
    plt.tight_layout()
    plt.show()

## 10. Model Testing Function

In [None]:
def predict_email(email_text, model=best_model, vectorizer=tfidf_vectorizer):
    """
    Classify a single raw email with a trained model.

    The text goes through preprocess_text, is turned into a one-row feature
    matrix by the vectorizer, and is then scored by the model.

    Args:
        email_text (str): The email text to classify
        model: Trained model to use for prediction (must provide predict and
            predict_proba)
        vectorizer: Fitted TF-IDF vectorizer

    Returns:
        tuple: (prediction, probability) - the first element of
        model.predict(...) and the first row of model.predict_proba(...)
    """
    # Clean the text and wrap it in a list: transform expects an iterable of documents
    features = vectorizer.transform([preprocess_text(email_text)])

    # Single-row input, so take the first (only) result of each call
    predicted_label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    return predicted_label, class_probabilities

# Test with sample emails
test_emails = [
    "Congratulations! You've won $1000000! Click here to claim your prize now!",
    "Hi, this is a reminder about your appointment tomorrow at 2 PM. Please confirm.",
    "URGENT: Your account will be suspended unless you verify your details immediately!"
]

# predict_proba columns follow the classifier's classes_ order, so that list
# is used to find the column belonging to a given label
class_order = list(best_model.classes_)

print("🧪 Testing the best model with sample emails:\n")
for i, email in enumerate(test_emails, 1):
    prediction, probability = predict_email(email)
    # Confidence = probability assigned to the predicted class, looked up by
    # label rather than through a hard-coded column index
    confidence = probability[class_order.index(prediction)]

    print(f"Email {i}: {email[:50]}...")
    print(f"Prediction: {str(prediction).upper()}")
    print(f"Confidence: {confidence:.2%}")
    print("-" * 60)

## 11. Final Summary

In [None]:
# Metrics of the winning model, looked up once
best_scores = results[best_model_name]

# Assemble the report line by line, then emit it in a single print
summary_lines = [
    "📋 SPAM EMAIL DETECTION PROJECT SUMMARY",
    "=" * 50,
    f"📊 Dataset Size: {df.shape[0]} emails",
    "🔧 Preprocessing: Text cleaning, tokenization, stemming",
    # Column count of the matrix the models were trained on
    f"🎯 Vectorization: TF-IDF with {X_train.shape[1]} features",
    f"🤖 Models Tested: {', '.join(models.keys())}",
    f"\n🏆 BEST MODEL: {best_model_name}",
    f"🎯 Jaccard Score: {best_jaccard_score:.4f}",
    f"📈 Accuracy: {best_scores['accuracy']:.4f}",
    f"🎪 F1 Score: {best_scores['f1']:.4f}",
    "\n" + "=" * 50,
    "✅ Project completed successfully!",
]
print("\n".join(summary_lines))