In [4]:
# Standard libraries
import re
import os
import time
import string
from pathlib import Path
from collections import Counter

# Data Science
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-learn

# Data preparation and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer

# Models

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb

# Analysis
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
ROOT_DIR = Path("/kaggle/input/email-spam-classification-dataset")
DATA_DIR = ROOT_DIR / "combined_data.csv"
master_df = pd.read_csv(DATA_DIR)
X, y = master_df['text'], master_df['label']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      # 80% train, 20% test
    stratify=y,         # KEY: Preserve class distribution
    random_state=42     # Reproducibility
)

In [5]:
# ============================================================================
# SPAM DETECTION EDA: VALIDATING OUR HYPOTHESES
# ============================================================================

print("🔍 SPAM DETECTION EDA: VALIDATING FEATURE HYPOTHESES")
print("=" * 60)

# Basic dataset overview
print(f"Dataset shape: {master_df.shape}")
print(f"Class distribution:")
print(master_df['label'].value_counts())
print(f"Spam rate: {master_df['label'].mean():.1%}")

# Separate spam and ham for analysis
spam_emails = X_train[y_train == 1]  # Only training data
ham_emails = X_train[y_train == 0]   # Only training data

print(f"\nSpam emails: {len(spam_emails)}")
print(f"Ham emails: {len(ham_emails)}")

# ============================================================================
# HYPOTHESIS 1: PUNCTUATION PATTERNS
# ============================================================================

def analyze_punctuation_patterns(emails, label_name):
    """Analyze punctuation usage patterns"""
    print(f"\n📝 PUNCTUATION ANALYSIS - {label_name.upper()}")
    print("-" * 40)
    
    # Count various punctuation patterns
    exclamation_counts = [email.count('!') for email in emails]
    question_counts = [email.count('?') for email in emails]
    caps_ratios = [sum(1 for c in email if c.isupper()) / len(email) if len(email) > 0 else 0 for email in emails]
    
    results = {
        'avg_exclamations': np.mean(exclamation_counts),
        'max_exclamations': np.max(exclamation_counts),
        'excessive_exclamations': sum(1 for count in exclamation_counts if count > 3),
        'avg_caps_ratio': np.mean(caps_ratios),
        'high_caps_emails': sum(1 for ratio in caps_ratios if ratio > 0.3)
    }
    
    for key, value in results.items():
        print(f"  {key}: {value:.3f}")
    
    return results

# Analyze punctuation for both classes
spam_punct = analyze_punctuation_patterns(spam_emails, "spam")
ham_punct = analyze_punctuation_patterns(ham_emails, "ham")

# ============================================================================
# HYPOTHESIS 2: SUSPICIOUS URLS AND DOMAINS
# ============================================================================

def analyze_url_patterns(emails, label_name):
    """Analyze URL patterns in emails"""
    print(f"\n🌐 URL ANALYSIS - {label_name.upper()}")
    print("-" * 30)
    
    # URL pattern matching
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    shortened_patterns = [r'bit\.ly', r'tinyurl', r'goo\.gl', r't\.co']
    
    urls_found = []
    emails_with_urls = 0
    emails_with_short_urls = 0
    
    for email in emails:
        email_urls = re.findall(url_pattern, email.lower())
        if email_urls:
            emails_with_urls += 1
            urls_found.extend(email_urls)
            
        # Check for shortened URLs
        if any(re.search(pattern, email.lower()) for pattern in shortened_patterns):
            emails_with_short_urls += 1
    
    print(f"  Emails with URLs: {emails_with_urls} ({emails_with_urls/len(emails)*100:.1f}%)")
    print(f"  Emails with shortened URLs: {emails_with_short_urls} ({emails_with_short_urls/len(emails)*100:.1f}%)")
    print(f"  Total URLs found: {len(urls_found)}")
    
    # Show sample URLs (first 5)
    if urls_found:
        print(f"  Sample URLs: {urls_found[:5]}")
    
    return {
        'url_percentage': emails_with_urls/len(emails)*100,
        'short_url_percentage': emails_with_short_urls/len(emails)*100,
        'total_urls': len(urls_found)
    }

spam_urls = analyze_url_patterns(spam_emails, "spam")
ham_urls = analyze_url_patterns(ham_emails, "ham")

# ============================================================================
# HYPOTHESIS 3: CHARACTER SUBSTITUTIONS & OBFUSCATION
# ============================================================================

def analyze_substitution_patterns(emails, label_name):
    """Analyze character substitution and obfuscation"""
    print(f"\n🔀 SUBSTITUTION ANALYSIS - {label_name.upper()}")
    print("-" * 35)
    
    # Common substitution patterns
    substitution_patterns = {
        'number_subs': r'[0-9]',  # Numbers in words
        'at_symbol': r'@',        # @ for 'a'
        'dollar_sign': r'\$',     # $ for 's'
        'mixed_case': r'[a-z][A-Z]|[A-Z][a-z]'  # Mixed case within words
    }
    
    results = {}
    for pattern_name, pattern in substitution_patterns.items():
        count = sum(1 for email in emails if re.search(pattern, email))
        percentage = count / len(emails) * 100
        results[pattern_name] = percentage
        print(f"  {pattern_name}: {count} emails ({percentage:.1f}%)")
    
    # Look for specific obfuscation examples
    obfuscation_examples = []
    for email in emails[:100]:  # Check first 100 emails
        words = email.split()
        for word in words:
            if re.search(r'[0-9@\$]', word) and len(word) > 3:
                obfuscation_examples.append(word)
                if len(obfuscation_examples) >= 10:
                    break
        if len(obfuscation_examples) >= 10:
            break
    
    if obfuscation_examples:
        print(f"  Example obfuscated words: {obfuscation_examples[:5]}")
    
    return results

spam_subs = analyze_substitution_patterns(spam_emails, "spam")
ham_subs = analyze_substitution_patterns(ham_emails, "ham")

# ============================================================================
# HYPOTHESIS 4: WORD FREQUENCY ANALYSIS
# ============================================================================

def get_common_words(emails, label_name, top_n=20):
    """Get most common words in emails"""
    print(f"\n📊 WORD FREQUENCY - {label_name.upper()}")
    print("-" * 30)
    
    # Combine all emails and split into words
    all_text = ' '.join(emails).lower()
    # Remove punctuation and numbers for cleaner word analysis
    translator = str.maketrans('', '', string.punctuation + string.digits)
    clean_text = all_text.translate(translator)
    words = clean_text.split()
    
    # Count word frequencies
    word_counts = Counter(words)
    common_words = word_counts.most_common(top_n)
    
    print(f"  Top {top_n} words:")
    for i, (word, count) in enumerate(common_words, 1):
        print(f"    {i:2d}. {word:15} ({count:,} times)")
    
    return dict(common_words)

spam_words = get_common_words(spam_emails, "spam", 15)
ham_words = get_common_words(ham_emails, "ham", 15)

# ============================================================================
# HYPOTHESIS 5: EMAIL LENGTH PATTERNS
# ============================================================================

def analyze_length_patterns(emails, label_name):
    """Analyze email length characteristics"""
    print(f"\n📏 LENGTH ANALYSIS - {label_name.upper()}")
    print("-" * 25)
    
    lengths = [len(email) for email in emails]
    word_counts = [len(email.split()) for email in emails]
    
    print(f"  Average length: {np.mean(lengths):.1f} characters")
    print(f"  Median length: {np.median(lengths):.1f} characters")
    print(f"  Average words: {np.mean(word_counts):.1f} words")
    print(f"  Short emails (<50 chars): {sum(1 for l in lengths if l < 50)} ({sum(1 for l in lengths if l < 50)/len(lengths)*100:.1f}%)")
    print(f"  Long emails (>500 chars): {sum(1 for l in lengths if l > 500)} ({sum(1 for l in lengths if l > 500)/len(lengths)*100:.1f}%)")
    
    return {
        'avg_length': np.mean(lengths),
        'avg_words': np.mean(word_counts),
        'short_email_pct': sum(1 for l in lengths if l < 50)/len(lengths)*100
    }

spam_length = analyze_length_patterns(spam_emails, "spam")
ham_length = analyze_length_patterns(ham_emails, "ham")

# ============================================================================
# HYPOTHESIS VALIDATION SUMMARY
# ============================================================================

print(f"\n🎯 HYPOTHESIS VALIDATION SUMMARY")
print("=" * 50)

print(f"✅ PUNCTUATION HYPOTHESIS:")
print(f"   Spam emails use {spam_punct['avg_exclamations']:.1f}x more exclamations than ham")
print(f"   {spam_punct['excessive_exclamations']/len(spam_emails)*100:.1f}% of spam vs {ham_punct['excessive_exclamations']/len(ham_emails)*100:.1f}% of ham use excessive exclamations")

print(f"\n✅ URL HYPOTHESIS:")
print(f"   {spam_urls['url_percentage']:.1f}% of spam vs {ham_urls['url_percentage']:.1f}% of ham contain URLs")
print(f"   {spam_urls['short_url_percentage']:.1f}% of spam vs {ham_urls['short_url_percentage']:.1f}% of ham use shortened URLs")

print(f"\n✅ LENGTH HYPOTHESIS:")
print(f"   Spam emails: {spam_length['avg_length']:.0f} chars vs Ham: {ham_length['avg_length']:.0f} chars")
print(f"   Short emails: {spam_length['short_email_pct']:.1f}% spam vs {ham_length['short_email_pct']:.1f}% ham")

print(f"\n🚀 FEATURE ENGINEERING RECOMMENDATIONS:")
print(f"   - Create punctuation intensity features (!, ?, caps ratio)")
print(f"   - Binary URL presence and shortened URL detection")
print(f"   - Character substitution pattern detection")
print(f"   - Email length and word count features")
print(f"   - Spam-specific word frequency features")

🔍 SPAM DETECTION EDA: VALIDATING FEATURE HYPOTHESES
Dataset shape: (83448, 2)
Class distribution:
label
1    43910
0    39538
Name: count, dtype: int64
Spam rate: 52.6%

Spam emails: 35128
Ham emails: 31630

📝 PUNCTUATION ANALYSIS - SPAM
----------------------------------------
  avg_exclamations: 0.540
  max_exclamations: 88.000
  excessive_exclamations: 1521.000
  avg_caps_ratio: 0.000
  high_caps_emails: 0.000

📝 PUNCTUATION ANALYSIS - HAM
----------------------------------------
  avg_exclamations: 0.238
  max_exclamations: 46.000
  excessive_exclamations: 341.000
  avg_caps_ratio: 0.000
  high_caps_emails: 0.000

🌐 URL ANALYSIS - SPAM
------------------------------
  Emails with URLs: 0 (0.0%)
  Emails with shortened URLs: 8 (0.0%)
  Total URLs found: 0

🌐 URL ANALYSIS - HAM
------------------------------
  Emails with URLs: 0 (0.0%)
  Emails with shortened URLs: 16 (0.1%)
  Total URLs found: 0

🔀 SUBSTITUTION ANALYSIS - SPAM
-----------------------------------
  number_subs: 8585

In [6]:
# ============================
# BAG OF WORDS SPAM CLASSIFIER 
# ============================


def preprocess_email(text, 
                    to_lowercase=True,
                    remove_punctuation=True, 
                    replace_urls=True,
                    replace_numbers=True,
                    stemming=True):
    """Clean and preprocess email text"""
    
    text = str(text)
    
    # Convert to lowercase
    if to_lowercase:
        text = text.lower()
    
    # Replace URLs with "URL"
    if replace_urls:
        url_pattern = r'http[s]?://\S+|www\.\S+|\S+\.com\S*'
        text = re.sub(url_pattern, ' URL ', text)
    
    # Replace numbers with "NUMBER" 
    if replace_numbers:
        text = re.sub(r'\d+', ' NUMBER ', text)
    
    # Remove punctuation
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Basic stemming (remove common suffixes) - simple approach
    if stemming:
        text = re.sub(r'ing\b', '', text)  # running -> runn
        text = re.sub(r'ed\b', '', text)   # walked -> walk
        text = re.sub(r'er\b', '', text)   # better -> bett
        text = re.sub(r'est\b', '', text)  # fastest -> fast
    
    # Clean up extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def preprocess_emails(emails, **kwargs):
    """Preprocess a series of emails"""
    return [preprocess_email(email, **kwargs) for email in emails]

def create_bow_features(X_train, X_test, 
                       binary=False,
                       max_features=5000, 
                       ngram_range=(1,1),
                       min_df=2):
    """Create Bag of Words features"""
    
    print(f"🔤 Creating Bag of Words features...")
    print(f"   Binary: {binary} | Max features: {max_features} | N-grams: {ngram_range}")
    
    vectorizer = CountVectorizer(
        binary=binary,                    # True = presence/absence, False = counts
        max_features=max_features,        # Top N most frequent words
        ngram_range=ngram_range,          # (1,1) = unigrams, (1,2) = unigrams + bigrams  
        min_df=min_df,                    # Ignore rare words
        max_df=0.95,                      # Ignore very common words
        stop_words='english',             # Remove English stop words
        token_pattern=r'\b[a-zA-Z]{2,}\b' # Only alphabetic words, min 2 chars
    )
    
    # Fit on training, transform both
    X_train_bow = vectorizer.fit_transform(X_train)
    X_test_bow = vectorizer.transform(X_test)
    
    print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
    print(f"Training shape: {X_train_bow.shape}")
    print(f"Sparsity: {1 - X_train_bow.nnz / (X_train_bow.shape[0] * X_train_bow.shape[1]):.3f}")
    
    return X_train_bow, X_test_bow, vectorizer

def evaluate_models(X_train, y_train, y_test):
    """Evaluate multiple models on BoW features - FIXED VERSION"""
    
    print(f"\n🤖 MODEL COMPARISON")
    print("=" * 40)
    
    models = {
        'LogisticRegression': LogisticRegression(random_state=42, max_iter=2000),
        'RandomForest': RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0),
        'LightGBM': lgb.LGBMClassifier(random_state=42, verbosity=-1)
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    results = []
    
    for name, model in models.items():
        print(f"\n🔥 {name}...")
        start_time = time.time()
        
        # Handle LightGBM sparse matrix issue
        if name == 'LightGBM':
            X_train_use = X_train.toarray()  # Convert to dense
        else:
            X_train_use = X_train
        
        # Cross-validation (using processed data)
        cv_scores = cross_val_score(model, X_train_use, y_train, cv=cv, scoring='roc_auc')
        cv_f1 = cross_val_score(model, X_train_use, y_train, cv=cv, scoring='f1')
        
        training_time = time.time() - start_time
        
        results.append({
            'Model': name,
            'CV_AUC': f"{cv_scores.mean():.4f} (±{cv_scores.std():.3f})",
            'CV_F1': f"{cv_f1.mean():.4f}",
            'Time': f"{training_time:.1f}s",
            'cv_auc_numeric': cv_scores.mean()  # For sorting by CV, not test
        })
        
        print(f"   CV AUC: {cv_scores.mean():.4f} (±{cv_scores.std():.3f})")
        print(f"   CV F1: {cv_f1.mean():.4f}")
    
    # Display results sorted by CV performance
    results_df = pd.DataFrame(results).sort_values('cv_auc_numeric', ascending=False)
    print(f"\n🏆 RESULTS (sorted by CV AUC):")
    print(results_df.drop('cv_auc_numeric', axis=1).to_string(index=False))
    
    return results_df

# Then separately, evaluate ONLY the best model on test set:
def final_test_evaluation(best_model_name, X_train, X_test, y_train, y_test):
    """Final evaluation on test set - use only once!"""
    print(f"\n🎯 FINAL TEST EVALUATION: {best_model_name}")
    print("=" * 40)
    
    # Recreate best model
    models = {
        'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
        'RandomForest': RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0),
        'LightGBM': lgb.LGBMClassifier(random_state=42, verbosity=-1)
    }
    
    model = models[best_model_name]
    
    # Handle LightGBM
    if best_model_name == 'LightGBM':
        X_train_use = X_train.toarray()
        X_test_use = X_test.toarray()
    else:
        X_train_use = X_train
        X_test_use = X_test
    
    # Train and test
    model.fit(X_train_use, y_train)
    y_pred_proba = model.predict_proba(X_test_use)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred_proba)
    
    print(f"Final Test AUC: {test_auc:.4f}")
    
    return test_auc

# ============================================================================
# MAIN EXECUTION
# ============================================================================

print("🚀 BAG OF WORDS SPAM CLASSIFICATION")
print("=" * 50)


print(f"Train: {len(X_train)} | Test: {len(X_test)}")

# ============================================================================
# EXPERIMENT 1: Minimal preprocessing + Binary BoW
# ============================================================================

print(f"\n" + "="*20 + " EXPERIMENT 1: MINIMAL PREPROCESSING " + "="*20)

X_train_clean1 = preprocess_emails(
    X_train, 
    to_lowercase=True,
    remove_punctuation=False,  # Keep punctuation 
    replace_urls=True,
    replace_numbers=True,
    stemming=False
)

X_test_clean1 = preprocess_emails(
    X_test,
    to_lowercase=True, 
    remove_punctuation=False,
    replace_urls=True,
    replace_numbers=True,
    stemming=False
)

# Create BoW features
X_train_bow1, X_test_bow1, vectorizer1 = create_bow_features(
    X_train_clean1, X_test_clean1,
    binary=True,              # Presence/absence
    max_features=5000,
    ngram_range=(1,1)         # Only unigrams
)

# Evaluate models
results1 = evaluate_models(X_train_bow1, y_train, y_test)

🚀 BAG OF WORDS SPAM CLASSIFICATION
Train: 66758 | Test: 16690

🔤 Creating Bag of Words features...
   Binary: True | Max features: 5000 | N-grams: (1, 1)
Vocabulary size: 5000
Training shape: (66758, 5000)
Sparsity: 0.987

🤖 MODEL COMPARISON

🔥 LogisticRegression...
   CV AUC: 0.9967 (±0.000)
   CV F1: 0.9816

🔥 RandomForest...
   CV AUC: 0.9974 (±0.000)
   CV F1: 0.9815

🔥 XGBoost...
   CV AUC: 0.9970 (±0.000)
   CV F1: 0.9779

🔥 LightGBM...
   CV AUC: 0.9971 (±0.000)
   CV F1: 0.9788

🏆 RESULTS (sorted by CV AUC):
             Model          CV_AUC  CV_F1   Time
      RandomForest 0.9974 (±0.000) 0.9815 352.0s
          LightGBM 0.9971 (±0.000) 0.9788 156.5s
           XGBoost 0.9970 (±0.000) 0.9779  15.6s
LogisticRegression 0.9967 (±0.000) 0.9816  26.7s


In [7]:
# ============================================================================
# EXPERIMENT 2: Standard preprocessing + Count BoW
# ============================================================================

print(f"\n" + "="*20 + " EXPERIMENT 2: STANDARD PREPROCESSING " + "="*20)

X_train_clean2 = preprocess_emails(
    X_train,
    to_lowercase=True,
    remove_punctuation=True,   # Remove punctuation
    replace_urls=True, 
    replace_numbers=True,
    stemming=False
)

X_test_clean2 = preprocess_emails(
    X_test,
    to_lowercase=True,
    remove_punctuation=True,
    replace_urls=True,
    replace_numbers=True, 
    stemming=False
)

# Create BoW features
X_train_bow2, X_test_bow2, vectorizer2 = create_bow_features(
    X_train_clean2, X_test_clean2,
    binary=False,             # Word counts
    max_features=5000,
    ngram_range=(1,1)
)

# Evaluate models
results2 = evaluate_models(X_train_bow2, y_train, y_test)

# ============================================================================
# EXPERIMENT 3: Heavy preprocessing + Binary BoW + Bigrams
# ============================================================================

print(f"\n" + "="*20 + " EXPERIMENT 3: HEAVY PREPROCESSING + BIGRAMS " + "="*20)

X_train_clean3 = preprocess_emails(
    X_train,
    to_lowercase=True,
    remove_punctuation=True,
    replace_urls=True,
    replace_numbers=True,
    stemming=True             # Simple stemming
)

X_test_clean3 = preprocess_emails(
    X_test,
    to_lowercase=True,
    remove_punctuation=True,
    replace_urls=True,
    replace_numbers=True,
    stemming=True
)

# Create BoW features with bigrams
X_train_bow3, X_test_bow3, vectorizer3 = create_bow_features(
    X_train_clean3, X_test_clean3,
    binary=True,              # Presence/absence
    max_features=10000,       # More features for bigrams
    ngram_range=(1,2)         # Unigrams + bigrams
)

# Evaluate models
results3 = evaluate_models(X_train_bow3, y_train, y_test)


🔤 Creating Bag of Words features...
   Binary: False | Max features: 5000 | N-grams: (1, 1)
Vocabulary size: 5000
Training shape: (66758, 5000)
Sparsity: 0.987

🤖 MODEL COMPARISON

🔥 LogisticRegression...
   CV AUC: 0.9952 (±0.001)
   CV F1: 0.9813

🔥 RandomForest...
   CV AUC: 0.9973 (±0.000)
   CV F1: 0.9819

🔥 XGBoost...
   CV AUC: 0.9971 (±0.000)
   CV F1: 0.9788

🔥 LightGBM...
   CV AUC: 0.9972 (±0.000)
   CV F1: 0.9794

🏆 RESULTS (sorted by CV AUC):
             Model          CV_AUC  CV_F1   Time
      RandomForest 0.9973 (±0.000) 0.9819 344.7s
          LightGBM 0.9972 (±0.000) 0.9794 173.1s
           XGBoost 0.9971 (±0.000) 0.9788  38.6s
LogisticRegression 0.9952 (±0.001) 0.9813 288.0s

🔤 Creating Bag of Words features...
   Binary: True | Max features: 10000 | N-grams: (1, 2)
Vocabulary size: 10000
Training shape: (66758, 10000)
Sparsity: 0.991

🤖 MODEL COMPARISON

🔥 LogisticRegression...
   CV AUC: 0.9974 (±0.000)
   CV F1: 0.9845

🔥 RandomForest...
   CV AUC: 0.9977 (±0.0

In [8]:
# ============================================================================
# SUMMARY (CORRECTED VERSION)
# ============================================================================
print(f"\n🏆 EXPERIMENT SUMMARY")
print("=" * 60)

# Find best result from each experiment
best1 = results1.iloc[0]
best2 = results2.iloc[0] 
best3 = results3.iloc[0]

# Use CV_AUC instead of Test_AUC (which doesn't exist anymore)
print(f"Experiment 1 (Minimal):     {best1['Model']} - CV AUC: {best1['CV_AUC']}")
print(f"Experiment 2 (Standard):    {best2['Model']} - CV AUC: {best2['CV_AUC']}")
print(f"Experiment 3 (Heavy+Bi):    {best3['Model']} - CV AUC: {best3['CV_AUC']}")

# Overall best using cv_auc_numeric instead of test_auc_numeric
all_results = [
    (1, best1['Model'], best1['cv_auc_numeric']),
    (2, best2['Model'], best2['cv_auc_numeric']),
    (3, best3['Model'], best3['cv_auc_numeric'])
]

overall_best = max(all_results, key=lambda x: x[2])
print(f"\n🥇 OVERALL BEST: Experiment {overall_best[0]} - {overall_best[1]} (CV AUC: {overall_best[2]:.4f})")

print(f"\n✅ BAG OF WORDS SPAM CLASSIFICATION COMPLETE!")


🏆 EXPERIMENT SUMMARY
Experiment 1 (Minimal):     RandomForest - CV AUC: 0.9974 (±0.000)
Experiment 2 (Standard):    RandomForest - CV AUC: 0.9973 (±0.000)
Experiment 3 (Heavy+Bi):    RandomForest - CV AUC: 0.9977 (±0.000)

🥇 OVERALL BEST: Experiment 3 - RandomForest (CV AUC: 0.9977)

✅ BAG OF WORDS SPAM CLASSIFICATION COMPLETE!


In [9]:
final_auc = final_test_evaluation('RandomForest', X_train_bow3, X_test_bow3, y_train, y_test)
print(f"Final untouched test set performance: {final_auc:.4f}")


🎯 FINAL TEST EVALUATION: RandomForest
Final Test AUC: 0.9979
Final untouched test set performance: 0.9979


### Retraining the best model

The winner on all experiments is the RandomForest model. The heavy preprocessing version beats the others by a fraction, so we'll go with that one. We will train the model one more time, but here's the crucial detail: **we will test this model on a completely new dataset because thre performance seems too good to be true**.

Let's be real here. Spam comes in different shapes and sizes. Ham too, actually. I think our model has just memorized things very very well. You can say I've learnt a lot from my failed Titanic experiments!! Let's see practically how this model does on a different dataset, and then we'll see how good it truly is!

*The new dataset to be used is one that somebody compiled and published on Kaggle after reading the same chapter that I'm reading right now, perfect!!!*

In [11]:
# ====================
# TRAIN CHAMPION MODEL
# ====================


print("🏆 TRAINING FINAL CHAMPION MODEL: RandomForest (Experiment 3)")
print("=" * 60)

# Recreate the winning configuration (Experiment 3)
print("\n📋 Configuration:")
print("   - Heavy preprocessing (lowercase, remove punctuation, URLs→NUMBER, numbers→NUMBER, stemming)")
print("   - Binary BoW (presence/absence)")
print("   - Bigrams (1-2 grams)")
print("   - 10,000 max features")

# Train the final model
print("\n🔥 Training RandomForest on full training set...")
champion_model = RandomForestClassifier(
    random_state=42, 
    n_estimators=100,
    n_jobs=-1
)

# Use Experiment 3 data (X_train_bow3, X_test_bow3)
print(f"Training data shape: {X_train_bow3.shape}")

# Convert sparse to dense for RandomForest
X_train_dense = X_train_bow3.toarray()
champion_model.fit(X_train_dense, y_train)

# Quick verification on our test set
X_test_dense = X_test_bow3.toarray()
test_predictions = champion_model.predict_proba(X_test_dense)[:, 1]
verification_auc = roc_auc_score(y_test, test_predictions)
print(f"\n📊 Verification AUC on original test set: {verification_auc:.4f}")

# 3. The preprocessing configuration
preprocessing_config = {
    'to_lowercase': True,
    'remove_punctuation': True,
    'replace_urls': True,
    'replace_numbers': True,
    'stemming': True  
}


print(f"\n🎯 Champion model ready for cross-dataset testing!")
print(f"   Training performance: 99.77% CV AUC")
print(f"   Test performance: {verification_auc:.4f} AUC")
print(f"\n🚀 Ready to test on new spam datasets!")

🏆 TRAINING FINAL CHAMPION MODEL: RandomForest (Experiment 3)

📋 Configuration:
   - Heavy preprocessing (lowercase, remove punctuation, URLs→NUMBER, numbers→NUMBER, stemming)
   - Binary BoW (presence/absence)
   - Bigrams (1-2 grams)
   - 10,000 max features

🔥 Training RandomForest on full training set...
Training data shape: (66758, 10000)

📊 Verification AUC on original test set: 0.9979

🎯 Champion model ready for cross-dataset testing!
   Training performance: 99.77% CV AUC
   Test performance: 0.9979 AUC

🚀 Ready to test on new spam datasets!


In [21]:
ROOT_DIR = Path("/kaggle/input/spam-or-not-spam-dataset")
DATA_DIR = ROOT_DIR / "spam_or_not_spam.csv"
new_df = pd.read_csv(DATA_DIR)
X, y_new = new_df['email'], new_df['label']
new_df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [22]:
# Assume you have:
# 1. new_dataset_df: The new pandas DataFrame to evaluate.
# 2. preprocess_clean_function: The same function used for original cleaning.
# 3. vectorizer3: The vectorizer FITTED on the original X_train_clean3.
# 4. champion_model: Your best trained model.
# 5. y_new: The true labels from your new dataset.

# Step 1: Apply the IDENTICAL preprocessing to the new text data.
# Do not split it. You are evaluating the entire dataset.
print("➡️ Preprocessing new data...")
X_new_clean = preprocess_emails(
    new_df['email'],
    to_lowercase=True,
    remove_punctuation=True,
    replace_urls=True,
    replace_numbers=True,
    stemming=True
)

# Step 2: Use the ORIGINAL vectorizer to transform the new data. DO NOT RE-FIT.
print("➡️ Transforming new data into the model's feature space...")
X_new_bow = vectorizer3.transform(X_new_clean)
X_new_dense = X_new_bow.toarray()

# Step 3: Make predictions with your champion model.
print("➡️ Making predictions...")
new_predictions = champion_model.predict_proba(X_new_dense)[:, 1]

# Step 4: Evaluate performance.
new_auc = roc_auc_score(y_new, new_predictions)

print(f"\n✅ Generalization AUC on new dataset: {new_auc:.4f}")

➡️ Preprocessing new data...
➡️ Transforming new data into the model's feature space...
➡️ Making predictions...

✅ Generalization AUC on new dataset: 0.9460


### A little more confidence finally


Not bad. A 0.4-0.5 drop in AUC score isn't bad at all considering we're still in the mid-90s of AUC scores on an entirely new dataset. To me, it seems impossible to be confident in any model I write unless it is tested on completely new datasets drawn from completely new sources, devoid of any connection. Only after doing so, I will be confident of any "high scores" I see...