# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# email_spam_classification.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# ----------------------------------------
# 1. Load Dataset
# ----------------------------------------
DATA_PATH = "emails.csv"  # path to your dataset
TEST_SIZE = 0.2           # fraction of rows held out for evaluation
RANDOM_STATE = 42         # fixed seed so the train/test split is reproducible

df = pd.read_csv(DATA_PATH)
print("Initial Shape:", df.shape)
print(df.head())

# ----------------------------------------
# 2. Basic Preprocessing
# ----------------------------------------
# Check column names (usually 'text' and 'spam' or 'label')
print(df.columns)

# Rename if necessary
if 'spam' not in df.columns and 'label' in df.columns:
    df.rename(columns={'label': 'spam'}, inplace=True)

# Remove missing values
df = df.dropna(subset=['text', 'spam'])

# Convert spam labels to binary (0 = Not Spam, 1 = Spam).
# 'ham'/'spam' strings are mapped; any other value is kept as it was.
labels = df['spam'].map({'ham': 0, 'spam': 1}).fillna(df['spam'])

# When the column already held numeric 0/1, the map() above yields NaN and the
# fillna() leaves float labels (0.0 / 1.0). Cast whole-number floats back to
# int so the classifiers and reports see clean integer classes.
if pd.api.types.is_float_dtype(labels) and (labels % 1 == 0).all():
    labels = labels.astype(int)
df['spam'] = labels

# ----------------------------------------
# 3. Train-Test Split and Feature Extraction (Text → Numeric)
# ----------------------------------------
y = df['spam']

# Split the raw text BEFORE vectorizing: fitting TF-IDF on the full corpus
# would let the vocabulary and IDF weights see the test emails (data leakage)
# and make the reported scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Learn vocabulary / IDF weights from the training emails only, then apply
# the fitted transformation to the held-out emails.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# ----------------------------------------
# 4. Model 1: K-Nearest Neighbors
# ----------------------------------------
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# ----------------------------------------
# 5. Model 2: Support Vector Machine
# ----------------------------------------
svm = SVC(kernel='linear', C=1.0)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# ----------------------------------------
# 6. Evaluation Function
# ----------------------------------------
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Args:
        y_true: Ground-truth labels, encoded as 0 (not spam) and 1 (spam).
        y_pred: Predicted labels, in the same order and encoding as ``y_true``.
        model_name: Display name used in the printed header and the plot title.

    Returns:
        None. Results are printed and the heatmap is shown with ``plt.show()``.
    """
    print(f"\n📊 {model_name} Performance:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded tick labels, even if one class is absent from this split.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Start a fresh figure so successive calls never draw on top of each other
    # on backends where plt.show() does not clear the current figure.
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Not Spam", "Spam"],
                yticklabels=["Not Spam", "Spam"])
    plt.title(f"{model_name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# ----------------------------------------
# 7. Evaluate Both Models
# ----------------------------------------
evaluate_model(y_test, y_pred_knn, "K-Nearest Neighbors")
evaluate_model(y_test, y_pred_svm, "Support Vector Machine")

# ----------------------------------------
# 8. Comparison Summary
# ----------------------------------------
acc_knn = accuracy_score(y_test, y_pred_knn)
acc_svm = accuracy_score(y_test, y_pred_svm)

print("\n✅ Model Comparison:")
print(f"KNN Accuracy: {acc_knn:.4f}")
print(f"SVM Accuracy: {acc_svm:.4f}")

# Report a tie explicitly instead of crediting KNN when the scores are equal.
if acc_svm > acc_knn:
    print("👉 SVM performs better for spam detection.")
elif acc_knn > acc_svm:
    print("👉 KNN performs better for spam detection.")
else:
    print("👉 KNN and SVM perform equally well for spam detection.")
