# Spam Email Detection Project

This notebook implements a spam email detection system using various machine learning models. and selects the best model based on performance metrics.

Model used:

- Random Forest
- Gradient Boosting
- Naive Bayes

Evaluation Metrics:

- Accuracy
- Jaccard Score


## 1. Import Libraries


In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import warnings
import joblib
from joblib.parallel import Parallel, delayed
import time
from tqdm import tqdm
from sklearn.ensemble import HistGradientBoostingClassifier

warnings.filterwarnings("ignore")

n_jobs = max(1, int(os.cpu_count() * 0.9))

# sklearn libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    jaccard_score,
    accuracy_score,
    confusion_matrix,
    f1_score,
    classification_report,
)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB

# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# be sure to download the necessary NLTK resources
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find("tokenizers/stopwords")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find("corpora/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


[nltk_data] Downloading package stopwords to /home/aqr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/aqr/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 2. Load Dataset and explore


In [132]:
data = pd.read_csv("combined_data.csv")

print("Dataset information:")
print("-" * 50)
print(f"Shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print(f"Missing values: {data.isnull().sum().to_dict()}")

print("\n\nLabel distribution:")
print("-" * 50)
print(data["label"].value_counts())

print("\n\nSample data:")
print("-" * 50)
print(data.head())

Dataset information:
--------------------------------------------------
Shape: (83448, 2)
Columns: ['label', 'text']
Missing values: {'label': 0, 'text': 0}


Label distribution:
--------------------------------------------------
label
1    43910
0    39538
Name: count, dtype: int64


Sample data:
--------------------------------------------------
   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...


## 3. Data Preprocessing


In [133]:
# اضافه کردن progress bar و بهینه‌سازی
def preprocess_text_optimized(text):
    # کش کردن stopwords خارج از تابع
    text = text.lower()
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in STOP_WORDS and len(word) > 2]

    # استفاده از stemmer کش شده
    tokens = [STEMMER.stem(word) for word in tokens]
    return " ".join(tokens)


# کش کردن objects
STOP_WORDS = set(stopwords.words("english"))
STEMMER = PorterStemmer()

# Test text Preprocessing function
sample_text = "Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize now."
preprocessed_text = preprocess_text_optimized(sample_text)
print("Original Text:")
print(sample_text)
print("\nPreprocessed Text:")
print(preprocessed_text)

Original Text:
Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize now.

Preprocessed Text:
congratul youv walmart gift card click claim prize


In [134]:
print(f"Processing {len(data)} texts...")
start_time = time.time()

# اجرا با progress tracking
data["cleaned_text"] = Parallel(n_jobs=n_jobs)(
    delayed(preprocess_text_optimized)(text)
    for text in tqdm(data["text"], desc="Preprocessing")
)

end_time = time.time()
print(f"✅ Preprocessing completed in {end_time - start_time:.2f} seconds!")

print(
    f"average text length after preprocessing: {int(data['cleaned_text'].str.len().mean())}"
)

# Remove unnecessary columns
data = data[["cleaned_text", "label"]]
# Remove empty rows
data = data[data["cleaned_text"].str.strip() != ""]

print("\nFinal Dataset shape:")
print(data.shape)

Processing 83448 texts...


Preprocessing: 100%|██████████| 83448/83448 [00:25<00:00, 3257.49it/s]


✅ Preprocessing completed in 26.27 seconds!
average text length after preprocessing: 1112

Final Dataset shape:
(83400, 2)


## 4.TF-IDF Vectorization


In [135]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=500,
    max_df=0.9,
    sublinear_tf=True,
)

# fit and transform the cleaned emails
x = tfidf_vectorizer.fit_transform(data["cleaned_text"])
y = data["label"]

In [136]:
xarr = x.toarray()
threshold = 0.0625
xarr[np.abs(xarr) < threshold] = 0

print("TF-IDF Vectorization completed.")
print(f"Feature matrix shape: {x.shape}")
print(f"Number of features: {x.shape[1]}")
print(f"Sparsity is: {(1 - np.count_nonzero(xarr) / x.size) * 100:.2f}%")

TF-IDF Vectorization completed.
Feature matrix shape: (83400, 2954)
Number of features: 2954
Sparsity is: 28.67%


## 5. Train-Test Split


In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train shape: {x_train.shape}, Test shape: {x_test.shape}")
print(f"\n\nTrain label distribution:\n{y_train.value_counts(normalize=True)}")
print(f"\n\nTest label distribution:\n{y_test.value_counts(normalize=True)}")

Train shape: (66720, 2954), Test shape: (16680, 2954)


Train label distribution:
label
1    0.526049
0    0.473951
Name: proportion, dtype: float64


Test label distribution:
label
1    0.526079
0    0.473921
Name: proportion, dtype: float64


## 6. Model Training & Evaluation


In [148]:
# Initialize models

from sklearn.linear_model import LogisticRegression


models = {
    "NaiveBayes": MultinomialNB(alpha=0.1),
    "RandomForest": RandomForestClassifier(
        n_estimators=150, max_depth=20, random_state=42, n_jobs=n_jobs
    ),
    "HistGradientBoosting": HistGradientBoostingClassifier(
        max_iter=100,
        random_state=42,
        max_depth=20,
        learning_rate=0.1,
        tol=1e-7,
    ),
    "LogisticRegression": LogisticRegression(
        random_state=42,
        solver="liblinear",
        C=1.0,
        n_jobs=n_jobs
    )
}

results = {}

for modelname, model in models.items():
    print(f"\nTraining {modelname}...")
    start_time = time.time()
    if modelname == "HistGradientBoosting":
        model.fit(x_train.toarray(), y_train.values)
        y_pred = model.predict(x_test.toarray())
    else:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
    end_time = time.time()
    print(f"{modelname} trained in {end_time - start_time:.2f} seconds.")

    y_pred = model.predict(x_test.toarray())

    print(f"\n{modelname} Evaluation:")
    print("-" * 60)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    results[modelname] = f1_score(y_test, y_pred)



Training NaiveBayes...
NaiveBayes trained in 0.03 seconds.

NaiveBayes Evaluation:
------------------------------------------------------------

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      7905
           1       0.95      0.96      0.96      8775

    accuracy                           0.96     16680
   macro avg       0.96      0.96      0.96     16680
weighted avg       0.96      0.96      0.96     16680


Training RandomForest...
RandomForest trained in 3.60 seconds.

RandomForest Evaluation:
------------------------------------------------------------

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.94      7905
           1       0.91      1.00      0.95      8775

    accuracy                           0.94     16680
   macro avg       0.95      0.94      0.94     16680
weighted avg       0.95      0.94      0.94     16680


T

## 8. Selection of Best Model


In [150]:
best_model = models[
    pd.DataFrame({"model_name": results.keys(), "score": results.values()})
    .sort_values(by="score", ascending=False)
    .iloc[0]["model_name"]
]
best_model