# Spam Email Detection Project
This notebook implements a spam email detection system using various machine learning models. and selects the best model based on performance metrics.

Model used:
- Random Forest
- Gradient Boosting
- Naive Bayes

Evaluation Metrics:
- Accuracy
- Jaccard Score

## 1. Import Libraries

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import warnings
import joblib
from joblib.parallel import Parallel, delayed

warnings.filterwarnings("ignore")

n_jobs = max(1, int(os.cpu_count() * 0.9))

# sklearn libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    jaccard_score,
    accuracy_score,
    confusion_matrix,
    f1_score,
    classification_report,
)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB

# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# be sure to download the necessary NLTK resources
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find("tokenizers/stopwords")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find("corpora/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


[nltk_data] Downloading package stopwords to /home/aqr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/aqr/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 2. Load Dataset and explore

In [10]:
data = pd.read_csv("spam_email_detection_dataset.csv")

print("Dataset information:")
print("-" * 50)
print(f"Shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print(f"Missing values: {data.isnull().sum().to_dict()}")

print("\n\nLabel distribution:")
print("-" * 50)
print(data["label"].value_counts())

print("\n\nSample data:")
print("-" * 50)
print(data.head())

Dataset information:
--------------------------------------------------
Shape: (83448, 2)
Columns: ['label', 'text']
Missing values: {'label': 0, 'text': 0}


Label distribution:
--------------------------------------------------
label
1    43910
0    39538
Name: count, dtype: int64


Sample data:
--------------------------------------------------
   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...


## 3. Data Preprocessing


In [11]:
def preprocess_text(text):
    # convert to lowercase
    text = text.lower()

    # remove email addresses, links, and special patterns
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    tokens = word_tokenize(text)

    # remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    return " ".join(tokens)


# Test text Preprocessing function
sample_text = "Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize now."
preprocessed_text = preprocess_text(sample_text)
print("Original Text:")
print(sample_text)
print("\nPreprocessed Text:")
print(preprocessed_text)

Original Text:
Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize now.

Preprocessed Text:
congratul youv walmart gift card click claim prize


In [15]:
# apply preprocessing to the entire dataset
# data["cleaned_text"] = data["text"].apply(preprocess_text)
data["cleaned_text"] = Parallel(n_jobs=n_jobs)(
    delayed(preprocess_text)(text) for text in data["text"]
)

print("preprocessing completed!")
print(
    f"average text length after preprocessing: {int(data['cleaned_text'].str.len().mean())}"
)

# Remove unnecessary columns
data = data[["cleaned_text", "label"]]
# Remove empty rows
data = data[data["cleaned_text"].str.strip() != ""]

print("\nFinal Dataset shape:")
print(data.shape)

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.