# Get the dataset

Download the phishing email dataset from Kaggle using `kagglehub`.

In [12]:
import kagglehub

# Kaggle handle (owner/dataset-name) of the phishing email dataset.
DATASET_HANDLE = "naserabdullahalam/phishing-email-dataset"

# Download the latest version; `path` is what the load cell further down
# uses to locate the CSV file.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /home/jetson/.cache/kagglehub/datasets/naserabdullahalam/phishing-email-dataset/versions/1


## Imports

Import pandas, scikit-learn, and the supporting libraries (`re`, `joblib`).

In [13]:
import os
import re

import joblib # Used for saving the model and vectorizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

# --- 1. Load Dataset ---

In [14]:
# Read the CEAS_08 CSV from the directory returned by the dataset download.
csv_path = os.path.join(path, "CEAS_08.csv")
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as err:
    # Re-raise with context instead of calling exit(): the failure then shows
    # up as a normal traceback in this cell and names the path that was tried.
    raise FileNotFoundError(
        f"Dataset file not found at {csv_path!r}. Please update the file path."
    ) from err

In [15]:
# We'll use subject, body, and label
needed = ["subject", "body", "label"]
missing = [c for c in needed if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}. Found: {df.columns.tolist()}")

# Fill any missing subject or body text with an empty string
df["subject"] = df["subject"].fillna("")
df["body"] = df["body"].fillna("")

# Combine subject and body into a single feature for the model
df["full_text"] = df["subject"] + " " + df["body"]

# Make sure label is 0/1 ints (1=spam, 0=not spam).
# Fail early with a clear message rather than letting a missing or unexpected
# label surface later as a cryptic cast error or a silently mislabeled class.
if df["label"].isna().any():
    raise ValueError(
        f"Column 'label' has {int(df['label'].isna().sum())} missing value(s)."
    )
df["label"] = df["label"].astype(int)
unexpected_labels = sorted(int(v) for v in set(df["label"].unique()) - {0, 1})
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values: {unexpected_labels}. Expected only 0 and 1."
    )

print("Class counts:\n", df["label"].value_counts())
print(df[["body","label"]].head(3))

Class counts:
 label
1    21842
0    17312
Name: count, dtype: int64
                                                body  label
0  Buck up, your troubles caused by small dimensi...      1
1  \nUpgrade your sex and pleasures with these te...      1
2  >+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...      1


# --- 2. Preprocess Text ---

In [16]:
def preprocess_text(text):
    """Normalize raw email text for vectorization.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed (punctuation, digits, symbols), and runs of
    whitespace are collapsed into a single space.

    Args:
        text: The raw text. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str`` first.

    Returns:
        The cleaned string, with no leading or trailing whitespace. May be
        empty.
    """
    # Missing values would otherwise fail on `.lower()`; treat them as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Convert to lowercase
    text = str(text).lower()
    # Remove punctuation, numbers, and special characters.
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every combined subject+body string and store it in a new column.
print("\nPreprocessing text data...")
df['cleaned_text'] = df['full_text'].map(preprocess_text)
print("Preprocessing complete.")


Preprocessing text data...
Preprocessing complete.


# --- 3. Train/Test Split ---

In [17]:
# Hold out 20% of the emails for testing. `stratify` keeps the spam/not-spam
# ratio the same in both parts, and the fixed `random_state` makes the split
# (and therefore every metric computed below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_text"], df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)
print(f"\nData split into {len(X_train)} training and {len(X_test)} testing samples.")


Data split into 31323 training and 7831 testing samples.


# --- 4. Vectorize Text using TF-IDF ---

In [18]:
# TF-IDF weighs each word by how important it is to a document relative to
# the whole corpus, which makes it a good fit for text classification.
# The vocabulary is capped at the top 5000 most frequent words.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training text only; the test text is transformed with that same
# fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("\nText vectorized successfully.")


Text vectorized successfully.


# --- 5. Train the MLP Classifier ---

In [None]:
# Train a multi-layer perceptron (two hidden layers of 100 and 50 units) on
# the TF-IDF features. LogisticRegression() and MultinomialNB() are imported
# above and can be substituted here as simpler baselines.
model = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=200,  # Increase this if the model doesn't converge
    activation='relu',
    solver='adam',
    # Fixed seed so weight initialization and the early-stopping validation
    # split are the same on every run.
    random_state=42,
    # Stop once the validation score stops improving instead of always
    # running for max_iter iterations.
    early_stopping=True,
    verbose=True # Shows training progress
)

print("Training the model...")
model.fit(X_train_tfidf, y_train)
print("Model training complete.")

Training the model...
Iteration 1, loss = 0.16512683
Validation score: 0.993936
Iteration 2, loss = 0.01190227
Validation score: 0.996170
Iteration 3, loss = 0.00539006
Validation score: 0.996489
Iteration 4, loss = 0.00307873
Validation score: 0.995851
Iteration 5, loss = 0.00188391
Validation score: 0.996170
Iteration 6, loss = 0.00141129
Validation score: 0.996170
Iteration 7, loss = 0.00114987
Validation score: 0.996170
Iteration 8, loss = 0.00095550
Validation score: 0.994893
Iteration 9, loss = 0.00091284
Validation score: 0.995531
Iteration 10, loss = 0.00107685
Validation score: 0.996489
Iteration 11, loss = 0.00115242
Validation score: 0.996489
Iteration 12, loss = 0.00094435
Validation score: 0.994893
Iteration 13, loss = 0.00093437
Validation score: 0.994893
Iteration 14, loss = 0.00103633
Validation score: 0.993297
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Model training complete.


# --- 6. Evaluate the Model ---

In [28]:
print("\n--- Model Evaluation ---")

# Predict on the held-out test features.
y_pred = model.predict(X_test_tfidf)

# Overall fraction of correctly classified emails.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

# Confusion matrix of true labels (y_test) against predictions (y_pred).
test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", test_confusion)

# Per-class precision/recall/F1. For spam, high precision is crucial to avoid
# misclassifying important emails (false positives).
test_report = classification_report(
    y_test,
    y_pred,
    target_names=['not spam', 'spam'],
)
print("\nClassification Report:\n", test_report)


--- Model Evaluation ---
Accuracy: 0.9955

Confusion Matrix:
 [[3447   15]
 [  20 4349]]

Classification Report:
               precision    recall  f1-score   support

    not spam       0.99      1.00      0.99      3462
        spam       1.00      1.00      1.00      4369

    accuracy                           1.00      7831
   macro avg       1.00      1.00      1.00      7831
weighted avg       1.00      1.00      1.00      7831



# --- 7. Save the Model and Vectorizer ---

In [29]:
# We save both the model and the vectorizer so we can use them later
# to make predictions on new, unseen emails.
# joblib.dump does not create parent directories, so make sure 'model/'
# exists first; otherwise a fresh checkout fails here after training.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/spam_classifier_model.pkl')
joblib.dump(vectorizer, 'model/tfidf_vectorizer.pkl')
print("\nModel and vectorizer saved to disk as 'spam_classifier_model.pkl' and 'tfidf_vectorizer.pkl'")


Model and vectorizer saved to disk as 'spam_classifier_model.pkl' and 'tfidf_vectorizer.pkl'
