In [6]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator

import joblib

from lime.lime_text import LimeTextExplainer
import numpy as np

import re
import nltk
import tldextract
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\djjor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
# # install tldextract if not already installed
# try:
#     import tldextract
# except ImportError:
#     import pip
#     pip.main(['install', 'tldextract'])
#     import tldextract

In [2]:
class PhishingEmailDetector:
    def __init__(self, classifier: BaseEstimator = None):
        self.text_col = 'combined_text'
        self.meta_cols = ['same_domain', 'num_links', 'num_suspicious_words']
        self.vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
        self.scaler = StandardScaler()
        # self.classifier = classifier or LogisticRegression(solver='liblinear', class_weight='balanced')
        self.classifier = classifier
        self.pipeline = None

        if self.classifier is None:
            self.classifier = LogisticRegression(solver='liblinear', class_weight='balanced')

    # ---- Text Cleaning ----
    def _clean_text(self, text):
        if pd.isnull(text):
            return ""
        text = text.lower()
        text = re.sub(r"<.*?>", " ", text)
        text = re.sub(r"http\S+", "LINKURL", text)
        # text = re.sub(r"[^a-z\s]", " ", text)
        text = re.sub(r"[^a-zA-Z0-9\s]", " ", text) # convert to lowercase
        text = re.sub(r"\d+", "NUM", text) # remove numbers
        text = re.sub(r"\s+", " ", text).strip() # remove extra spaces
        return " ".join([w for w in text.split() if w not in stop_words])

    # ---- Metadata ----
    def _extract_domain(self, email):
        if pd.isnull(email) or "@" not in email:
            return "unknown"
        return email.split('@')[-1].lower()

    def _is_same_domain(self, sender, recipient):
        return int(self._extract_domain(sender) == self._extract_domain(recipient))

    def _count_links(self, text):
        if pd.isnull(text):
            return 0
        return len(re.findall(r"http[s]?://", text))

    def _count_suspicious_words(self, text, keywords=None):
        if pd.isnull(text):
            return 0
        if keywords is None:
            keywords = ["verify", "login", "click", "update", "urgent", "password"]
        return sum(word in text.lower() for word in keywords)

    # ---- Preprocessing ----
    def preprocess_dataframe(self, dfx, debug=False):
        df = dfx.copy()
        df['clean_subject'] = df['subject'].apply(self._clean_text)
        df['clean_body'] = df['body'].apply(self._clean_text)
        df[self.text_col] = df['clean_subject'] + " " + df['clean_body']
        df['same_domain'] = df.apply(lambda x: self._is_same_domain(x['sender'], x['recipient']), axis=1)
        df['num_links'] = df['body'].apply(self._count_links)
        df['num_suspicious_words'] = df['body'].apply(self._count_suspicious_words)

        if debug:
            print("Cleaned Text:")
            print(df[self.text_col].head())
            print("Metadata:")
            print(df[self.meta_cols].head())

        return df[[self.text_col] + self.meta_cols]

    # ---- Build Pipeline ----
    def _build_pipeline(self):
        transformer = ColumnTransformer([
            ('tfidf', self.vectorizer, self.text_col),
            ('meta', self.scaler, self.meta_cols)
        ])
        return Pipeline([
            ('features', transformer),
            ('clf', self.classifier)
        ])

    # ---- Train Model ----
    def fit(self, df, labels, debug=False):
        processed = self.preprocess_dataframe(df, debug=debug)

        self.pipeline = self._build_pipeline()
        self.pipeline.fit(processed, labels)

    # ---- Predict Labels ----
    def predict(self, df, debug=False):
        processed = self.preprocess_dataframe(df, debug=debug)
        return self.pipeline.predict(processed)

    def predict_proba(self, df):
        processed = self.preprocess_dataframe(df)
        return self.pipeline.predict_proba(processed)

    # ---- Evaluate ----
    def evaluate(self, df, labels):
        preds = self.predict(df)
        print(classification_report(labels, preds))

    # ---- Get pipeline for advanced use ----
    def get_pipeline(self):
        return self.pipeline

    def explain_instance(self, df_row, num_features=10):
        """
        Explains a single email instance (df_row must be a one-row DataFrame).
        Returns explanation object with weights.
        """
        if self.pipeline is None:
            raise ValueError("You must fit the model before calling explain_instance.")

        # Prepare LimeTextExplainer
        class_names = ['Legitimate', 'Phishing']
        explainer = LimeTextExplainer(class_names=class_names)

        # Extract text from row
        email_text = self._clean_text(df_row['subject'].values[0]) + " " + self._clean_text(df_row['body'].values[0])

        # Build a prediction function that LIME can use
        def predict_proba(texts):
            # texts: list of raw texts from LIME
            temp_df = pd.DataFrame({
                'subject': [''] * len(texts),
                'body': texts,
                'sender': [df_row['sender'].values[0]] * len(texts),
                'recipient': [df_row['recipient'].values[0]] * len(texts)
            })
            return self.predict_proba(temp_df)

        # Explain
        exp = explainer.explain_instance(email_text, predict_proba, num_features=num_features)
        return exp



In [17]:
# importing the datasets

dir_path1 = "archive/CEAS_08.csv" # sender,receiver,date,subject,body,label,urls
dir_path2 = "archive/Enron.csv" # subject,body,label
dir_path3 = "archive/Ling.csv" # subject,body,label
dir_path4 = "archive/Nazario.csv" # sender,receiver,date,subject,body,urls,label
dir_path5 = "archive/Nigerian_Fraud.csv" # sender,receiver,date,subject,body,urls,label
dir_path6 = "archive/SpamAssasin.csv" # sender,receiver,date,subject,body,label,urls

df1 = pd.read_csv(dir_path1, encoding='latin-1')
df2 = pd.read_csv(dir_path2, encoding='latin-1')
df3 = pd.read_csv(dir_path3, encoding='latin-1')
df4 = pd.read_csv(dir_path4, encoding='latin-1')
df5 = pd.read_csv(dir_path5, encoding='latin-1')
df6 = pd.read_csv(dir_path6, encoding='latin-1')

# fill missing values for df2, df3 (sender = sender@sender.s, receiver = receiver@receiver.r)
# add sender and receiver columns to df2, df3
df2['sender'] = df2['subject'].apply(lambda x: 'unknown@sender.s' if pd.isnull(x) else 'unknown@sender.s')
df2['receiver'] = df2['subject'].apply(lambda x: 'unknown@receiver.r' if pd.isnull(x) else 'unknown@receiver.r')

# df2['sender'] = df2['sender'].fillna('unknown@sender.s')
# df2['receiver'] = df2['receiver'].fillna('unknown@receiver.r')

df3['sender'] = df3['subject'].apply(lambda x: 'unknown@sender.s' if pd.isnull(x) else 'unknown@sender.s')
df3['receiver'] = df3['subject'].apply(lambda x: 'unknown@receiver.r' if pd.isnull(x) else 'unknown@receiver.r')

# df3['sender'] = df3['sender'].fillna('unknown@sender.s')
# df3['receiver'] = df3['receiver'].fillna('unknown@receiver.r')

# merge the datasets
df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
df = df.dropna(subset=['body', 'subject', 'sender', 'receiver', 'label'])
# rename columns for consistency
df = df.rename(columns={'sender': 'sender', 'receiver': 'recipient', 'subject': 'subject', 'body': 'body', 'label': 'label'})

In [18]:
df.head(), df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79789 entries, 0 to 82485
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sender     79789 non-null  object 
 1   recipient  79789 non-null  object 
 2   date       47290 non-null  object 
 3   subject    79789 non-null  object 
 4   body       79789 non-null  object 
 5   label      79789 non-null  int64  
 6   urls       47423 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 4.9+ MB


(                                              sender  \
 0                   Young Esposito <Young@iworld.de>   
 1                       Mok <ipline's1983@icable.ph>   
 2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
 3                 Michael Parker <ivqrnai@pobox.com>   
 4  Gretchen Suggs <externalsep1@loanofficertool.com>   
 
                                         recipient  \
 0                     user4@gvc.ceas-challenge.cc   
 1                   user2.2@gvc.ceas-challenge.cc   
 2                   user2.9@gvc.ceas-challenge.cc   
 3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
 4                   user2.2@gvc.ceas-challenge.cc   
 
                               date  \
 0  Tue, 05 Aug 2008 16:31:02 -0700   
 1  Tue, 05 Aug 2008 18:31:03 -0500   
 2  Tue, 05 Aug 2008 20:28:00 -1200   
 3  Tue, 05 Aug 2008 17:31:20 -0600   
 4  Tue, 05 Aug 2008 19:31:21 -0400   
 
                                              subject  \
 0                          Never agr

In [19]:
# save the dataset to a csv file
df.to_csv('phishing_dataset.csv', index=False)

In [13]:
# Train/test split
X = df.drop(columns='label')
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [16]:
# Sample data
# df = pd.DataFrame({
#     'subject': ["URGENT: Reset your password", "Meeting tomorrow"],
#     'body': ["Click here to update your credentials http://fake.site", "Let's talk at 10am."],
#     'sender': ["support@fakebank.com", "teammate@realco.com"],
#     'recipient': ["user@realco.com", "user@realco.com"],
#     'label': [1, 0]
# })

# Train detector
from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier

# detector = PhishingEmailDetector(classifier=SVC(kernel='linear', class_weight='balanced', probability=True))
detector = PhishingEmailDetector(classifier=SVC(kernel='rbf', class_weight='balanced', probability=True, C=10))

# detector = PhishingEmailDetector()
detector.fit(X_train, y_train)

# Evaluate
detector.evaluate(X_test, y_test)

# Predict on new sample
pred = detector.predict(X_test)
print("Predictions:", pred)

# calculate precision, recall, f1-score, accuracy of test data
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}, Accuracy: {accuracy:.2f}")

# Save the model using joblib
joblib.dump(detector.get_pipeline(), 'phishing_email_detector_v2.pkl')



              precision    recall  f1-score   support

           0       0.99      0.98      0.98      9728
           1       0.98      0.99      0.99     10220

    accuracy                           0.99     19948
   macro avg       0.99      0.99      0.99     19948
weighted avg       0.99      0.99      0.99     19948

Predictions: [1 0 0 ... 1 0 0]
Precision: 0.98, Recall: 0.99, F1-Score: 0.99, Accuracy: 0.99


['phishing_email_detector_v2.pkl']

In [7]:
# Load the model
loaded_model = joblib.load('phishing_email_detector.pkl')
detector = PhishingEmailDetector()
detector.pipeline = loaded_model

In [8]:
# test classifier on example data
# Sample data
df_sample_data = pd.DataFrame({
    'subject': ["URGENT: Reset your password", "Meeting tomorrow"],
    'body': ["Click here to update your credentials http://fake.site", "Let's talk at 10am."],
    'sender': ["support@fakebank.com", "teammate@realco.com"],
    'recipient': ["user@realco.com", "user@realco.com"],
    'label': [1, 0]
})

X_test_sample = df_sample_data.drop(columns='label')
y_test_sample = df_sample_data['label']

pred_sample = detector.predict(X_test_sample, debug=True)
print("Sample Predictions:", pred_sample)

single_email = X_test_sample.iloc[[0]]
print("Single Email for Explanation:")
print(single_email)

explination = detector.explain_instance(single_email, num_features=10)
print("LIME Explanation:")
print(explination.as_list())

Cleaned Text:
0    urgent reset password click update credentials...
1                      meeting tomorrow let talk NUMam
Name: combined_text, dtype: object
Metadata:
   same_domain  num_links  num_suspicious_words
0            0          1                     2
1            1          0                     0
Sample Predictions: [1 0]
Single Email for Explanation:
                       subject  \
0  URGENT: Reset your password   

                                                body                sender  \
0  Click here to update your credentials http://f...  support@fakebank.com   

         recipient  
0  user@realco.com  
LIME Explanation:
[('click', 0.14174449848925708), ('urgent', 0.08058383371880797), ('password', -0.06707510812382113), ('LINKURL', 0.019657643624652745), ('update', -0.015312960764313533), ('credentials', 0.006474735282390534), ('reset', -0.0006840374240482625)]


In [None]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# Define the classifiers and their hyperparameters
classifiers = {
    'SVC': SVC(class_weight='balanced'),
    'RandomForest': RandomForestClassifier(class_weight='balanced'),
    'LogisticRegression': LogisticRegression(class_weight='balanced', solver='liblinear'),
    'MultinomialNB': MultinomialNB(),
    'GradientBoosting': GradientBoostingClassifier(),
    'HistGradientBoosting': HistGradientBoostingClassifier()
}

# Define the hyperparameters for each classifier

param_grid = {
    'SVC': {
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf']
    },
    'RandomForest': {
        'clf__n_estimators': [50, 100],
        'clf__max_depth': [None, 10, 20]
    },
    'LogisticRegression': {
        'clf__C': [0.1, 1, 10],
        'clf__penalty': ['l2']
    },
    'MultinomialNB': {
        # No hyperparameters to tune for MultinomialNB
    },
    'GradientBoosting': {
        'clf__n_estimators': [50, 100],
        'clf__learning_rate': [0.01, 0.1]
    },
    'HistGradientBoosting': {
        'clf__max_iter': [50, 100],
        'clf__learning_rate': [0.01, 0.1]
    }
}

best_overall_model = None
best_overall_score = 0
best_overall_name = None

def fitness_function(accuracy, precision, recall, f1):
    return (accuracy + precision + recall + f1) / 4

# Create a pipeline for each classifier using the PhishingEmailDetector class
for name, clf in classifiers.items():
    classifier = clf.__class__(class_weight='balanced', param_grid=param_grid[name])
    detector = PhishingEmailDetector(classifier=classifier)
    detector.fit(X_train, y_train)
    detector.evaluate(X_test, y_test)

    accuracy = accuracy_score(y_test, detector.predict(X_test))
    precision = precision_score(y_test, detector.predict(X_test))
    recall = recall_score(y_test, detector.predict(X_test))
    f1 = f1_score(y_test, detector.predict(X_test))

    fitness = fitness_function(accuracy, precision, recall, f1)
    print(f"Classifier: {name}, Fitness: {fitness:.2f}")
    if fitness > best_overall_score:
        best_overall_score = fitness
        best_overall_model = detector.get_pipeline()
        best_overall_name = name

print(f"Best overall model: {best_overall_name}, Score: {best_overall_score:.2f}")



In [14]:
from itertools import product
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# # progress bar
# from tqdm import tqdm_notebook

# Define classifiers
classifiers = {
    'SVC': SVC(class_weight='balanced', probability=True),
    'RandomForest': RandomForestClassifier(class_weight='balanced'),
    'LogisticRegression': LogisticRegression(class_weight='balanced', solver='liblinear', ),
    # 'MultinomialNB': MultinomialNB(),
    'GradientBoosting': GradientBoostingClassifier(),
    # 'HistGradientBoosting': HistGradientBoostingClassifier()
}

# Define hyperparameter grids
param_grid = {
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'RandomForest': {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]},
    'LogisticRegression': {'C': [0.1, 1, 10], 'penalty': ['l2']},
    # 'MultinomialNB': {},
    'GradientBoosting': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
    # 'HistGradientBoosting': {'max_iter': [50, 100], 'learning_rate': [0.01, 0.1]}
}

# calc combinations of hyperparameters
param_combinations = list(product(*param_grid.values()))

# Initialize the progress bar
# from tqdm import tqdm
# tqdm.pandas(desc="Processing", unit="combination", total=len(param_combinations))

best_overall_model = None
best_overall_score = 0
best_overall_name = None

def fitness_function(accuracy, precision, recall, f1):
    """
    Custom fitness function prioritizing recall and precision.
    - Recall is crucial for phishing detection (avoiding false negatives).
    - Precision ensures fewer false positives (better user experience).
    - F1-score balances them.
    - Accuracy contributes but has the lowest weight.
    """
    return (0.2 * accuracy) + (0.3 * precision) + (0.3 * recall) + (0.2 * f1)

# Manual grid search
for name, clf in classifiers.items():
    # Generate hyperparameter combinations
    param_combinations = list(product(*param_grid.get(name, {}).values()))
    param_keys = list(param_grid.get(name, {}).keys())

    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))

        # Create classifier with specific hyperparameters
        tuned_classifier = clf.__class__(**param_dict)

        # print(hasattr(tuned_classifier, "estimators_"))

        # Initialize phishing email detector
        detector = PhishingEmailDetector(classifier=tuned_classifier)
        detector.fit(X_train, y_train)
        # detector.evaluate(X_test, y_test)

        y_pred = detector.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        fitness = fitness_function(accuracy, precision, recall, f1)
        # fitness = 0.8  # Placeholder for actual fitness calculation
        print(f"Classifier: {name}, Params: {param_dict}, Fitness: {fitness:.2f}")

        filename = f"{name}_{'_'.join([f'{k}_{v}' for k, v in param_dict.items()])}.pkl"
        filename2 = f"{name}_{'_'.join([f'{k}_{v}' for k, v in param_dict.items()])}_detector.pkl"
        # Save the model using joblib
        joblib.dump(detector.get_pipeline(), filename)
        joblib.dump(detector, filename2)

        if fitness > best_overall_score:
            best_overall_score = fitness
            best_overall_model = detector
            best_overall_name = name

        # increment progress bar
        # tqdm.write(f"Classifier: {name}, Params: {param_dict}, Fitness: {fitness:.2f}")

        # update progress bar
        # tqdm.update(1)

# finalize progress bar
# tqdm.close()
# Print the best overall model and its score

print(f"Best overall model: {best_overall_name}, Score: {best_overall_score:.2f}")

Classifier: SVC, Params: {'C': 0.1, 'kernel': 'linear'}, Fitness: 0.97
Classifier: SVC, Params: {'C': 0.1, 'kernel': 'rbf'}, Fitness: 0.97
Classifier: SVC, Params: {'C': 1, 'kernel': 'linear'}, Fitness: 0.98
Classifier: SVC, Params: {'C': 1, 'kernel': 'rbf'}, Fitness: 0.98
Classifier: SVC, Params: {'C': 10, 'kernel': 'linear'}, Fitness: 0.98
Classifier: SVC, Params: {'C': 10, 'kernel': 'rbf'}, Fitness: 0.99
Classifier: RandomForest, Params: {'n_estimators': 50, 'max_depth': None}, Fitness: 0.98
Classifier: RandomForest, Params: {'n_estimators': 50, 'max_depth': 10}, Fitness: 0.92
Classifier: RandomForest, Params: {'n_estimators': 50, 'max_depth': 20}, Fitness: 0.95
Classifier: RandomForest, Params: {'n_estimators': 100, 'max_depth': None}, Fitness: 0.98
Classifier: RandomForest, Params: {'n_estimators': 100, 'max_depth': 10}, Fitness: 0.92
Classifier: RandomForest, Params: {'n_estimators': 100, 'max_depth': 20}, Fitness: 0.95
Classifier: LogisticRegression, Params: {'C': 0.1, 'penalty'