In [4]:
# phishing_pipeline.py  (run in your environment)
import os
import re
import joblib
import pandas as pd
import numpy as np

# ML tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import resample

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below: the stop-word list and WordNet
# (the latter backs WordNetLemmatizer).
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# -------------------------
# Config / paths
# -------------------------
# Single source of truth for the input CSV. This cell used to assign DATA_PATH
# twice (a placeholder path, then the real file); only the second value was ever
# read, so that is the one kept here.
DATA_PATH = "phishing_email.csv"
MODEL_OUT = "./best_phishing_model.joblib"
VECT_OUT = "./tfidf_vectorizer.joblib"

# Column names tried, in priority order, when locating the label and text columns.
label_col_candidates = ['label', 'Label', 'Category', 'category', 'class', 'target']
text_col_candidates  = ['text', 'message', 'Message', 'email', 'body', 'content', 'text_combined']

# -------------------------
# 1) Load dataset
# -------------------------
# NOTE(review): the file is read as UTF-8; pass a different `encoding` if the
# CSV was exported with another codec.
df = pd.read_csv(DATA_PATH, encoding='utf-8', low_memory=False)

print("Columns:", df.columns.tolist())

# First candidate present in the frame wins; None means no candidate matched.
label_col = next((c for c in label_col_candidates if c in df.columns), None)
text_col  = next((c for c in text_col_candidates  if c in df.columns), None)

if label_col is None or text_col is None:
    raise RuntimeError(f"Could not find label/text column. Found: label={label_col}, text={text_col}")

# Keep only the two columns of interest under the canonical names used by the
# rest of the notebook ('text', 'label').
df = df[[text_col, label_col]].rename(columns={
    text_col: 'text',
    label_col: 'label'
}).copy()
print(' Test 1 Run OK')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Columns: ['text_combined', 'label']
 Test 1 Run OK


In [5]:
# -------------------------
# 2) Clean labels robustly
# -------------------------

# Every accepted spelling of a label, mapped to its canonical class name.
# Numeric labels are listed both as integers and as floats: pandas reads an
# integer column that contains missing values as float, so 0/1 then arrive here
# as the strings '0.0'/'1.0' and would otherwise be discarded as unknown.
label_map = {
    '0': 'legit',
    '0.0': 'legit',
    '1': 'phishing',
    '1.0': 'phishing',
    'phishing': 'phishing',
    'phish': 'phishing',
    'spam': 'phishing',
    'scam': 'phishing',
    'ham': 'legit',
    'legit': 'legit',
    'legitimate': 'legit',
    'benign': 'legit'
}

n_rows_before = len(df)

# Normalise to trimmed, lowercase strings, then translate. Anything that is not
# a key of label_map (including missing values, which stringify to 'nan')
# becomes 'unknown'.
df['label'] = (
    df['label'].astype(str).str.strip().str.lower()
    .map(label_map)
    .fillna('unknown')
)

# keep only valid rows
df = df[df['label'].isin(['phishing','legit'])].copy()

# Surface discarded rows instead of dropping them silently.
n_dropped = n_rows_before - len(df)
if n_dropped:
    print(f"Dropped {n_dropped} rows with unrecognised labels")

# numeric target: 0 = legit, 1 = phishing
df['y'] = df['label'].map({'legit': 0, 'phishing': 1})

print("Label counts:\n", df['y'].value_counts())
print(' Test 1 Run OK')


Label counts:
 y
1    42891
0    39595
Name: count, dtype: int64
 Test 1 Run OK


In [6]:
# -------------------------
# 3) Text preprocessing (keep phishing keywords)
# -------------------------
# Lemmatizer shared by every preprocess_text call.
wn = WordNetLemmatizer()

# Words that must never be filtered out as stop words because they carry
# phishing signal.
keep_tokens = {
    'account', 'password', 'verify', 'suspend', 'suspended', 'login',
    'bank', 'click', 'link', 'update', 'confirm', 'security',
}

# Effective stop list: NLTK's English stop words with the protected words removed.
builtin_stop = set(stopwords.words('english'))
stoplist = builtin_stop.difference(keep_tokens)

def preprocess_text(text):
    """Normalise one raw message into a space-separated string of lemmas.

    Steps, in order: lowercase; replace http/https URLs with a space; replace
    every character that is not a-z, 0-9 or whitespace with a space; split on
    whitespace; drop tokens found in ``stoplist``; lemmatize the remaining
    tokens with ``wn``.

    Returns an empty string for any non-``str`` input.
    """
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    # URLs are dropped entirely (no domain features are extracted here).
    without_urls = re.sub(r'https?://\S+', ' ', lowered)
    # Digits are kept along with letters; everything else becomes a separator.
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', without_urls)

    return ' '.join(
        wn.lemmatize(word)
        for word in alnum_only.split()
        if word not in stoplist
    )

# Run every raw message through preprocess_text and store the result alongside it.
df['clean_text'] = df['text'].map(preprocess_text)
print(' Test 1 Run OK')


 Test 1 Run OK


In [7]:
# -------------------------
# 4) Balance classes (upsample phishing if minority)
# -------------------------
print("Before balancing:", df['y'].value_counts())
# y is 0/1, so the sum is the number of phishing rows.
n_phish = df['y'].sum()
n_legit = len(df) - n_phish

# Fail early with a clear message when either class is missing; otherwise the
# failure only shows up later, during model fitting.
if n_phish == 0:
    raise RuntimeError("No phishing samples found — add phishing data!")
if n_legit == 0:
    raise RuntimeError("No legit samples found — add legit data!")

if n_phish < n_legit:
    # Sample phishing rows with replacement until both classes are equal in size,
    # then shuffle so the classes are interleaved.
    # NOTE(review): this runs before the train/test split, so when this branch is
    # taken, copies of the same phishing row can land in both train and test and
    # inflate the test metrics. Consider resampling the training split only.
    phishing = df[df['y']==1]
    legit = df[df['y']==0]
    phishing_upsampled = resample(phishing, replace=True, n_samples=len(legit), random_state=42)
    df_bal = pd.concat([legit, phishing_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)
else:
    # Phishing is not the minority class: the data is used as-is (legit is never upsampled).
    df_bal = df.copy()

print("After balancing:", df_bal['y'].value_counts())
print(' Test 1 Run OK')
# -------------------------


Before balancing: y
1    42891
0    39595
Name: count, dtype: int64
After balancing: y
1    42891
0    39595
Name: count, dtype: int64
 Test 1 Run OK


In [8]:
# 5) Train/test split and TF-IDF
# -------------------------
X, y = df_bal['clean_text'], df_bal['y']

# 75/25 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# TF-IDF over unigrams, bigrams and trigrams, capped at 50,000 features.
# The vocabulary is fitted on the training part only and reused for the test part.
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    lowercase=True,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print(' Test 1 Run OK')


 Test 1 Run OK


In [12]:
# -------------------------
# 6) Train models and evaluate
# -------------------------
# Candidate classifiers, keyed by display name.
models = {
    'LogisticRegression': LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear'),
    'NaiveBayes': MultinomialNB(),
    # Disabled candidates — uncomment to include them in the comparison:
    # 'SVC': SVC(probability=True, class_weight='balanced', kernel='linear'),
    # 'RandomForest': RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
}

results = {}  # name -> fitted estimator
scores = {}   # name -> test-set accuracy, reused for model selection below
for name, m in models.items():
    m.fit(X_train_tfidf, y_train)
    preds = m.predict(X_test_tfidf)
    scores[name] = accuracy_score(y_test, preds)
    print(f"\n=== {name} ===")
    print("Accuracy:", scores[name])
    print(classification_report(y_test, preds, digits=4))
    cm = confusion_matrix(y_test, preds)
    print("Confusion matrix:\n", cm)
    results[name] = m

# Pick the model with the highest accuracy, using the scores computed in the
# loop rather than predicting on the test set a second time per model.
# NOTE(review): selection uses the same test set the metrics are reported on, so
# the winner's reported accuracy is optimistic; a separate validation split or
# cross-validation would give an unbiased estimate.
best_name = max(scores, key=scores.get)
best_model = results[best_name]
print("Best:", best_name)

# Save vectorizer and model
joblib.dump(tfidf, VECT_OUT)
joblib.dump(best_model, MODEL_OUT)
print("Saved:", VECT_OUT, MODEL_OUT)
print(' Test 1- Run OK')



=== LogisticRegression ===
Accuracy: 0.986470759383183
              precision    recall  f1-score   support

           0     0.9870    0.9847    0.9859      9899
           1     0.9859    0.9881    0.9870     10723

    accuracy                         0.9865     20622
   macro avg     0.9865    0.9864    0.9864     20622
weighted avg     0.9865    0.9865    0.9865     20622

Confusion matrix:
 [[ 9748   151]
 [  128 10595]]

=== NaiveBayes ===
Accuracy: 0.9661526525070313
              precision    recall  f1-score   support

           0     0.9403    0.9925    0.9657      9899
           1     0.9927    0.9418    0.9666     10723

    accuracy                         0.9662     20622
   macro avg     0.9665    0.9672    0.9661     20622
weighted avg     0.9676    0.9662    0.9662     20622

Confusion matrix:
 [[ 9825    74]
 [  624 10099]]
Best: LogisticRegression
Saved: ./tfidf_vectorizer.joblib ./best_phishing_model.joblib
 Test 1- Run OK


In [16]:
# -------------------------
# 7) Prediction helper
# -------------------------
def predict_email(text, model=None, vectorizer=None):
    """Classify a single raw message.

    Parameters
    ----------
    text : str
        Raw message text; it is cleaned with ``preprocess_text`` before being
        vectorized.
    model : fitted classifier, optional
        Defaults to the notebook's current ``best_model``.
    vectorizer : fitted vectorizer, optional
        Defaults to the notebook's current ``tfidf``.

    Returns
    -------
    tuple
        ``(pred, prob)``: ``pred`` is the predicted label for the message and
        ``prob`` is the model's ``predict_proba`` row for it, or ``None`` when
        the model has no ``predict_proba`` method.

    Example
    -------
        msg = "Your account will be suspended, verify your password now."
        pred, prob = predict_email(msg)
        print("Prediction:", {0: "legit", 1: "phishing"}[pred])
        print("Probabilities:", prob)
    """
    # Resolve the defaults at call time. Binding best_model/tfidf in the
    # signature would freeze whatever objects existed when this cell ran, so
    # re-running the training cell would leave this helper on stale objects.
    if model is None:
        model = best_model
    if vectorizer is None:
        vectorizer = tfidf

    cleaned = preprocess_text(text)
    vector = vectorizer.transform([cleaned])
    pred = model.predict(vector)[0]
    prob = model.predict_proba(vector)[0] if hasattr(model, "predict_proba") else None
    return pred, prob
import tkinter as tk
from tkinter import messagebox

def run_gui():
    """Open a small Tk window for classifying pasted messages.

    The window holds a text box and a "Check" button. Pressing the button runs
    ``predict_email`` on the box contents and shows the verdict in a message
    box. The function returns when ``window.mainloop()`` returns.
    """
    def diagnose():
        """Button callback: classify the current text and show the result."""
        text = text_entry.get("1.0", tk.END).strip()
        if not text:
            # Nothing to classify; ask for input instead of scoring an empty string.
            messagebox.showwarning("Diagnosis", "Please enter a message first.")
            return

        pred, prob = predict_email(text)
        class_names = {0: "legit", 1: "phishing"}
        label = class_names[pred]

        if prob is None:
            # predict_email returns no probabilities for models without
            # predict_proba; show the label on its own rather than crashing.
            messagebox.showinfo("Diagnosis", label.upper())
            return

        # Uses the predicted label as the index into the probability row.
        # NOTE(review): this assumes the model's classes are ordered [0, 1] — confirm.
        probability = prob[pred]
        messagebox.showinfo("Diagnosis", f"{label.upper()} (prob: {probability:.4f})")

    window = tk.Tk()
    window.title("Phishing Message Detector")

    tk.Label(window, text="Enter message:").pack()

    text_entry = tk.Text(window, height=6, width=60)
    text_entry.pack()

    tk.Button(window, text="Check", command=diagnose).pack()

    window.mainloop()

# Launch the detector window. run_gui() ends in window.mainloop(), so this call
# does not return until the window is closed; the print below runs only after that.
run_gui()


print(' Test 7 Run OK')

 Test 7 Run OK
