In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\asmaj\Downloads\email_dataset (1).csv")
print(df.shape)
df.head()

(537, 3)


Unnamed: 0,Subject,Body,Label
0,HR Policy Updates,Please find attached the project updates and t...,legit
1,HR Policy Updates,Let me know if 3 PM works for the meeting.,legit
2,HR Policy Updates,Your quarterly review is scheduled for next week.,legit
3,Your account has been suspended,Click the link below to verify your account im...,phishing
4,Claim your reward today!,Reset your credentials to avoid security issues.,phishing


In [2]:
# Combine Subject and Body into a new column
df["EmailText"] = df["Subject"] + " " + df["Body"]

# Drop the original columns
df = df.drop(columns=["Subject", "Body"])
df = df.dropna()

df['Label'] = df['Label'].map({'legit': 0, 'phishing': 1}) 
print(df.head())

df.to_csv("combined_email_dataset.cvs", index=False)

   Label                                          EmailText
0      0  HR Policy Updates Please find attached the pro...
1      0  HR Policy Updates Let me know if 3 PM works fo...
2      0  HR Policy Updates Your quarterly review is sch...
3      1  Your account has been suspended Click the link...
4      1  Claim your reward today! Reset your credential...


In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# === Load preprocessed dataset ===
df = pd.read_csv(r"combined_email_dataset.cvs")

# === Clean the email text ===
def clean_email(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['EmailText'].apply(clean_email)

# === Extract binary features ===
def extract_features(text):
    text = text.lower()
    has_link = int(bool(re.search(r'http[s]?://|www\.', text)))
    has_money_words = int(bool(re.search(r'win|prize|gift|money|cash|reward', text)))
    has_urgent_words = int(bool(re.search(r'urgent|immediate|alert|important|verify', text)))
    has_bank_words = int(bool(re.search(r'bank|account|secure|login|password', text)))
    return np.array([has_link, has_money_words, has_urgent_words, has_bank_words])

binary_feats = np.vstack(df['EmailText'].apply(extract_features).values)

# === TF-IDF vectorization ===
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['clean_text'])

# === Combine TF-IDF and binary features ===
X = hstack([X_tfidf, csr_matrix(binary_feats)])
y = df['Label']

# === Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === Train Random Forest model ===
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# === Evaluate the model ===
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# === Save model and vectorizer ===
joblib.dump(tfidf, r'C:\Users\asmaj\Downloads\email-phishing-simulator\data\notebooks\app\model\tfidf_vectorizer.pkl')
joblib.dump(model, r'C:\Users\asmaj\Downloads\email-phishing-simulator\data\notebooks\app\model\phishing_model.pkl')


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        58
           1       1.00      1.00      1.00        50

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108

Accuracy: 1.0


['C:\\Users\\asmaj\\Downloads\\email-phishing-simulator\\data\\notebooks\\app\\model\\phishing_model.pkl']