In [1]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# === Load Dataset ===
df = pd.read_csv("../data/email_dataset.csv")

# === Combine Subject and Body ===
# A missing subject or body is treated as empty text so the other half of the
# email is still kept; rows with a missing value in any remaining column are
# then dropped.
df["EmailText"] = df["Subject"].fillna('') + " " + df["Body"].fillna('')
df.drop(columns=["Subject", "Body"], inplace=True)
df.dropna(inplace=True)

# === Encode Labels ===
# Labels are normalised (whitespace stripped, lower-cased) before mapping so
# that variants such as "Phishing" or " legit" are encoded instead of silently
# turning into NaN. Anything still unmapped is reported immediately rather
# than being written to the combined CSV and failing later during training.
label_codes = {'legit': 0, 'phishing': 1}
normalized_labels = df['Label'].astype(str).str.strip().str.lower()
encoded_labels = normalized_labels.map(label_codes)
unknown_mask = encoded_labels.isna()
if unknown_mask.any():
    unknown_values = sorted(df.loc[unknown_mask, 'Label'].astype(str).unique())
    raise ValueError(
        f"Unrecognised Label value(s) {unknown_values}; "
        f"expected one of {sorted(label_codes)}"
    )
df['Label'] = encoded_labels.astype('int64')

# === Save Combined Dataset ===
df.to_csv("../data/combined_email_dataset.csv", index=False)

# === Clean Text Function ===
def clean_email(text):
    """Return a lower-cased, tag-free, whitespace-normalised copy of *text*.

    The input is converted with ``str()`` first, so non-string values are
    accepted. The substitutions are applied in order:

    1. ``<...>`` spans are removed. ``.`` does not match a newline here, so
       only spans that open and close on the same line are removed.
    2. Every non-word character is replaced by a space.
    3. Runs of whitespace are collapsed to a single space.

    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r'<.*?>', ''),
        (r'\W', ' '),
        (r'\s+', ' '),
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Store the normalised text next to the raw text; one clean_email call per row.
df['clean_text'] = df['EmailText'].map(clean_email)

# === Extract Binary Features ===
# Patterns are compiled once at import time instead of on every call.
# The keyword groups carry a leading \b so a keyword only counts when it
# starts a word: "winner" and "prizes" still match, but "following", "twin",
# "insecure" or "unimportant" no longer raise a flag by accident.
_LINK_PATTERN = re.compile(r'http[s]?://|www\.')
_MONEY_PATTERN = re.compile(r'\b(?:win|prize|gift|money|cash|reward)')
_URGENT_PATTERN = re.compile(r'\b(?:urgent|immediate|alert|important|verify)')
_BANK_PATTERN = re.compile(r'\b(?:bank|account|secure|login|password)')

# Order defines the layout of the returned feature vector.
_FEATURE_PATTERNS = (_LINK_PATTERN, _MONEY_PATTERN, _URGENT_PATTERN, _BANK_PATTERN)


def extract_features(text):
    """Return four 0/1 keyword flags for an email as a NumPy array.

    The flags, in order, are:

    * ``has_link``: the text contains ``http://``, ``https://`` or ``www.``
    * ``has_money_words``: a word starts with win/prize/gift/money/cash/reward
    * ``has_urgent_words``: a word starts with
      urgent/immediate/alert/important/verify
    * ``has_bank_words``: a word starts with bank/account/secure/login/password

    Matching is case-insensitive because the text is lower-cased first. The
    input is converted with ``str()``, as ``clean_email`` does, so a
    non-string value does not raise.

    Args:
        text: Raw email text (any value accepted by ``str()``).

    Returns:
        A 1-D integer array of length 4 containing only 0s and 1s.
    """
    lowered = str(text).lower()
    return np.array(
        [int(pattern.search(lowered) is not None) for pattern in _FEATURE_PATTERNS]
    )

# The keyword flags are computed per email with no fitted state, so building
# them for the whole dataset before splitting cannot leak test information.
binary_feats = np.vstack(df['EmailText'].apply(extract_features).values)
y = df['Label']

# === Train-Test Split ===
# Split row positions first so the vectorizer below can be fitted on training
# rows only. With the same labels, test_size and random_state this selects the
# same rows as splitting the feature matrix directly.
row_positions = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, random_state=42, stratify=y
)

# === TF-IDF ===
# Fit vocabulary and IDF weights on the training emails only; fitting on the
# full dataset would let test emails influence the features they are later
# scored with and inflate the reported metrics.
tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(df['clean_text'].iloc[train_idx])
X_tfidf = tfidf.transform(df['clean_text'])

# === Combine Features ===
# CSR format is requested explicitly because the rows are sliced just below.
X = hstack([X_tfidf, csr_matrix(binary_feats)], format='csr')
X_train, X_test = X[train_idx], X[test_idx]

# === Train Model ===
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# === Evaluate ===
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# === Save Model and Vectorizer ===
# Only the vectorizer and the classifier are persisted; the four keyword
# flags have to be recomputed with extract_features at prediction time.
os.makedirs("../model", exist_ok=True)
joblib.dump(tfidf, "../model/tfidf_vectorizer.pkl")
joblib.dump(model, "../model/phishing_model.pkl")


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        58
           1       1.00      1.00      1.00        50

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108

Accuracy: 1.0


['../model/phishing_model.pkl']