In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Location of the phishing-email CSV, relative to this notebook.
dataset_path = "../dataset/phishing_email.csv"  # Adjust this line as needed
df = pd.read_csv(dataset_path)

In [3]:
# Hold out 20% of the rows as the final test set. Stratifying on the label
# keeps the class ratio the same in both partitions.
all_labels = df["label"]
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df["text_combined"],
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

# Carve the validation set out of the remaining 80%:
# 0.125 * 80% = 10% of the full dataset, leaving 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    train_val_texts,
    train_val_labels,
    test_size=0.125,
    random_state=42,
    stratify=train_val_labels,
)

# Report the size of each partition.
print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(test_texts)}")

Train size: 57739
Validation size: 8249
Test size: 16498


In [4]:
# Two-stage text classifier: TF-IDF features feeding a linear-kernel SVM.
# probability=True is needed so the fitted model exposes predict_proba.
text_vectorizer = TfidfVectorizer()
svm_classifier = SVC(kernel='linear', probability=True, random_state=42, verbose=True)
pipeline = Pipeline(steps=[
    ('tfidf', text_vectorizer),
    ('svm', svm_classifier),
])

In [8]:
# Hyperparameter search space; keys use the "<step name>__<parameter>" form
# so they are routed to the matching pipeline step.
# 3 vocabulary sizes x 2 n-gram ranges x 3 values of C = 18 candidates.
param_grid = dict(
    tfidf__max_features=[1000, 5000, 10000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    svm__C=[0.1, 1, 10],
)

# Exhaustive 3-fold cross-validated search on the training split only,
# ranked by F1 and run on all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=True,
)
grid.fit(X_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits


KeyboardInterrupt: 

In [None]:
# Refit the winning configuration on train + validation before final testing.
# clone() returns an unfitted copy with the same hyperparameters, so the refit
# does not touch grid.best_estimator_. That estimator stays fitted on X_train
# only, which keeps any later grid.predict(X_val) free of validation leakage.
final_model = clone(grid.best_estimator_)
final_model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

In [None]:
# Validation-set report. grid.predict delegates to grid.best_estimator_.
# NOTE(review): these scores are only an honest estimate while
# grid.best_estimator_ has not been fitted on X_val; verify that nothing
# upstream refits that same object on validation data.
val_preds = grid.predict(X_val)
print("\nValidation performance:")
validation_report = classification_report(y_val, val_preds)
print(validation_report)

In [None]:
# Predict on the held-out test set: hard labels plus the probability of the
# class in column 1 of predict_proba (used later for AUC-ROC).
test_preds = final_model.predict(test_texts)
test_proba_matrix = final_model.predict_proba(test_texts)
test_probs = test_proba_matrix[:, 1]

In [None]:
# Score the final model on the held-out test set; all metrics are percentages.
#
# The positive class is final_model.classes_[1]. predict_proba orders its
# columns like classes_, so this is the class whose probability is stored in
# test_probs, and the class roc_auc_score treats as positive for binary labels.
# Do not derive it from df['label'].unique(): that follows first-appearance
# order in the data, so it can pick either class and disagree with test_probs.
positive_label = final_model.classes_[1]

accuracy = accuracy_score(test_labels, test_preds) * 100
precision = precision_score(test_labels, test_preds, pos_label=positive_label) * 100
recall = recall_score(test_labels, test_preds, pos_label=positive_label) * 100
f1 = f1_score(test_labels, test_preds, pos_label=positive_label) * 100
auc = roc_auc_score(test_labels, test_probs) * 100

# Final results
print("\nFinal Test Results:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"AUC-ROC: {auc:.2f}")