In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    roc_curve,
    auc,
    classification_report
)

# Load evaluation results from the ThreatKit offline eval.
# The path is relative, so it is resolved against the kernel's working directory.
results_path = "threatkit/emailcheck/eval/eval_results.csv"
df = pd.read_csv(results_path)

# Fail fast when the export lacks a column that the later cells index by name
# ("label", "score_0_5", "pred_phish"); otherwise the problem only shows up as
# a bare KeyError several cells further down.
required_columns = {"label", "score_0_5", "pred_phish"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{results_path} is missing required column(s): {sorted(missing_columns)}"
    )

# Basic sanity check: preview the first rows (kept as the cell's last
# expression so the notebook renders it).
df.head()

In [None]:
# Split the safe scores by ground-truth class. A single .loc lookup replaces
# the chained df[mask][col] indexing.
phish_scores = df.loc[df["label"] == 1, "score_0_5"]
safe_scores = df.loc[df["label"] == 0, "score_0_5"]

# Use ONE set of bin edges for both classes. With bins=20 passed to each hist
# call separately, every class derives its own range, so the two overlaid
# histograms end up with different bin widths and cannot be compared bar by bar.
# NaNs are dropped only for the edge computation (the range must be finite).
shared_bins = np.histogram_bin_edges(
    pd.concat([safe_scores, phish_scores]).dropna(), bins=20
)

hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
hist_ax.hist(safe_scores, bins=shared_bins, alpha=0.5, label="Safe (label=0)")
hist_ax.hist(phish_scores, bins=shared_bins, alpha=0.5, label="Phishing (label=1)")

hist_ax.set_xlabel("Safe score (0 = risky, 5 = safe)")
hist_ax.set_ylabel("Count")
hist_ax.set_title("Distribution of ThreatKit safe scores by class")
hist_ax.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ground truth and the detector's hard decision, as plain arrays.
y_true = df["label"].values
y_pred = df["pred_phish"].values

# zero_division=0 matches what evaluate_thresholds() in this notebook passes:
# when a denominator is empty (e.g. nothing was predicted as phishing) the
# metric is reported as 0 without emitting an UndefinedMetricWarning.
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1-score :", f1)
print()
print("Classification report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# Pin the label order to [0, 1]. Without `labels`, confusion_matrix only
# includes classes that actually occur in y_true/y_pred, so a one-class result
# set would yield a matrix smaller than 2x2 that no longer matches the two
# display labels below.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Safe (0)", "Phish (1)"])
disp.plot()
plt.title("Confusion Matrix – ThreatKit Phishing Detector")
plt.show()

In [None]:
# Turn the safe score into a rough phishing score: the safer an e-mail is
# rated, the lower its phishing score (score / 5 is flipped around 1.0).
safe = df["score_0_5"].values
p_phish = 1.0 - (safe / 5.0)

# ROC points for the flipped score, plus the area under that curve.
fpr, tpr, thresholds = roc_curve(y_true, p_phish)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6, 6))
roc_ax.plot(fpr, tpr, lw=2, label=f"ROC curve (AUC = {roc_auc:.3f})")
# Diagonal reference line for a random classifier.
roc_ax.plot([0, 1], [0, 1], linestyle="--", lw=1, label="Random")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate (Recall)")
roc_ax.set_title("ROC Curve – ThreatKit Phishing Detector")
roc_ax.legend()
roc_ax.axis("square")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_thresholds(scores, labels, thresholds):
    """Evaluate the rule "phishing if safe_score < t" for each threshold t.

    Parameters
    ----------
    scores : array-like
        Safe score per sample. Lists, tuples, NumPy arrays and pandas Series
        are all accepted (the input is coerced with ``np.asarray``).
    labels : array-like
        Ground-truth labels aligned with ``scores`` (1 = phishing, 0 = safe).
    thresholds : iterable of float
        Candidate cut-offs; a sample is predicted as phishing when its score
        is strictly below the cut-off.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``threshold``, ``accuracy``,
        ``precision``, ``recall`` and ``f1`` (in that order). The columns are
        present even when ``thresholds`` is empty.
    """
    columns = ["threshold", "accuracy", "precision", "recall", "f1"]

    # Coerce once up front: a plain list does not support `scores < t`.
    scores = np.asarray(scores)
    labels = np.asarray(labels)

    rows = []
    for t in thresholds:
        preds = (scores < t).astype(int)  # phishing if safe_score < t
        rows.append({
            "threshold": t,
            "accuracy": accuracy_score(labels, preds),
            # zero_division=0: report 0 instead of warning when a metric's
            # denominator is empty at this threshold.
            "precision": precision_score(labels, preds, zero_division=0),
            "recall": recall_score(labels, preds, zero_division=0),
            "f1": f1_score(labels, preds, zero_division=0),
        })

    # Explicit column list keeps the schema stable for an empty sweep.
    return pd.DataFrame(rows, columns=columns)

# Candidate cut-offs 2.00, 2.25, ..., 4.50 (11 evenly spaced values).
thresholds = np.linspace(2.0, 4.5, num=11)
sweep_df = evaluate_thresholds(
    scores=df["score_0_5"].values,
    labels=y_true,
    thresholds=thresholds,
)
sweep_df

In [None]:
sweep_fig, sweep_ax = plt.subplots(figsize=(8, 5))

# One curve per metric column of sweep_df, all drawn against the threshold.
for metric_column, legend_label in (
    ("accuracy", "Accuracy"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("f1", "F1-score"),
):
    sweep_ax.plot(
        sweep_df["threshold"],
        sweep_df[metric_column],
        marker="o",
        label=legend_label,
    )

# Vertical marker at the threshold labelled as the chosen one.
sweep_ax.axvline(4.0, linestyle="--", label="Chosen threshold (4.0)")

sweep_ax.set_xlabel("Safe score threshold t (score < t ⇒ phishing)")
sweep_ax.set_ylabel("Metric value")
sweep_ax.set_title("Metric trade-offs vs threshold")
sweep_ax.legend()
sweep_ax.grid(True)
plt.show()