In [1]:
#!/usr/bin/env python3
# üìä evaluation.ipynb ‚Äî visualize performance of hybrid fake news model

import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, roc_curve, auc,
    precision_recall_curve, classification_report,
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
)

# ===============================
# 1Ô∏è‚É£ Load model + data
# ===============================
MODEL_PATH = "data/model/hybrid_model.pkl"
TRAIN_PATH = "data/features/graph_features_train.csv"
TEST_PATH = "data/features/graph_features_test.csv"

# Ensure model exists
assert os.path.exists(MODEL_PATH), "Model file not found!"
rf, xgb, meta, scaler, best_thresh = joblib.load(MODEL_PATH)

print(f"‚úÖ Model loaded from {MODEL_PATH}")
print(f"Using threshold: {best_thresh:.3f}")

# Load merged data
text_emb = pd.read_csv("data/model/text_embeddings.csv")
graph_train = pd.read_csv(TRAIN_PATH)
graph_test = pd.read_csv(TEST_PATH)

print(f"Text embeddings: {text_emb.shape}")
print(f"Graph train: {graph_train.shape}, test: {graph_test.shape}")

# ===============================
# 2Ô∏è‚É£ Merge text + graph features
# ===============================
merged_test = pd.merge(
    text_emb, graph_test, left_on="id", right_on="node_id", how="inner"
)
merged_test = merged_test.dropna(subset=["label"]).reset_index(drop=True)

# Identify feature columns
non_feature_cols = ["id", "source", "node_id", "label"]
X_test = merged_test.drop(columns=[c for c in non_feature_cols if c in merged_test.columns])
y_test = merged_test["label"]

X_scaled = scaler.transform(X_test)

# ===============================
# 3Ô∏è‚É£ Make predictions
# ===============================
rf_prob = rf.predict_proba(X_scaled)[:, 1]
xgb_prob = xgb.predict_proba(X_scaled)[:, 1]
meta_input = np.vstack([rf_prob, xgb_prob]).T
meta_prob = meta.predict_proba(meta_input)[:, 1]
y_pred = (meta_prob > best_thresh).astype(int)

# ===============================
# 4Ô∏è‚É£ Compute Metrics
# ===============================
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, meta_prob)
precisions, recalls, _ = precision_recall_curve(y_test, meta_prob)
pr_auc = auc(recalls, precisions)

print("\n=== üìä Classification Report ===")
print(classification_report(y_test, y_pred))
print(f"‚úÖ Accuracy: {acc:.3f}")
print(f"‚úÖ F1-score: {f1:.3f}")
print(f"‚úÖ ROC-AUC: {roc_auc:.3f}")
print(f"‚úÖ PR-AUC: {pr_auc:.3f}")

# ===============================
# 5Ô∏è‚É£ Confusion Matrix
# ===============================
plt.figure(figsize=(5,4))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# ===============================
# 6Ô∏è‚É£ ROC Curve
# ===============================
fpr, tpr, _ = roc_curve(y_test, meta_prob)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

# ===============================
# 7Ô∏è‚É£ Precision-Recall Curve
# ===============================
plt.figure(figsize=(6,5))
plt.plot(recalls, precisions, label=f"PR-AUC = {pr_auc:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()

# ===============================
# 8Ô∏è‚É£ Feature Importance (RF)
# ===============================
importances = rf.feature_importances_
indices = np.argsort(importances)[-20:]
plt.figure(figsize=(8,5))
plt.barh(range(len(indices)), importances[indices], align="center")
plt.yticks(range(len(indices)), np.array(X_test.columns)[indices])
plt.title("Top 20 RandomForest Feature Importances")
plt.xlabel("Importance")
plt.show()

# ===============================
# 9Ô∏è‚É£ Summary
# ===============================
print("\n=== üß© Model Summary ===")
print(f"Samples tested: {len(y_test)}")
print(f"Fake label threshold: {best_thresh:.3f}")
print(f"Accuracy: {acc:.3f}")
print(f"F1 (Fake): {f1:.3f}")
print(f"ROC-AUC: {roc_auc:.3f}")
print(f"PR-AUC: {pr_auc:.3f}")


AssertionError: Model file not found!