In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# === Paths ===
BASE = r"D:\Darryl\Coding\s_p\data"
summary_file = os.path.join(BASE, "processed", "cluster_summary_v2.csv")
report_dir = os.path.join(BASE, "reports")
os.makedirs(report_dir, exist_ok=True)

# === Load cluster summary ===
df = pd.read_csv(summary_file)
print(f"Loaded cluster summary: {len(df)} clusters")

# === Rank clusters ===
# By high-risk ratio
df_high_ratio = df.sort_values(["high_ratio", "n_messages"], ascending=[False, False])
# By high-risk count
df_high_count = df.sort_values("high_msgs", ascending=False)
# By average heuristic score
df_avg_score = df.sort_values("avg_score", ascending=False)

# === Save CSV reports ===
df_high_ratio.to_csv(os.path.join(report_dir, "clusters_ranked_by_high_ratio.csv"), index=False)
df_high_count.to_csv(os.path.join(report_dir, "clusters_ranked_by_high_count.csv"), index=False)
df_avg_score.to_csv(os.path.join(report_dir, "clusters_ranked_by_avg_score.csv"), index=False)
print("✅ Saved ranked reports (CSV) to:", report_dir)

# === Excel export with multiple sheets ===
excel_path = os.path.join(report_dir, "cluster_risk_reports.xlsx")
with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
    df_high_ratio.to_excel(writer, sheet_name="High_Ratio", index=False)
    df_high_count.to_excel(writer, sheet_name="High_Count", index=False)
    df_avg_score.to_excel(writer, sheet_name="Avg_Score", index=False)
print(f"✅ Excel report saved -> {excel_path}")

# === Visualization (Top 15 by high risk count) ===
top_n = 15
plt.figure(figsize=(12,6))
sns.barplot(
    data=df_high_count.head(top_n),
    x="high_msgs",
    y="canonical_template",
    palette="Reds_r"
)
plt.title(f"Top {top_n} High-Risk Clusters (by message count)")
plt.xlabel("High-Risk Messages")
plt.ylabel("Cluster Template")
plt.tight_layout()
plot_path = os.path.join(report_dir, "top_high_risk_clusters.png")
plt.savefig(plot_path, dpi=150)
plt.close()
print(f"📊 Saved plot -> {plot_path}")

# === Visualization (Top 15 by high risk ratio, only clusters with >5 messages) ===
plt.figure(figsize=(12,6))
sns.barplot(
    data=df_high_ratio[df_high_ratio["n_messages"] > 5].head(top_n),
    x="high_ratio",
    y="canonical_template",
    palette="Reds_r"
)
plt.title(f"Top {top_n} High-Risk Clusters (by ratio, min 5 msgs)")
plt.xlabel("High-Risk Ratio")
plt.ylabel("Cluster Template")
plt.tight_layout()
plot_path = os.path.join(report_dir, "top_high_risk_clusters_ratio.png")
plt.savefig(plot_path, dpi=150)
plt.close()
print(f"📊 Saved plot -> {plot_path}")


Loaded cluster summary: 1124 clusters
✅ Saved ranked reports (CSV) to: D:\Darryl\Coding\s_p\data\reports
✅ Excel report saved -> D:\Darryl\Coding\s_p\data\reports\cluster_risk_reports.xlsx



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(plot_path, dpi=150)
  plt.savefig(plot_path, dpi=150)
  plt.savefig(plot_path, dpi=150)
  plt.savefig(plot_path, dpi=150)
  plt.savefig(plot_path, dpi=150)
  plt.savefig(plot_path, dpi=150)
  plt

📊 Saved plot -> D:\Darryl\Coding\s_p\data\reports\top_high_risk_clusters.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()

📊 Saved plot -> D:\Darryl\Coding\s_p\data\reports\top_high_risk_clusters_ratio.png
