In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Reportlab for PDF
from reportlab.lib.pagesizes import A4, landscape
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image

# -----------------------
# Paths (edit if necessary)
# -----------------------
# Root data folder; every path below is derived from it.
BASE = r"D:\Darryl\Coding\s_p\data"
PROCESSED, REPORT_DIR = (os.path.join(BASE, sub) for sub in ("processed", "reports"))

# Input CSVs read by this cell (all live under PROCESSED).
CLUSTER_SUMMARY, CANDIDATE_SUMMARY, MESSAGES_CLUSTERED = (
    os.path.join(PROCESSED, fname)
    for fname in (
        "cluster_summary_v2.csv",
        "candidates_risk_summary.csv",  # produced earlier
        "messages_with_clusters_v2.csv",
    )
)

# Output files written by this cell; their folder is created up front.
os.makedirs(REPORT_DIR, exist_ok=True)
OUT_CLUSTER_CSV, OUT_CANDIDATE_CSV, OUT_ALERTS_CSV, OUT_PDF = (
    os.path.join(REPORT_DIR, fname)
    for fname in (
        "cluster_report.csv",
        "candidate_report.csv",
        "alerts_feed.csv",
        "risk_summary.pdf",
    )
)

# -----------------------
# Load inputs
# -----------------------
# The cluster summary is mandatory: stop immediately if it is absent.
if not os.path.exists(CLUSTER_SUMMARY):
    raise FileNotFoundError(f"{CLUSTER_SUMMARY} not found")
df_clusters = pd.read_csv(CLUSTER_SUMMARY)

# The two remaining inputs are optional; a missing file leaves its frame as
# None so the sections that depend on it can be skipped.
df_candidates = None
if os.path.exists(CANDIDATE_SUMMARY):
    df_candidates = pd.read_csv(CANDIDATE_SUMMARY)

df_msgs = None
if os.path.exists(MESSAGES_CLUSTERED):
    df_msgs = pd.read_csv(MESSAGES_CLUSTERED)

# -----------------------
# CSV Exports (cleaned)
# -----------------------
# Re-export the cluster summary (and the candidate summary, when it was
# loaded) into the reports folder.
df_clusters.to_csv(OUT_CLUSTER_CSV, index=False)
print("Saved cluster CSV:", OUT_CLUSTER_CSV)

if df_candidates is not None:
    df_candidates.to_csv(OUT_CANDIDATE_CSV, index=False)
    print("Saved candidate CSV:", OUT_CANDIDATE_CSV)

# Create alerts feed: the 200 most recent high-risk messages. Ordering uses
# the parsed date when at least one date parses, otherwise message_id.
if df_msgs is not None:
    high_alerts = df_msgs[df_msgs["risk_label"] == "high"].copy()

    sorted_by_date = False
    if "date" in high_alerts.columns:
        try:
            parsed_dates = pd.to_datetime(high_alerts["date"], errors="coerce")
            # errors="coerce" turns unparseable values into NaT instead of
            # raising, so "nothing parsed" has to be detected explicitly.
            if parsed_dates.notna().any():
                high_alerts = high_alerts.assign(date_parsed=parsed_dates).sort_values(
                    "date_parsed", ascending=False
                )
                sorted_by_date = True
        except (ValueError, TypeError, OverflowError) as exc:
            # Best effort: an unusable date column must not stop the export.
            print("Could not sort alerts by date; falling back to message_id:", exc)

    if not sorted_by_date and "message_id" in high_alerts.columns:
        high_alerts = high_alerts.sort_values("message_id", ascending=False)

    # Export only the feed columns that are actually present, so a missing
    # optional column (e.g. "date") does not raise KeyError here.
    feed_columns = ["message_id", "date", "candidate_name_norm_simple", "text", "cluster_id", "heuristic_score"]
    absent = [col for col in feed_columns if col not in high_alerts.columns]
    if absent:
        print("Alerts feed: columns not found in input, skipped:", ", ".join(absent))
    high_alerts = high_alerts[[col for col in feed_columns if col not in absent]].head(200)

    high_alerts.to_csv(OUT_ALERTS_CSV, index=False)
    print("Saved alerts CSV:", OUT_ALERTS_CSV)
else:
    print("messages_with_clusters_v2.csv missing; alerts feed not created.")

# -----------------------
# Make plots (save to images)
# -----------------------
plot1, plot2 = (
    os.path.join(REPORT_DIR, fname)
    for fname in ("top_clusters_by_count.png", "top_clusters_by_ratio.png")
)


def _save_cluster_barh(values, labels, xlabel, title, out_path):
    """Draw one horizontal bar chart and write it to *out_path* as a PNG.

    *values* and *labels* are drawn bottom-to-top in the order given, one bar
    per entry. The figure is closed after saving.
    """
    positions = range(len(values))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(positions, values)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)


# Top clusters by high count (arrays are reversed so the largest bar is on top)
df_top_by_count = df_clusters.sort_values("high_msgs", ascending=False).head(10)
_save_cluster_barh(
    df_top_by_count["high_msgs"].values[::-1],
    df_top_by_count["canonical_template"].values[::-1],
    "High-risk messages",
    "Top 10 clusters by high-risk message count",
    plot1,
)

# Top clusters by high ratio (min size filter); ratio is plotted as a percentage
_big_enough = df_clusters[df_clusters["n_messages"] >= 5]
df_top_by_ratio = _big_enough.sort_values("high_ratio", ascending=False).head(10)
_save_cluster_barh(
    (df_top_by_ratio["high_ratio"] * 100).values[::-1],
    df_top_by_ratio["canonical_template"].values[::-1],
    "High-risk ratio (%)",
    "Top 10 clusters by high-risk ratio (min 5 msgs)",
    plot2,
)

print("Saved plots:", plot1, plot2)

# -----------------------
# Build PDF
# -----------------------
from datetime import timezone  # needed for a timezone-aware "now" below

# Paragraph styles reused by the rest of the report.
styles = getSampleStyleSheet()
styleH = styles["Heading1"]
styleN = styles["BodyText"]
style_small = ParagraphStyle("small", parent=styles["Normal"], fontSize=9, leading=11)

# Landscape A4 document; `elements` collects the flowables in page order.
doc = SimpleDocTemplate(
    OUT_PDF,
    pagesize=landscape(A4),
    rightMargin=30,
    leftMargin=30,
    topMargin=30,
    bottomMargin=18,
)
elements = []

# Title block
title = "ScamShield SEBI — Risk Summary"
elements.append(Paragraph(title, styleH))
elements.append(Spacer(1, 6))
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
# formats to exactly the same string.
meta = f"Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}"
elements.append(Paragraph(meta, styleN))
elements.append(Spacer(1, 12))

from xml.sax.saxutils import escape  # example text is rendered through Paragraph markup

# Style for the example column: a Paragraph wraps inside its column, whereas
# a plain string cell is drawn on a single line.
_cluster_example_style = ParagraphStyle("cluster_example", parent=styles["Normal"], fontSize=8, leading=10)

# The chart PNGs are saved from 10x6 inch figures; keep that 5:3 ratio so the
# images are not stretched when placed on the page.
_CHART_WIDTH, _CHART_HEIGHT = 500, 300


def _cluster_table(frame):
    """Return the styled six-column cluster Table for the rows of *frame*.

    Reads cluster_id, n_messages, high_msgs, high_ratio, avg_score and
    (optionally) example_message from each row. The example is flattened to
    one line, truncated to 120 characters and XML-escaped before being
    wrapped in a Paragraph.
    """
    rows = [["cluster_id","n_messages","high_msgs","high_ratio","avg_score","example (truncated)"]]
    for _, r in frame.iterrows():
        raw_example = r.get("example_message")
        # A missing example comes back as NaN (truthy) or None; test it
        # explicitly so the cell is blank instead of the text "nan".
        example = "" if pd.isna(raw_example) else str(raw_example).replace("\n", " ")[:120]
        rows.append([
            int(r["cluster_id"]),
            int(r["n_messages"]),
            int(r["high_msgs"]),
            f"{r['high_ratio']:.2f}",
            f"{r['avg_score']:.1f}",
            Paragraph(escape(example), _cluster_example_style),
        ])
    table = Table(rows, colWidths=[50,60,60,60,60,400])
    table.setStyle(TableStyle([
        ('BACKGROUND',(0,0),(5,0),colors.HexColor("#d3d3d3")),
        ('VALIGN',(0,0),(-1,-1),'TOP'),
        ('GRID',(0,0),(-1,-1),0.5,colors.grey),
        ('FONTSIZE',(0,0),(-1,-1),8)
    ]))
    return table


# Section: Top clusters table
elements.append(Paragraph("Top 10 Clusters by High-Risk Message Count", styles["Heading2"]))
t1 = _cluster_table(df_top_by_count)
elements.append(t1)
elements.append(Spacer(1,12))

# Insert plot image 1
elements.append(Paragraph("Chart: Top clusters by high-risk count", styles["Heading3"]))
elements.append(Image(plot1, width=_CHART_WIDTH, height=_CHART_HEIGHT))
elements.append(Spacer(1,12))

# Section: Top clusters by ratio
elements.append(Paragraph("Top 10 Clusters by High-Risk Ratio (min 5 messages)", styles["Heading2"]))
t2 = _cluster_table(df_top_by_ratio)
elements.append(t2)
elements.append(Spacer(1,12))
elements.append(Image(plot2, width=_CHART_WIDTH, height=_CHART_HEIGHT))
elements.append(Spacer(1,12))

# Section: Top candidates (if available)
if df_candidates is not None:
    elements.append(Paragraph("Top 10 Candidates by High-Risk Mentions", styles["Heading2"]))

    # Rank by high-risk count when that column exists; otherwise keep file order.
    if "high_msgs" in df_candidates.columns:
        df_top_cand = df_candidates.sort_values("high_msgs", ascending=False).head(10)
    else:
        df_top_cand = df_candidates.head(10)

    tbl3 = [["candidate","total_msgs","high_msgs","avg_score","status"]]
    for _, cand in df_top_cand.iterrows():
        # Prefer the normalised name, then the raw name, then blank.
        cand_name = cand.get("candidate_name_norm_simple", cand.get("candidate_name", ""))
        total_msgs = int(cand.get("total_msgs",0))
        high_msgs = int(cand.get("high_msgs",0)) if "high_msgs" in cand else 0
        avg_score = f"{cand.get('avg_score',0):.1f}" if "avg_score" in cand else ""
        status = cand.get("status","")
        tbl3.append([cand_name, total_msgs, high_msgs, avg_score, status])

    candidate_style = TableStyle([
        ('BACKGROUND',(0,0),(4,0),colors.HexColor("#d3d3d3")),
        ('GRID',(0,0),(-1,-1),0.5,colors.grey),
        ('FONTSIZE',(0,0),(-1,-1),9)
    ])
    t3 = Table(tbl3, colWidths=[200,60,60,60,100])
    t3.setStyle(candidate_style)
    elements.extend([t3, Spacer(1,12)])

# Section: Top alerts snapshot
if df_msgs is not None and not high_alerts.empty:
    from xml.sax.saxutils import escape  # message text is rendered through Paragraph markup

    elements.append(Paragraph("Top Alerts (recent high-risk messages) — snapshot", styles["Heading2"]))

    # A Paragraph wraps inside the 400pt text column; a plain string cell is
    # drawn on one line and can run past the cell border.
    alert_text_style = ParagraphStyle("alert_text", parent=styles["Normal"], fontSize=8, leading=10)

    # take top 10 for PDF
    top_alerts = high_alerts.head(10)
    tbl_alerts = [["message_id","date","candidate","cluster_id","score","text (truncated)"]]
    for _, r in top_alerts.iterrows():
        # Missing values arrive as NaN, which is truthy: check with pd.isna so
        # blanks stay blank instead of printing "nan".
        raw_text = r.get("text")
        txt = "" if pd.isna(raw_text) else str(raw_text).replace("\n", " ")[:140]
        raw_cluster = r.get("cluster_id", -1)
        # int(NaN) raises; reuse the existing "no cluster" default of -1.
        cluster = -1 if pd.isna(raw_cluster) else int(raw_cluster)
        when = r.get("date", "")
        who = r.get("candidate_name_norm_simple", "")
        tbl_alerts.append([
            int(r["message_id"]),
            "" if pd.isna(when) else when,
            "" if pd.isna(who) else who,
            cluster,
            f"{r.get('heuristic_score',0):.1f}",
            Paragraph(escape(txt), alert_text_style),
        ])
    ta = Table(tbl_alerts, colWidths=[50,80,120,60,50,400])
    ta.setStyle(TableStyle([
        ('BACKGROUND',(0,0),(5,0),colors.HexColor("#d3d3d3")),
        ('VALIGN',(0,0),(-1,-1),'TOP'),
        ('GRID',(0,0),(-1,-1),0.5,colors.grey),
        ('FONTSIZE',(0,0),(-1,-1),8)
    ]))
    elements.append(ta)
    elements.append(Spacer(1,12))

# Footer note: closing disclaimer, set in the small style.
footer_note = (
    "Notes: This report is auto-generated. "
    "'High' messages are flagged by heuristic + verification rules. "
    "Review templates and clusters manually before enforcement actions."
)
elements.extend([Paragraph(footer_note, style_small), Spacer(1,12)])

# Build PDF: lay out every collected flowable and write the file.
doc.build(elements)
print("PDF report saved to:", OUT_PDF)


Saved cluster CSV: D:\Darryl\Coding\s_p\data\reports\cluster_report.csv
Saved candidate CSV: D:\Darryl\Coding\s_p\data\reports\candidate_report.csv
Saved alerts CSV: D:\Darryl\Coding\s_p\data\reports\alerts_feed.csv


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
  plt.savefig(plot1, dpi=150)
 

Saved plots: D:\Darryl\Coding\s_p\data\reports\top_clusters_by_count.png D:\Darryl\Coding\s_p\data\reports\top_clusters_by_ratio.png
PDF report saved to: D:\Darryl\Coding\s_p\data\reports\risk_summary.pdf


In [None]:
import os
import pandas as pd
from datetime import datetime
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

# === Load Data ===
# Both CSVs are required by this cell; read_csv raises if either is absent.
clusters_csv = r"D:\Darryl\Coding\s_p\data\processed\cluster_summary_v2.csv"
messages_csv = r"D:\Darryl\Coding\s_p\data\processed\messages_with_clusters_v2.csv"
clusters = pd.read_csv(clusters_csv)
messages = pd.read_csv(messages_csv)

# === Ensure reports folder exists ===
report_dir = r"D:\Darryl\Coding\s_p\reports"
os.makedirs(report_dir, exist_ok=True)
report_path = os.path.join(report_dir, "final_report.pdf")

# === PDF Setup ===
# `elements` collects the flowables in the order they will appear.
elements = []
styles = getSampleStyleSheet()
doc = SimpleDocTemplate(report_path, pagesize=(800, 1000))

# === Title ===
# Paragraph text is parsed as markup in which a newline is ordinary
# whitespace; an explicit <br/> is needed to put the timestamp on its own line.
generated_at = datetime.now().strftime('%Y-%m-%d %H:%M')
title = f"SEBI Project Risk Analysis Report<br/>Generated on {generated_at}"
elements.append(Paragraph(title, styles['Title']))
elements.append(Spacer(1, 20))

# === Risk Distribution ===
# One "label: count" line per distinct risk_label value, most frequent first
# (value_counts orders by count).
risk_counts = messages["risk_label"].value_counts().to_dict()
risk_lines = [f"{label}: {count}" for label, count in risk_counts.items()]
risk_text = "Risk Distribution:<br/>" + "<br/>".join(risk_lines)
elements.append(Paragraph(risk_text, styles['Normal']))
elements.append(Spacer(1, 20))

# === Top Clusters by High-Risk Ratio ===
from xml.sax.saxutils import escape  # text cells are rendered through Paragraph markup
from reportlab.lib.styles import ParagraphStyle

top_clusters = clusters.sort_values("high_ratio", ascending=False).head(10)
elements.append(Paragraph("Top Clusters by High-Risk Ratio", styles['Heading2']))
elements.append(Spacer(1, 10))

# Small wrapping style for text cells; plain string cells are drawn on one
# line and would push the table past the page edge.
_cluster_text_style = ParagraphStyle("cluster_text", parent=styles['Normal'], fontSize=8, leading=10)


def _cluster_cell(value):
    """Convert one DataFrame value into compact Table cell content.

    Missing values become an empty string, whole-number floats are shown
    without decimals, other floats with two decimals, and strings are
    flattened to one line, truncated to 120 characters, escaped and wrapped
    in a Paragraph so they wrap inside the column. Anything else is shown via
    str().
    """
    if pd.isna(value):
        return ""
    if isinstance(value, float):
        return str(int(value)) if value.is_integer() else f"{value:.2f}"
    if isinstance(value, str):
        return Paragraph(escape(value.replace("\n", " ")[:120]), _cluster_text_style)
    return str(value)


cluster_table_data = [list(top_clusters.columns)] + [
    [_cluster_cell(value) for value in row] for row in top_clusters.values.tolist()
]

# Fix the column widths so the table always fits the frame: non-numeric
# (text) columns get three shares of the usable width, numeric columns one.
_shares = [1 if pd.api.types.is_numeric_dtype(dtype) else 3 for dtype in top_clusters.dtypes]
_col_widths = [doc.width * share / sum(_shares) for share in _shares]

cluster_table = Table(cluster_table_data, colWidths=_col_widths)
cluster_table.setStyle(TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('VALIGN', (0, 0), (-1, -1), 'TOP'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('FONTSIZE', (0, 0), (-1, -1), 8),
    ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
]))
elements.append(cluster_table)
elements.append(Spacer(1, 20))

# === Sample High-Risk Messages ===
from xml.sax.saxutils import escape  # data values are interpolated into Paragraph markup

# First five rows labelled "high", in file order.
sample_msgs = messages[messages["risk_label"] == "high"].head(5)
elements.append(Paragraph("Sample High-Risk Messages", styles['Heading2']))
elements.append(Spacer(1, 10))

for _, row in sample_msgs.iterrows():
    fields = [
        ("Candidate", row['candidate_name_norm_simple']),
        ("Date", row['date']),
        ("Risk Label", row['risk_label']),
        ("Score", row['heuristic_score']),
        ("Message", row['text']),
    ]
    # Escape every value: a raw "&" or "<" in scraped message text would
    # otherwise be read as markup by Paragraph and break the build.
    msg = "<br/>".join(f"<b>{label}:</b> {escape(str(value))}" for label, value in fields)
    elements.append(Paragraph(msg, styles['Normal']))
    elements.append(Spacer(1, 15))

# === Build PDF ===
doc.build(elements)
print(f"✅ PDF Report saved to: {report_path}")


✅ PDF Report saved to: D:\Darryl\Coding\SEBI_Project\reports\final_report.pdf
