In [3]:
import pandas as pd

# -----------------------
# File paths
# -----------------------
# Raw strings (r"...") keep the Windows backslashes from being read as escapes.
# Input: the CSV loaded into `df` by pd.read_csv.
messages_path = r"D:\Darryl\Coding\s_p\data\processed\messages_with_risk_v4.csv"
# Output: destination for the per-candidate table (`cand_summary`).
output_candidates_path = r"D:\Darryl\Coding\s_p\data\processed\candidates_risk_summary.csv"
# Output: destination for the per-chat_id table (`group_summary`).
output_groups_path = r"D:\Darryl\Coding\s_p\data\processed\groups_risk_summary.csv"

# -----------------------
# Load data
# -----------------------
# low_memory=False makes pandas parse the file without its internal chunking,
# which avoids mixed-type inference within a column.
df = pd.read_csv(filepath_or_buffer=messages_path, low_memory=False)
print(f"Loaded rows: {len(df)}")

# -----------------------
# Candidate-level risk summary
# -----------------------
def _most_frequent(values, fallback):
    """Return the first mode of *values*, or *fallback* if there is no mode.

    ``Series.mode`` ignores missing values by default, so a group whose
    values are all missing produces an empty result and gets *fallback*.
    """
    modes = values.mode()
    if modes.empty:
        return fallback
    return modes.iat[0]


# Output column -> (source column, aggregation). Dict order sets the column
# order of the resulting table.
candidate_aggregations = {
    # "count" tallies non-null message_id values per candidate.
    "total_msgs": pd.NamedAgg(column="message_id", aggfunc="count"),
    # Number of messages carrying each risk label.
    "high_msgs": pd.NamedAgg(
        column="risk_label", aggfunc=lambda labels: (labels == "high").sum()
    ),
    "med_msgs": pd.NamedAgg(
        column="risk_label", aggfunc=lambda labels: (labels == "medium").sum()
    ),
    "low_msgs": pd.NamedAgg(
        column="risk_label", aggfunc=lambda labels: (labels == "low").sum()
    ),
    "avg_score": pd.NamedAgg(column="heuristic_score", aggfunc="mean"),
    "max_score": pd.NamedAgg(column="heuristic_score", aggfunc="max"),
    # Most frequent metadata value per candidate, with a per-column fallback.
    "status": pd.NamedAgg(
        column="status", aggfunc=lambda values: _most_frequent(values, "unknown")
    ),
    "registry_type": pd.NamedAgg(
        column="registry_type", aggfunc=lambda values: _most_frequent(values, None)
    ),
    "registration_no": pd.NamedAgg(
        column="registration_no", aggfunc=lambda values: _most_frequent(values, None)
    ),
}

# One row per candidate; reset_index turns the group key back into a column.
cand_summary = (
    df.groupby("candidate_name_norm_simple")
    .agg(**candidate_aggregations)
    .reset_index()
)

# Add risk ratio: share of a candidate's counted messages labelled "high".
cand_summary["high_risk_ratio"] = cand_summary["high_msgs"] / cand_summary["total_msgs"]

# -----------------------
# Group/channel-level risk summary
# -----------------------
# One row per chat_id: message volume, per-label counts, score statistics and
# the number of distinct candidates seen in that chat.
group_summary = (
    df.groupby("chat_id")
    .agg(
        # "count" tallies non-null message_id values per chat.
        total_msgs=pd.NamedAgg(column="message_id", aggfunc="count"),
        high_msgs=pd.NamedAgg(
            column="risk_label", aggfunc=lambda labels: (labels == "high").sum()
        ),
        med_msgs=pd.NamedAgg(
            column="risk_label", aggfunc=lambda labels: (labels == "medium").sum()
        ),
        low_msgs=pd.NamedAgg(
            column="risk_label", aggfunc=lambda labels: (labels == "low").sum()
        ),
        avg_score=pd.NamedAgg(column="heuristic_score", aggfunc="mean"),
        max_score=pd.NamedAgg(column="heuristic_score", aggfunc="max"),
        unique_candidates=pd.NamedAgg(
            column="candidate_name_norm_simple", aggfunc=pd.Series.nunique
        ),
    )
    .reset_index()
    # Share of a chat's counted messages labelled "high", appended as the
    # last column.
    .assign(high_risk_ratio=lambda table: table["high_msgs"] / table["total_msgs"])
)

# -----------------------
# Save results
# -----------------------
# index=False leaves the positional row index out of the CSV files.
cand_summary.to_csv(path_or_buf=output_candidates_path, index=False)
group_summary.to_csv(path_or_buf=output_groups_path, index=False)

# Confirmation messages are printed only after both files were written.
print(f"✅ Saved candidate-level risk summary: {output_candidates_path}")
print(f"✅ Saved group-level risk summary: {output_groups_path}")

# -----------------------
# Show samples
# -----------------------
# Preview the ten rows with the largest high_risk_ratio in each table.
print("\nTop candidates by high-risk ratio:")
print(cand_summary.sort_values(by="high_risk_ratio", ascending=False).iloc[:10])

print("\nTop groups by high-risk ratio:")
print(group_summary.sort_values(by="high_risk_ratio", ascending=False).iloc[:10])


Loaded rows: 4098
✅ Saved candidate-level risk summary: D:\Darryl\Coding\s_p\data\processed\candidates_risk_summary.csv
✅ Saved group-level risk summary: D:\Darryl\Coding\s_p\data\processed\groups_risk_summary.csv

Top candidates by high-risk ratio:
     candidate_name_norm_simple  total_msgs  high_msgs  med_msgs  low_msgs  \
8                       eqwires         117         57         1        59   
11                     intradat         175         66        17        92   
12  intradaymatchsebiregistered         123          7        23        93   
19           sharekhan_official          18          1         3        14   
9               everydayprofits         200          9        38       153   
17                  patelwealth         292         12        38       242   
6                 deltatrading1         137          5         7       125   
20              sharesnservices         193          6        42       145   
24                   thefinberg         168     