In [1]:
import os, re
import pandas as pd
from datetime import datetime
from collections import defaultdict

# ---------- CONFIG: file paths ----------
msgs_in = r"D:\Darryl\Coding\s_p\data\processed\messages_with_verification.csv"
msgs_out = r"D:\Darryl\Coding\s_p\data\processed\messages_with_risk_v1.csv"
sample_out = os.path.join(os.path.dirname(msgs_out), "analysis_sample_messages_risk.csv")

# fallback if path not present (e.g., Linux environment)
if not os.path.exists(os.path.dirname(msgs_out)):
    msgs_in = "/mnt/data/messages_with_verification.csv" if os.path.exists("/mnt/data/messages_with_verification.csv") else "/mnt/data/sebi_groups_messages_preprocessed_final_v2.csv"
    msgs_out = "/mnt/data/messages_with_risk_v1.csv"
    sample_out = "/mnt/data/analysis_sample_messages_risk.csv"

print("Input messages:", msgs_in)
print("Output messages:", msgs_out)

# ---------- Load messages ----------
df = pd.read_csv(msgs_in, low_memory=False)
print("Loaded rows:", len(df))

# choose a text column to analyze; prefer 'text_for_model' then 'text_clean' then 'text'
for col_try in ["text_for_model","text_clean","text","text_norm","text_raw"]:
    if col_try in df.columns:
        text_col = col_try
        break
print("Using text column:", text_col)

# ---------- Scam keyword lists (English + Hindi/Hinglish) ----------
# Keep this list conservative; expand when needed.
SCAM_KEYWORDS = [
    # typical scam phrases/promises
    r"\b100% returns\b", r"\bguaranteed returns\b", r"\bguaranteed\b", r"\bno risk\b", r"\brisk[- ]free\b",
    r"\bget rich\b", r"\bmake money fast\b", r"\bmake quick profits\b", r"\bdouble your money\b",
    r"\btrust me\b", r"\bDM for calls\b", r"\bDM for tips\b", r"\bcontact on whatsapp\b", r"\bwhatsapp\b",
    r"\bjoin my group\b", r"\binside info\b", r"\binsider tips\b", r"\bsecret strategy\b",
    r"\bbuy now\b", r"\bsell now\b", r"\bhot stock\b", r"\bcan't miss\b", r"\bcan't lose\b",
    # payment / collection patterns (UPI strings are probably redacted, but check words)
    r"\bupi\b", r"\bpaytm\b", r"\bphonepe\b", r"\bgpay\b", r"\bgoogle pay\b", r"\bcollect\b", r"\baccount number\b",
    # urgency and FOMO
    r"\blast chance\b", r"\blimited spots\b", r"\blimited time\b", r"\bact now\b", r"\bcall now\b",
    # disclaimers mimicry (fake regulatory claims)
    r"\bSEBI registered\b", r"\bSEBI reg\b", r"\bSEBI registered advisor\b", r"\bregistered with sebi\b",
    # Hindi/Hinglish (simple tokens)
    r"लाख", r"कमाओ", r"सुनहरा मौका", r"100% लाभ", r"बिना जोखिम", r"निश्चित लाभ", r"मुनाफ़ा", r"सुन", r"कमाई",
    r"paise", r"₹", r"rs\.", r"rs "
]

SCAM_KEYWORDS = [re.compile(k, flags=re.IGNORECASE) for k in SCAM_KEYWORDS]

# ---------- Heuristic scoring weights ----------
WEIGHTS = {
    "keyword": 40,        # presence of scam keywords (max)
    "promo": 10,          # is_promo flag
    "unverified": 25,     # candidate not found in SEBI (big bump)
    "trade_signal": 15,   # contains explicit trade_action (calls to trade)
    "phone_url": 10       # presence of phone/url (likely redacted; for completeness)
}
# ensure total possible may exceed 100 — we'll clip to 100 at end.

# helper regexes
url_re = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
phone_re = re.compile(r'(\+91[\-\s]?[6-9]\d{9}|\b[6-9]\d{9}\b|\+?\d{7,15})')

# normalization helper
def text_safe(s):
    if pd.isna(s):
        return ""
    return str(s)

# ---------- detection logic per message ----------
def analyze_message(row):
    text = text_safe(row.get(text_col, ""))
    res = {
        "matched_keywords": [],
        "heuristic_score": 0.0,
        "reasons": []
    }

    # 1) keyword matches
    keyword_hits = []
    for pat in SCAM_KEYWORDS:
        if pat.search(text):
            keyword_hits.append(pat.pattern)
    if keyword_hits:
        # weight scales with number of unique keywords found
        kw_points = min(len(keyword_hits) / 5.0 * WEIGHTS["keyword"], WEIGHTS["keyword"])  # 5 or more keywords => full points
        res["heuristic_score"] += kw_points
        res["matched_keywords"] = keyword_hits
        res["reasons"].append(f"Keywords matched: {', '.join(keyword_hits[:4])}")

    # 2) is_promo feature
    if str(row.get("is_promo", "")).strip().lower() in ("1","true","yes","y","t"):
        res["heuristic_score"] += WEIGHTS["promo"]
        res["reasons"].append("Marked promo")

    # 3) phone / url presence (if not redacted)
    if url_re.search(text):
        res["heuristic_score"] += WEIGHTS["phone_url"]
        res["reasons"].append("Contains URL")
    # phones_maybe column may exist but text phones likely redacted
    if "phones_maybe" in row and not pd.isna(row.get("phones_maybe")) and str(row.get("phones_maybe")).strip():
        res["heuristic_score"] += WEIGHTS["phone_url"]
        res["reasons"].append("Phone-like token present")

    # 4) trade signal (calls to buy/sell with targets) — use trade_action or trade_* fields if present
    trade_action = str(row.get("trade_action", "")).strip().lower()
    if trade_action in ("buy","sell","call","put","long","short"):
        res["heuristic_score"] += WEIGHTS["trade_signal"]
        res["reasons"].append(f"Explicit trade_action: {trade_action}")
    else:
        # fallback: look for "buy" "sell" tokens in text
        if re.search(r'\bbuy\b|\bsell\b|\bexit\b|\bbook profit\b|\bsl\b|\btgt\b', text, flags=re.IGNORECASE):
            res["heuristic_score"] += WEIGHTS["trade_signal"]/2.0
            res["reasons"].append("Trade-like tokens in text")

    # 5) SEBI verification status bump
    status = str(row.get("status", "")).strip().lower()
    if status == "unverified":
        res["heuristic_score"] += WEIGHTS["unverified"]
        res["reasons"].append("Advisor not SEBI-verified")

    # 6) short message length or emoji-heavy promo (suspicious)
    if "emoji_count" in row and not pd.isna(row.get("emoji_count")) and int(float(row.get("emoji_count") or 0)) > 3:
        res["heuristic_score"] += 3
        res["reasons"].append("Many emojis")

    # 7) clamp and finalize
    score = float(res["heuristic_score"])
    score = max(0.0, min(100.0, score))
    res["heuristic_score"] = round(score, 2)

    # risk label
    if score >= 70:
        label = "high"
    elif score >= 35:
        label = "medium"
    else:
        label = "low"
    res["risk_label"] = label

    # short human reasons (limit to 3 bullets)
    if res["reasons"]:
        res["explain"] = " | ".join(res["reasons"][:3])
    else:
        res["explain"] = "No strong heuristics matched"

    return res

# ---------- Run analysis on all messages ----------
print("Analyzing messages (rule-based)...")
out_rows = []
for i, row in df.iterrows():
    a = analyze_message(row)
    out_rows.append({
        "message_id": row.get("message_id", None),
        "heuristic_score": a["heuristic_score"],
        "risk_label": a["risk_label"],
        "matched_keywords": ";".join(a["matched_keywords"]) if a["matched_keywords"] else "",
        "explain": a["explain"]
    })
    if (i+1) % 500 == 0:
        print(f" processed {i+1} / {len(df)}")

df_out = pd.DataFrame(out_rows).set_index("message_id")
# merge back into original df (on message_id)
if "message_id" in df.columns:
    df = df.set_index("message_id").merge(df_out, left_index=True, right_index=True, how="left").reset_index()
else:
    df = pd.concat([df, df_out.reset_index(drop=True)], axis=1)

# ---------- save outputs ----------
df.to_csv(msgs_out, index=False)
df.head(200).to_csv(sample_out, index=False)
print("Saved messages with risk:", msgs_out)
print("Saved sample:", sample_out)

# quick summary
print("\nRisk distribution:")
print(df['risk_label'].value_counts(dropna=False))

print("\nTop messages flagged high (sample):")
print(df[df['risk_label']=='high'][['message_id','candidate_name','candidate_name_norm_simple','risk_label','heuristic_score','explain']].head(15).to_string(index=False))


Input messages: D:\Darryl\Coding\s_p\data\processed\messages_with_verification.csv
Output messages: D:\Darryl\Coding\s_p\data\processed\messages_with_risk_v1.csv
Loaded rows: 4098
Using text column: text_for_model
Analyzing messages (rule-based)...
 processed 500 / 4098
 processed 1000 / 4098
 processed 1500 / 4098
 processed 2000 / 4098
 processed 2500 / 4098
 processed 3000 / 4098
 processed 3500 / 4098
 processed 4000 / 4098
Saved messages with risk: D:\Darryl\Coding\s_p\data\processed\messages_with_risk_v1.csv
Saved sample: D:\Darryl\Coding\s_p\data\processed\analysis_sample_messages_risk.csv

Risk distribution:
risk_label
medium    4550
low        219
high        23
Name: count, dtype: int64

Top messages flagged high (sample):
 message_id  candidate_name candidate_name_norm_simple risk_label  heuristic_score                                                                                      explain
     5606.0   PowerOfStocks              powerofstocks       high             71.

In [None]:

import pandas as pd
import re

# -----------------------
# File paths
# -----------------------
messages_path = r"D:\Darryl\Coding\s_p\data\processed\messages_with_verification.csv"
output_messages_path = r"D:\Darryl\Coding\s_p\data\processed\messages_with_risk_v4.csv"
output_sample_path = r"D:\Darryl\Coding\s_p\data\processed\analysis_sample_messages_risk_v4.csv"

# -----------------------
# Load data
# -----------------------
df = pd.read_csv(messages_path, low_memory=False)
print(f"Loaded rows: {len(df)}")

# -----------------------
# Keyword patterns (English + Hindi/Hinglish)
# -----------------------

# Strong promo/pump signals
promo_keywords = [
    # English
    r"\bjoin now\b", r"\bsubscribe\b", r"\bwhatsapp\b", r"\btelegram\b",
    r"\boffer\b", r"\bprofit\b", r"\bjackpot\b", r"\bbuy now\b", 
    r"\bsure shot\b", r"\bhot stock\b", r"\bcall now\b", r"\bmultibagger\b",
    r"\btarget hits\b", r"\bguarantee\b",

    # Hindi/Hinglish
    r"\bbhai log\b", r"\bsure shot call\b", r"\bpaise double\b",
    r"\blakhpati banoge\b", r"\bkarodpati\b", r"\bkal ka tezi stock\b",
    r"\b100% return\b", r"\bchhupaa hua gem\b", r"\bsuvarna avsar\b",
    r"\bsuvarn avsar\b", r"\btip\b", r"\btelegram join karo\b",
    r"\bsignal\b", r"\bguranteed\b", r"\bcaller id\b"
]

# Trading action tokens
trade_keywords = [
    r"\bbuy\b", r"\bsell\b", r"\btarget\b", r"\bstoploss\b", 
    r"\bintraday\b", r"\bshort term\b", r"\blong term\b",
    r"\bhit tgt\b", r"\btgt\b", r"\bsafe call\b"
]

# Money-related
money_patterns = [
    r"\brs\b", r"\b₹\b", r"\b\d{3,}\b", r"\b1\.5 lakh\b", r"\blakh\b", r"\bcr\b"
]

promo_re = re.compile("|".join(promo_keywords), re.IGNORECASE)
trade_re = re.compile("|".join(trade_keywords), re.IGNORECASE)
money_re = re.compile("|".join(money_patterns), re.IGNORECASE)

# -----------------------
# Scoring function
# -----------------------
def score_message(text, verified_status, confidence):
    if not isinstance(text, str):
        return 0, "No text", "low"

    score = 0
    reasons = []

    # Promo signals (strong weight)
    if promo_re.search(text):
        score += 40
        reasons.append("Promo keywords detected")

    # Trade signals
    trade_matches = len(trade_re.findall(text))
    if trade_matches > 0:
        score += 10 * min(trade_matches, 3)
        reasons.append(f"Trade tokens detected ({trade_matches})")

    # Money references
    if money_re.search(text):
        score += 20
        reasons.append("Money-related token")

    # Phone/WhatsApp-like numbers
    if re.search(r"\+?\d{8,}", text):
        score += 25
        reasons.append("Phone-like token present")

    # Links
    if "http" in text or "www" in text:
        score += 15
        reasons.append("Link detected")

    # -----------------------
    # Confidence-based adjustment
    # -----------------------
    if verified_status == "unverified":
        if confidence >= 90:
            score += 20
            reasons.append("Unverified but very close to registry name (⚠️ suspicious)")
        elif confidence >= 70:
            score += 10
            reasons.append("Unverified but somewhat close to registry name")

    # Bound score
    score = min(score, 100)

    # Map to risk labels
    if score >= 70:
        label = "high"
    elif score >= 40:
        label = "medium"
    else:
        label = "low"

    return score, " | ".join(reasons), label

# -----------------------
# Apply to messages
# -----------------------
risk_scores, explanations, labels = [], [], []

for idx, row in df.iterrows():
    sc, exp, label = score_message(
        row.get("text_for_model", ""),
        row.get("status", "unverified"),
        row.get("confidence", 0)
    )
    risk_scores.append(sc)
    explanations.append(exp)
    labels.append(label)

df["heuristic_score"] = risk_scores
df["explain"] = explanations
df["risk_label"] = labels

# -----------------------
# Save results
# -----------------------
df.to_csv(output_messages_path, index=False)
df.sample(200).to_csv(output_sample_path, index=False)

print(f"✅ Saved messages with risk: {output_messages_path}")
print(f"✅ Saved sample: {output_sample_path}")

# -----------------------
# Summary
# -----------------------
print("\nRisk distribution:")
print(df["risk_label"].value_counts())

print("\nTop messages flagged high (sample):")
print(df[df["risk_label"] == "high"].head(15)[
    ["message_id","candidate_name","candidate_name_norm_simple","risk_label","heuristic_score","explain"]
])
