In [2]:
import os
import re
import unicodedata
import math
import pandas as pd

# ---------- CONFIG: change these if your files are elsewhere ----------
# Input locations are probed in the order listed; the first one that exists
# on disk is used.
msgs_in_candidates = [
    r"D:\Darryl\Coding\s_p\data\processed\sebi_groups_messages_preprocessed_final.csv",
    "/mnt/data/sebi_groups_messages_preprocessed_final.csv",  # Linux fallback
]

# Destination for the cleaned messages (v2). When the Windows output directory
# is not present on this machine, switch to the Linux location instead.
msgs_out = r"D:\Darryl\Coding\s_p\data\processed\sebi_groups_messages_preprocessed_final_v2.csv"
_msgs_out_dir = os.path.dirname(msgs_out)
if not os.path.exists(_msgs_out_dir):
    msgs_out = "/mnt/data/sebi_groups_messages_preprocessed_final_v2.csv"

# Resolve the input file: first existing candidate, or None when none exist.
msgs_in = next(
    (candidate for candidate in msgs_in_candidates if os.path.exists(candidate)),
    None,
)
if msgs_in is None:
    raise FileNotFoundError(
        "Could not find messages CSV in candidate paths. "
        "Edit msgs_in_candidates to point to your file."
    )

print("Using messages input:", msgs_in)
print("Will write cleaned messages to:", msgs_out)

# ---------- Helper normalization functions (same style as registries) ----------
# Anything that is not a word character, whitespace, or in the Devanagari
# block (U+0900-U+097F) is treated as punctuation.
_PUNCTUATION_RE = re.compile(r"[^\w\s\u0900-\u097F]", flags=re.UNICODE)
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def normalize_text_for_matching(s):
    """Return a lower-cased, punctuation-free form of *s* for name matching.

    Steps: NFKC-normalize, replace punctuation with spaces (Devanagari
    characters are kept), collapse whitespace runs to a single space, trim,
    and lower-case.

    Args:
        s: Any value; it is converted with ``str()``. ``None`` and float NaN
            are treated as missing.

    Returns:
        The normalized string, or ``""`` when *s* is missing.
    """
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return ""
    text = unicodedata.normalize("NFKC", str(s))
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _WHITESPACE_RUN_RE.sub(" ", text)
    # Trim AFTER the punctuation pass: leading/trailing punctuation has just
    # been turned into spaces (e.g. "@user" or "Hello!"), so trimming only
    # the raw input would leave a stray space in the result.
    return text.strip().lower()

# Company-form words removed from names, each optionally followed by a dot.
# The dot is matched AFTER the closing word boundary: a pattern such as
# r"ltd\.\b" can never match "Ltd." at the end of a name, because there is no
# word boundary between "." and a space or the end of the string.
_ORG_SUFFIX_RE = re.compile(
    r"\b(?:pvtltd|pvt|ltd|private|limited|llp|inc|corp|company|co)\b\.?",
    flags=re.I,
)


def simplify_org_suffixes(s):
    """Strip common company-form words (pvt, ltd, llp, inc, ...) from *s*.

    Matching is case-insensitive and whole-word only, so "Costco" is left
    alone while "Co" / "Co." is removed. Whitespace is collapsed and trimmed
    afterwards.

    Args:
        s: The name to simplify. Falsy values (``""``, ``None``) are returned
            unchanged.

    Returns:
        The name without company-form words, or *s* itself when it is falsy.
    """
    if not s:
        return s
    without_suffixes = _ORG_SUFFIX_RE.sub(" ", s)
    return re.sub(r"\s+", " ", without_suffixes).strip()

# ---------- Load messages CSV (be forgiving with dtype) ----------
# low_memory=False asks pandas to infer each column's dtype from the whole
# file rather than chunk by chunk.
df = pd.read_csv(msgs_in, low_memory=False)

_row_count = len(df)
_all_columns = df.columns.tolist()
print("Loaded messages rows:", _row_count)
print("Columns detected (first 60):", _all_columns[:60])

# ---------- Candidate fields detection ----------
# prefer these columns if present; fallback gracefully
def find_col(df, keywords):
    """Return the column of *df* that best matches *keywords*, or ``None``.

    *keywords* is a priority list: earlier entries win over later ones.
    Matching is case-insensitive and done in two passes:

    1. a column whose name equals a keyword exactly;
    2. otherwise, a column whose name contains a keyword.

    Scanning keyword-first (rather than column-first) is what makes the
    priority order effective: with columns ``["message_id", ..., "text"]``
    and keywords ``["text", "message"]`` the result is ``"text"``, not
    whichever matching column happens to come first in the file.

    Args:
        df: Object exposing a ``columns`` iterable (a pandas DataFrame).
        keywords: Keywords in descending priority.

    Returns:
        The original column label, or ``None`` when nothing matches.
    """
    # str() guards against non-string column labels (e.g. integer headers).
    lowered_columns = [(col, str(col).lower()) for col in df.columns]
    lowered_keywords = [str(keyword).lower() for keyword in keywords]

    # Pass 1: exact name match, highest-priority keyword first.
    for keyword in lowered_keywords:
        for col, lowered in lowered_columns:
            if lowered == keyword:
                return col

    # Pass 2: substring match, still honouring keyword priority.
    for keyword in lowered_keywords:
        for col, lowered in lowered_columns:
            if keyword in lowered:
                return col

    return None

# Resolve each logical field to an actual column name (None when absent).
col_sender_username = find_col(df, ["sender_username","username","handle"])
col_sender_first = find_col(df, ["sender_first_name","first_name","firstname","sender_first"])
col_sender_last = find_col(df, ["sender_last_name","last_name","lastname","sender_last"])
col_sender_phone = find_col(df, ["sender_phone","phone","mobile","telephone"])
col_channel = find_col(df, ["channel_norm","channel","group","chat","group_name","channel_name"])
col_text = find_col(df, ["text","message","msg","body"])

# Report what was detected, one line per logical field.
print("Detected columns:")
for _field_label, _detected_col in (
    ("sender_username", col_sender_username),
    ("sender_first", col_sender_first),
    ("sender_last", col_sender_last),
    ("sender_phone", col_sender_phone),
    ("channel/group", col_channel),
    ("text", col_text),
):
    print(f" - {_field_label}:", _detected_col)

# ---------- Build sender_fullname and candidate_name ------
def safe_get(row, col):
    """Return ``row[col]`` as a trimmed string, or ``""`` when unavailable.

    "Unavailable" means: *col* is ``None``, *col* is not a key of *row*, or
    the stored value is missing according to ``pd.isna``.
    """
    if col is None:
        return ""
    if col not in row:
        return ""
    value = row[col]
    return "" if pd.isna(value) else str(value).strip()

# Build sender_fullname
# Stringified first/last name columns ("" for every row when the source column
# was not detected). Missing values are blanked with .where(notna, "") AFTER
# the cast: calling .astype(str).fillna("") does not work, because the cast
# has already turned NaN into the literal text "nan" and nothing is left for
# fillna to fill.
df["sender_first_str"] = (
    df[col_sender_first].astype(str).where(df[col_sender_first].notna(), "")
    if col_sender_first
    else ""
)
df["sender_last_str"] = (
    df[col_sender_last].astype(str).where(df[col_sender_last].notna(), "")
    if col_sender_last
    else ""
)
def combine_fullname(r):
    """Join the row's first and last name with a single space.

    Both parts are read through ``safe_get``; empty parts are skipped, so the
    result is "first last", just one of them, or ``""`` when both are empty.
    """
    first = safe_get(r, col_sender_first)
    last = safe_get(r, col_sender_last)
    return " ".join(part for part in (first, last) if part)

df["sender_fullname"] = df.apply(combine_fullname, axis=1)

# Candidate fields (stringified). Missing values are blanked with
# .where(notna, "") AFTER the cast: .astype(str).fillna("") leaves the literal
# text "nan" in place, which also made the emptiness check below always false.
df["sender_username_str"] = (
    df[col_sender_username].astype(str).where(df[col_sender_username].notna(), "")
    if col_sender_username
    else ""
)
df["channel_norm_str"] = (
    df[col_channel].astype(str).where(df[col_channel].notna(), "")
    if col_channel
    else ""
)
# fallback: some files have 'group' or 'channel' exact names. Used only when
# no row has a non-empty channel value.
if not df["channel_norm_str"].astype(bool).any() and "group" in df.columns:
    df["channel_norm_str"] = df["group"].astype(str).where(df["group"].notna(), "")

# Choose candidate_name per row in priority order:
# 1. username (if non-empty and not a generic 'nan'/'None')
# 2. sender_fullname
# 3. channel/group name
_CANDIDATE_SOURCES = (
    ("sender_username_str", "username"),
    ("sender_fullname", "fullname"),
    ("channel_norm_str", "channel"),
)
# Stringified "missing" markers that must not be accepted as a name.
_PLACEHOLDER_NAMES = frozenset({"nan", "none", "nan.0", "", "na"})


def choose_candidate_name(row):
    """Pick the first usable name from *row* and say where it came from.

    Args:
        row: Mapping-like object with ``.get`` (a dict or a DataFrame row)
            holding ``sender_username_str``, ``sender_fullname`` and
            ``channel_norm_str``.

    Returns:
        ``(value, label)`` where *label* is ``"username"``, ``"fullname"`` or
        ``"channel"``; ``("", "none")`` when no field holds a usable value.
        The value is returned exactly as stored (not trimmed).
    """
    for colname, label in _CANDIDATE_SOURCES:
        val = row.get(colname, "")
        if not val:
            continue
        # Compare the trimmed text so whitespace-only values and padded
        # placeholders (" NaN ") do not block the lower-priority fields.
        if str(val).strip().lower() in _PLACEHOLDER_NAMES:
            continue
        return val, label
    return "", "none"

# Pick the candidate name and its source label for every row. The pairs are
# collected in a plain list and split explicitly; unpacking
# zip(*df.apply(..., axis=1)) breaks when the frame has zero rows.
candidates = [choose_candidate_name(row) for _, row in df.iterrows()]
df["candidate_name"] = [name for name, _ in candidates]
df["candidate_name_source"] = [source for _, source in candidates]

# Normalize candidate name variants
df["candidate_name_norm"] = df["candidate_name"].apply(normalize_text_for_matching)
df["candidate_name_norm_simple"] = df["candidate_name_norm"].apply(simplify_org_suffixes)

# ---------- Summary stats & sample output ----------
total_rows = len(df)
n_candidates = df["candidate_name"].astype(bool).sum()
# Guard the percentage so an empty input file reports 0% instead of raising
# ZeroDivisionError.
pct_with_candidate = (n_candidates / total_rows * 100.0) if total_rows else 0.0
unique_candidates = df["candidate_name_norm_simple"].nunique()

print(f"Messages with at least one candidate name: {n_candidates}/{len(df)} ({pct_with_candidate:.2f}%)")
print("Unique candidate_name_norm_simple values:", unique_candidates)

# Show top candidate names (frequency)
top_candidates = df["candidate_name_norm_simple"].value_counts().head(30).reset_index()
top_candidates.columns = ["candidate_name_norm_simple","count"]
print("\nTop candidate names (sample):")
print(top_candidates.to_string(index=False))

# Write cleaned messages file
df.to_csv(msgs_out, index=False)
print("\nWrote cleaned messages file to:", msgs_out)

# Columns shown in the review outputs. "text" is included only when the input
# actually has such a column, so a differently named message column does not
# abort the run with a KeyError after the main file has been written.
review_cols = [
    c
    for c in ["candidate_name","candidate_name_source","candidate_name_norm","candidate_name_norm_simple","text"]
    if c in df.columns
]

# Save a small sample for quick review (first 500 rows)
sample_path = os.path.join(os.path.dirname(msgs_out), "analysis_sample_messages_candidates.csv")
df[review_cols].head(500).to_csv(sample_path, index=False)
print("Wrote sample for review:", sample_path)

# Show a few example rows where candidate_name exists
examples = df[df["candidate_name"].astype(bool)].head(10)[review_cols]
print("\nExamples (first 10 rows with candidate):")
print(examples.to_string(index=False))


Using messages input: D:\Darryl\Coding\s_p\data\processed\sebi_groups_messages_preprocessed_final.csv
Will write cleaned messages to: D:\Darryl\Coding\s_p\data\processed\sebi_groups_messages_preprocessed_final_v2.csv
Loaded messages rows: 4098
Columns detected (first 60): ['message_id', 'date', 'chat_id', 'sender_id', 'sender_username', 'sender_first_name', 'sender_last_name', 'sender_phone', 'text', 'views', 'forwards', 'reply_to_msg_id', 'media_type', 'has_hyperlink', 'group', 'text_raw', 'text_norm', 'text_clean', 'text_redacted', 'text_for_model', 'urls', 'mentions', 'hashtags', 'cashtags', 'emails', 'phones_maybe', 'emojis', 'is_system_like', 'char_count', 'word_count', 'emoji_count', 'is_promo', 'message_id_norm', 'channel_norm', 'views_norm', 'views_bucket', 'date_parsed_ist', 'hour', 'dow', 'month', 'year', 'trade_action', 'trade_strikes', 'trade_targets', 'trade_stoploss']
Detected columns:
 - sender_username: sender_username
 - sender_first: sender_first_name
 - sender_last: 