In [2]:
import pandas as pd
from rapidfuzz import fuzz, process  # faster & modern replacement for fuzzywuzzy

# -----------------------
# File paths
# -----------------------
# Inputs: preprocessed group messages plus the two cleaned SEBI registries.
messages_path = r"D:\Darryl\Coding\s_p\data\processed\sebi_groups_messages_preprocessed_final_v2.csv"
ia_path = r"D:\Darryl\Coding\s_p\data\processed\sebi_investment_advisors_cleaned_v2.csv"
ra_path = r"D:\Darryl\Coding\s_p\data\processed\sebi_research_analysts_cleaned_v2.csv"

# Outputs: one row per message, and one row per distinct candidate name.
output_messages_path = r"D:\Darryl\Coding\s_p\data\processed\messages_with_verification.csv"
output_candidates_path = r"D:\Darryl\Coding\s_p\data\processed\candidates_verification_summary.csv"

# -----------------------
# Load data
# -----------------------
# The three CSVs are read in the same order as their path variables are listed.
df_msg, df_ia, df_ra = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (messages_path, ia_path, ra_path)
)

# -----------------------
# Build master registry lookup
# -----------------------
# Tag each registry with its source so a matched row can report which
# registry it came from once both are stacked into one frame.
df_ia['registry_type'] = "IA"
df_ra['registry_type'] = "RA"
df_reg = pd.concat((df_ia, df_ra), ignore_index=True)

# Distinct, non-null normalized names, in order of first appearance.
registry_names = (
    df_reg['name_norm_simple']
    .dropna()
    .drop_duplicates()
    .tolist()
)

# -----------------------
# Alias dictionary (all normalized to simple form)
# -----------------------
# Maps a lower-cased candidate handle to the registry name it should resolve to.
alias_dict = {
    "angeloneadvisory": "angel one limited",
    "angel one": "angel one limited",
    "motilaloswal": "motilal oswal financial services",
    "sharekhan_official": "sharekhan limited",
    "stockphoenix": "stock phoenix",
}

# -----------------------
# Matching function
# -----------------------
def _registry_row(df_reg, name):
    """Return the first ``df_reg`` row whose ``name_norm_simple`` equals *name*.

    The row is returned as a plain dict; ``None`` means no row matched.
    """
    hits = df_reg[df_reg['name_norm_simple'] == name]
    if hits.empty:
        return None
    return hits.iloc[0].to_dict()


def _match_result(match_type, row, confidence):
    """Build the result dict shared by every outcome of ``match_candidate``.

    *row* is a registry row dict, or ``None`` for the no-match result, in
    which case every registry-derived field is ``None``.
    """
    if row is None:
        return {
            "match_type": match_type,
            "matched_name": None,
            "registration_no": None,
            "registry_type": None,
            "confidence": confidence,
            "days_to_expiry": None
        }
    return {
        "match_type": match_type,
        "matched_name": row["name_norm_simple"],
        "registration_no": row["registration_no"],
        "registry_type": row["registry_type"],
        "confidence": confidence,
        # The expiry column is optional in the registry, hence .get().
        "days_to_expiry": row.get("days_to_expiry", None)
    }


def match_candidate(candidate: str, registry_names, df_reg, threshold=85, aliases=None):
    """Resolve a candidate name against the registry.

    Matching is attempted in order: alias override, exact match on
    ``name_norm_simple``, then fuzzy match (``fuzz.token_sort_ratio``) against
    ``registry_names``. The candidate is lower-cased and stripped first.

    Args:
        candidate: Name to look up.
        registry_names: Sequence of registry names used for fuzzy matching.
        df_reg: Registry DataFrame with ``name_norm_simple``,
            ``registration_no`` and ``registry_type`` columns
            (``days_to_expiry`` is optional).
        threshold: Minimum fuzzy score (inclusive) to accept a fuzzy match.
        aliases: Optional mapping of lower-cased candidate -> registry name.
            Defaults to the module-level ``alias_dict``.

    Returns:
        ``None`` if *candidate* is not a string or is blank. Otherwise a dict
        with keys ``match_type`` ("alias", "exact", "fuzzy" or "none"),
        ``matched_name``, ``registration_no``, ``registry_type``,
        ``confidence`` and ``days_to_expiry``.
    """
    if not isinstance(candidate, str) or candidate.strip() == "":
        return None

    if aliases is None:
        aliases = alias_dict

    candidate_lower = candidate.lower().strip()

    # 0. Alias dictionary override
    if candidate_lower in aliases:
        row = _registry_row(df_reg, aliases[candidate_lower].lower().strip())
        # An alias whose target is absent from the registry falls through to
        # the exact/fuzzy steps instead of failing.
        if row is not None:
            return _match_result("alias", row, 100)

    # 1. Exact match
    row = _registry_row(df_reg, candidate_lower)
    if row is not None:
        return _match_result("exact", row, 100)

    # 2. Fuzzy match
    # extractOne returns None when there are no choices, so an empty registry
    # must not be unpacked as a (match, score, index) tuple. len() is used
    # rather than truthiness so array-like inputs work too.
    if len(registry_names) > 0:
        best = process.extractOne(candidate_lower, registry_names, scorer=fuzz.token_sort_ratio)
        if best is not None and best[1] >= threshold:
            row = _registry_row(df_reg, best[0])
            # If registry_names is out of sync with df_reg there is no row to
            # report; treat that as no match rather than raising IndexError.
            if row is not None:
                return _match_result("fuzzy", row, best[1])

    # 3. No match
    return _match_result("none", None, 0)

# -----------------------
# Apply matching to candidate names
# -----------------------
# Columns of the candidate summary, in the order they are written to CSV.
# Declared up front so the frame has these columns even when no candidate
# produced a result (otherwise the column selection in the merge would fail).
candidate_columns = [
    "match_type", "matched_name", "registration_no", "registry_type",
    "confidence", "days_to_expiry", "candidate_name_norm_simple", "status",
]

results = []
for cand in df_msg['candidate_name_norm_simple'].dropna().unique():
    res = match_candidate(cand, registry_names, df_reg)
    if res is None:
        # match_candidate returns None for blank or non-string candidates.
        # Skip them: with the left merge below, their messages simply get
        # NaN in the verification columns instead of crashing the run.
        continue
    res["candidate_name_norm_simple"] = cand

    # status assignment
    if res["match_type"] == "none":
        res["status"] = "unverified"
    elif pd.notna(res["days_to_expiry"]) and res["days_to_expiry"] < 0:
        res["status"] = "expired"
    else:
        # Matched and either not yet expired or no expiry information;
        # a missing expiry is treated as active.
        res["status"] = "active"
    results.append(res)

df_candidates = pd.DataFrame(results, columns=candidate_columns)

# -----------------------
# Merge back into messages
# -----------------------
# Left merge keeps every message; messages without a candidate result get NaN.
df_msg = df_msg.merge(
    df_candidates[["candidate_name_norm_simple","match_type","matched_name","registration_no","registry_type","confidence","days_to_expiry","status"]],
    on="candidate_name_norm_simple",
    how="left"
)

# -----------------------
# Save results
# -----------------------
df_msg.to_csv(output_messages_path, index=False)
df_candidates.to_csv(output_candidates_path, index=False)

print(f"✅ Saved message-level verification: {output_messages_path}")
print(f"✅ Saved candidate-level summary: {output_candidates_path}")

print("\nSample candidate verification:")
print(df_candidates.head(15))


✅ Saved message-level verification: D:\Darryl\Coding\s_p\data\processed\messages_with_verification.csv
✅ Saved candidate-level summary: D:\Darryl\Coding\s_p\data\processed\candidates_verification_summary.csv

Sample candidate verification:
   match_type                      matched_name registration_no registry_type  \
0        none                              None            None          None   
1        none                              None            None          None   
2        none                              None            None          None   
3        none                              None            None          None   
4        none                              None            None          None   
5       exact                          equity99    INA000005358            IA   
6        none                              None            None          None   
7        none                              None            None          None   
8        none                  