In [120]:
# %%
import os
from getpass import getpass

# Never paste an API key into a notebook cell: the literal is saved in the
# .ipynb file (and its history) and leaks as soon as the notebook is shared.
# Read the key from the environment and only prompt (without echo) when it is
# not already set, so Restart & Run All still works on a configured machine.
# NOTE(review): the key that used to be hard-coded in this cell must be treated
# as compromised and revoked/rotated.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

In [121]:
# %%

import logging
import pandas as pd
from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple, Dict, List
from difflib import SequenceMatcher
from itertools import combinations
from openai import OpenAI
# Shared OpenAI client used by the LLM review step. It is constructed with no
# arguments, so the SDK is expected to pick the credentials up from the
# environment (OPENAI_API_KEY is set in the first cell).
client = OpenAI()

# Enable logging
# INFO level so pipeline progress messages (e.g. the export path) are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Lookup used by add_comparison_tag: upper-cased, stripped "ConnectLink Status"
# value -> tier digit (as a string) embedded in the hierarchy tag. Statuses
# that are not listed here fall back to "1" at the call site.
# NOTE(review): the business meaning of the "A" / "I" / "U" codes is not
# visible in this notebook — confirm against the CRM export definition.
CONNECT_TIER = {"A": "3", "I": "2", "U": "2", "": "1"}

In [122]:
# %%
# -----------------------------------------------------------------------------
# STEP 1 – Ingestion & Validation
# -----------------------------------------------------------------------------
def load_contacts(file_path: str | Path) -> pd.DataFrame:
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    required = [
        "Account Name", "Full Name", "Email", "Contact Id",
        "Admin Role", "Primary Contact", "Active Contact",
        "ConnectLink Status", "Connect Link Email",
        "# of cases", "# of opps", "Last Activity", "Created Date"
    ]
    df = pd.read_excel(path, engine="openpyxl", parse_dates=["Last Activity","Created Date"])
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    df = df[required]
    # fill blanks
    df = df.fillna({c: "" for c in df.select_dtypes(include=["object","string"]).columns})
    df["Primary Contact"] = df["Primary Contact"].fillna(False)
    # trim whitespace
    for c in ["Account Name","Full Name","Email","Connect Link Email"]:
        df[c] = df[c].astype(str).str.strip().str.replace(r"\s+"," ",regex=True)
    return df

In [123]:
# %%
# -----------------------------------------------------------------------------
# STEP 2 – Hierarchy Tag
# -----------------------------------------------------------------------------
def add_comparison_tag(df: pd.DataFrame, today: Optional[datetime]=None) -> pd.DataFrame:
    """Return a copy of `df` with `is_privileged` and a sortable `hier_tag`.

    `hier_tag` is a pipe-separated string. Cluster resolution picks the
    lexicographically GREATEST tag as the winner, so every field is encoded
    such that a larger value means a better record:

        primary | active | connect | opps | activity | email | created

    * primary  - 1 if "Primary Contact" is truthy, else 0
    * active   - 1 if "Active Contact" equals "active" (case-insensitive)
    * connect  - CONNECT_TIER lookup of the normalised "ConnectLink Status"
                 ("1" for unknown codes, and for demoted rows, see below)
    * opps     - "0Z" (no opps), "1L" (1-3), "2H" (more than 3)
    * activity - "4" (last activity within 365 days), "3" (within 912 days),
                 "2" (older), "1" (no activity date)
    * email    - 1 if "Email" is non-blank
    * created  - zero-padded age in days (older records rank higher)

    The previous encoding used "Z"/"L"/"H" and "1".."4" (1 = most recent),
    which made records with zero opportunities and no activity sort ABOVE
    busy, recently active ones under the descending comparison.

    Rows whose "Admin Role" is owner/admin get the literal tag "PRIV".

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns "Admin Role", "Primary Contact", "Active Contact",
        "ConnectLink Status", "# of opps", "Last Activity", "Email" and
        "Created Date".
    today : datetime, optional
        Reference date for the age calculations; defaults to today at 00:00.
    """
    if today is None:
        today = pd.Timestamp.today().normalize()
    df = df.copy()

    def _text(col: str) -> pd.Series:
        # NaN-safe, stripped string view of a column (all-blank columns are
        # read as float NaN and have no `.str` accessor).
        raw = df[col].astype(object)
        return raw.where(raw.notna(), "").astype(str).str.strip()

    status = _text("ConnectLink Status").str.upper()
    df["is_privileged"] = _text("Admin Role").str.lower().isin({"owner", "admin"})

    primary = df["Primary Contact"].astype(object)
    # NaN is truthy under astype(bool); treat a missing flag as "not primary".
    primary_bit = primary.where(primary.notna(), False).astype(bool).astype(int)
    active_bit = (_text("Active Contact").str.lower() == "active").astype(int)
    connect_tier = status.map(CONNECT_TIER).fillna("1").astype(str)

    # Blank / non-numeric counts are treated as zero instead of crashing.
    opps = (
        pd.to_numeric(df["# of opps"], errors="coerce")
        .fillna(0).astype(int).clip(lower=0)
    )
    opps_rank = pd.cut(
        opps, [-1, 0, 3, float("inf")], labels=["0Z", "1L", "2H"]
    ).astype(str)

    days = (today - pd.to_datetime(df["Last Activity"], errors="coerce")).dt.days
    activity_rank = pd.cut(
        days, [-float("inf"), 365, 912, float("inf")], labels=["4", "3", "2"]
    ).astype(str)
    activity_rank[days.isna()] = "1"

    # Demotion: a "U" status with no opps and stale/absent activity counts as
    # the lowest connect tier. Uses the normalised status so " u" is treated
    # like "U", consistent with the CONNECT_TIER lookup above.
    stale = days.isna() | (days > 912)
    demote = (status == "U") & (opps == 0) & stale
    connect_tier = connect_tier.mask(demote, "1")

    email_bit = _text("Email").ne("").astype(int)
    created = pd.to_datetime(df["Created Date"], errors="coerce")
    crdays = (today - created).dt.days.fillna(0).clip(0, 99999).astype(int)
    created_rank = crdays.astype(str).str.zfill(5)

    df["hier_tag"] = (
        primary_bit.astype(str) + "|" +
        active_bit.astype(str) + "|" +
        connect_tier + "|" +
        opps_rank + "|" +
        activity_rank + "|" +
        email_bit.astype(str) + "|" +
        created_rank
    )
    df.loc[df["is_privileged"], "hier_tag"] = "PRIV"
    return df




In [124]:
# -----------------------------------------------------------------------------
# Helpers for clustering
# -----------------------------------------------------------------------------
def split_email(e: str) -> Tuple[str,str]:
    """Split an e-mail address at its first "@" into (local, domain).

    Always returns a 2-tuple (the previous version returned a list when an
    "@" was present). A string without "@" yields (e, "").
    """
    local, _, domain = e.partition("@")
    return local, domain

def _prep_normalised_fields(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["email_norm"] = df["Email"].str.lower().str.strip()
    df["name_norm"]  = (
        df["Full Name"].str.lower().str.replace(r"[^a-z ]","",regex=True).str.strip()
    )
    df["sfi_key"]    = df["name_norm"].apply(lambda s: f"{s.split()[-1]}_{s[0]}" if s else "")
    df["name_prefix"]= df["name_norm"].str[:2]
    return df

class UnionFind:
    """Disjoint-set forest over arbitrary hashable items.

    Items are registered lazily: anything never passed to `union` is its own
    root. `find` compresses the path it walks; `union` links by rank.
    """

    def __init__(self):
        self.parent = {}
        self.rank = {}

    def find(self, x):
        """Return the representative of the set containing `x`."""
        visited = []
        node = x
        while True:
            above = self.parent.get(node, node)
            if above == node:
                break
            visited.append(node)
            node = above
        # Path compression: point every item on the walked path at the root.
        for member in visited:
            self.parent[member] = node
        return node

    def union(self, a, b):
        """Merge the sets containing `a` and `b` (no-op if already merged)."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a == root_b:
            return
        rank_a = self.rank.get(root_a, 0)
        rank_b = self.rank.get(root_b, 0)
        if rank_a < rank_b:
            self.parent[root_a] = root_b
            return
        self.parent[root_b] = root_a
        if rank_a == rank_b:
            self.rank[root_a] = rank_a + 1

In [125]:
# -----------------------------------------------------------------------------
# STEP 4 – Duplicate-Candidate Generation
# -----------------------------------------------------------------------------
def add_duplicate_cluster_ids(
    df_in: pd.DataFrame,
    name_sim_threshold: int = 95,
    email_edit_distance: int = 1
) -> pd.DataFrame:
    """Group likely-duplicate contacts and label them with `dupe_cluster_id`.

    Within each "Account Name", two rows are linked when any of these hold:
      1. identical normalised e-mail,
      2. identical surname + first-initial key,
      3. same e-mail domain and local parts within `email_edit_distance`
         edits of each other,
      4. same two-letter name prefix, name lengths within 2 characters and a
         token-sorted name similarity of at least `name_sim_threshold`
         (0-100 scale).
    Links are transitive (union-find). Blank e-mails, blank names and blank
    domains never act as a matching key: "unknown" is not "equal", otherwise
    every contact without an e-mail in an account would collapse into one
    cluster.

    The fuzzy comparisons use this notebook's own `levenshtein` helper and
    `difflib.SequenceMatcher`. The previous code called `distance` and
    `fuzz`, which are not imported anywhere in the notebook and raised
    NameError on a fresh kernel.
    NOTE(review): SequenceMatcher's ratio can score slightly lower than
    rapidfuzz's `token_sort_ratio` for the same pair — re-check the default
    threshold of 95 against known duplicates.

    Parameters
    ----------
    df_in : pd.DataFrame
        Needs "Account Name", "Email" and "Full Name". The frame is modified
        in place: the `dupe_cluster_id` column ("C00001", ...) is added to it.
    name_sim_threshold : int
        Minimum name similarity (0-100) for rule 4.
    email_edit_distance : int
        Maximum local-part edit distance for rule 3.

    Returns
    -------
    pd.DataFrame
        `df_in` itself, with the new column.
    """
    df = _prep_normalised_fields(df_in)
    uf = UnionFind()

    def _link_all(labels):
        # Chain every row of an exact-match block to the block's first row.
        for other in labels[1:]:
            uf.union(labels[0], other)

    def _token_sort_ratio(a: str, b: str) -> float:
        # Word-order-insensitive similarity on a 0-100 scale.
        sorted_a = " ".join(sorted(a.split()))
        sorted_b = " ".join(sorted(b.split()))
        return 100.0 * SequenceMatcher(None, sorted_a, sorted_b, autojunk=False).ratio()

    for _, grp in df.groupby("Account Name"):
        # exact email
        for email, blk in grp.groupby("email_norm"):
            if email:
                _link_all(blk.index.tolist())
        # surname+initial
        for key, blk in grp.groupby("sfi_key"):
            if key:
                _link_all(blk.index.tolist())
        # domain-anchored fuzzy email
        domains = grp["email_norm"].str.split("@").str[1].fillna("")
        for dom, sub in grp.groupby(domains):
            if not dom:
                continue  # no "@" / no domain: nothing to anchor on
            for i, j in combinations(sub.index.tolist(), 2):
                li, di = split_email(df.at[i, "email_norm"])
                lj, dj = split_email(df.at[j, "email_norm"])
                if di == dj and levenshtein(li, lj) <= email_edit_distance:
                    uf.union(i, j)
        # name_prefix blocking for fuzzy-name
        for prefix, blk in grp.groupby("name_prefix"):
            if not prefix:
                continue  # blank names would all compare as identical
            for i, j in combinations(blk.index.tolist(), 2):
                if uf.find(i) == uf.find(j):
                    continue
                n1, n2 = df.at[i, "name_norm"], df.at[j, "name_norm"]
                if abs(len(n1) - len(n2)) <= 2 and _token_sort_ratio(n1, n2) >= name_sim_threshold:
                    uf.union(i, j)

    # build cluster ids on same DataFrame
    # Ids are numbered in row order of first appearance of each cluster.
    roots, clusters = {}, []
    cnt = 1
    for i in df.index:
        r = uf.find(i)
        if r not in roots:
            roots[r] = f"C{cnt:05d}"
            cnt += 1
        clusters.append(roots[r])
    df_in["dupe_cluster_id"] = clusters
    return df_in


# %%
def assign_canonical_records(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Annotate a copy of `df_in` with the outcome of each duplicate cluster.

    Three columns are added: `is_canonical`, `canonical_contact_id` and
    `resolution_status`. Per `dupe_cluster_id`:
      * one row only             -> kept, status "single_record";
      * any privileged row       -> every privileged row is kept
        ("keep_privileged"); the others point at the privileged row with the
        earliest "Created Date" ("merge_into_privileged");
      * otherwise the greatest `hier_tag` wins: a unique winner is "keep" and
        the rest "merge"; if several rows share the top tag they are all kept
        as "needs_review" and the rest merge into the first of them.
    """
    out = df_in.copy()

    # The result columns must exist before any cluster writes into them.
    out["is_canonical"] = False
    out["canonical_contact_id"] = None
    out["resolution_status"] = None

    def _keep(rows: pd.DataFrame, status: str) -> None:
        # Kept rows are canonical and point at their own Contact Id.
        out.loc[rows.index, "is_canonical"] = True
        out.loc[rows.index, "canonical_contact_id"] = rows["Contact Id"]
        out.loc[rows.index, "resolution_status"] = status

    def _fold(labels, target_id, status: str) -> None:
        # Folded rows stay non-canonical and point at the surviving record.
        out.loc[labels, "canonical_contact_id"] = target_id
        out.loc[labels, "resolution_status"] = status

    for _, labels in out.groupby("dupe_cluster_id").groups.items():
        members = out.loc[labels]

        if len(members) == 1:
            _keep(members, "single_record")
            continue

        privileged_flag = members["is_privileged"]
        privileged = members[privileged_flag]
        if not privileged.empty:
            oldest = privileged.sort_values("Created Date").iloc[0]
            _keep(privileged, "keep_privileged")
            _fold(members[~privileged_flag].index, oldest["Contact Id"],
                  "merge_into_privileged")
            continue

        ranked = members.sort_values("hier_tag", ascending=False)
        leaders = ranked[ranked["hier_tag"] == ranked.iloc[0]["hier_tag"]]
        followers = ranked.index.difference(leaders.index)
        # More than one leader means the tag could not separate them.
        _keep(leaders, "keep" if len(leaders) == 1 else "needs_review")
        _fold(followers, leaders.iloc[0]["Contact Id"], "merge")

    return out





In [126]:
# %%
# ------------------------------------------------------------------------
# Helper: Levenshtein edit distance (classic DP implementation)
# ------------------------------------------------------------------------
def levenshtein(s: str, t: str) -> int:
    """Edit distance between `s` and `t` (insert, delete, substitute = 1).

    Row-by-row dynamic programme that keeps only the previous row; the
    shorter string is used for the row so memory is O(min(len(s), len(t))).
    """
    longer, shorter = (s, t) if len(s) >= len(t) else (t, s)
    if not shorter:
        return len(longer)

    row = list(range(len(shorter) + 1))
    for i, a in enumerate(longer, start=1):
        nxt = [i]
        for j, b in enumerate(shorter, start=1):
            nxt.append(min(
                nxt[j - 1] + 1,          # insertion
                row[j] + 1,              # deletion
                row[j - 1] + (a != b),   # substitution (free on a match)
            ))
        row = nxt
    return row[-1]


# %%
def apply_email_merge_or_inactivate(df: pd.DataFrame,
                                    max_email_dist: int = 1,
                                    protected_prefix: int = 2) -> pd.DataFrame:
    """
    Re-decide every non-canonical row: merge it into a canonical record of
    its cluster, or mark it "inactive".

    A non-canonical row merges into a canonical row of the same
    `dupe_cluster_id` when:
      • both e-mails have exactly the same domain,
      • the local parts are within `max_email_dist` edits, and
      • if the distance is exactly 1, the edit is NOT inside the first
        `protected_prefix` characters (positions 0 and 1 by default).
    The position rule applies to substitutions, insertions and deletions
    alike; it used to be checked only for same-length substitutions, so
    "jsmith" vs "smith" slipped through. For an insertion/deletion the
    position is taken as the length of the common prefix.
    When several canonical rows qualify, the one with the greatest
    `hier_tag` is chosen. Rows with no qualifying target become "inactive"
    with no `canonical_contact_id`.

    Returns a copy of `df`; it also carries an `email_norm` column.
    Expects `is_canonical`, `dupe_cluster_id`, `hier_tag`, "Contact Id",
    "Email", `resolution_status` and `canonical_contact_id` to be present.
    """
    df = df.copy()
    # Normalize email if not already present
    df["email_norm"] = df["Email"].astype(str).str.lower().str.strip()

    # One NaN-safe mask drives both the canonical lookup and the reset, so a
    # missing flag is consistently treated as "not canonical".
    is_canon = df["is_canonical"].eq(True)
    mask_nc = ~is_canon

    # Reset prior statuses for non-canonical rows
    df.loc[mask_nc, ["resolution_status", "canonical_contact_id"]] = [None, None]

    # Canonical records per cluster:
    # (row label, email_norm, hier_tag, Contact Id, resolution_status)
    can_lookup: Dict[str, List[Tuple[object, str, str, str, str]]] = {}
    for cid, sub in df[is_canon].groupby("dupe_cluster_id"):
        can_lookup[cid] = [
            (idx,
             sub.at[idx, "email_norm"],
             sub.at[idx, "hier_tag"],
             sub.at[idx, "Contact Id"],
             sub.at[idx, "resolution_status"])
            for idx in sub.index
        ]

    def _split(e: str) -> Tuple[str, str]:
        # (local, domain); domain is "" when there is no "@"
        local, _, domain = e.partition("@")
        return local, domain

    def _first_difference(a: str, b: str) -> int:
        # Length of the common prefix = index of the first differing position.
        pos = 0
        for ca, cb in zip(a, b):
            if ca != cb:
                break
            pos += 1
        return pos

    # Process each non-canonical row
    for idx, row in df[mask_nc].iterrows():
        my_local, my_dom = _split(row["email_norm"] or "")
        best_match = None  # (can_idx, can_tag, can_cid, can_stat)

        for can_idx, can_email, can_tag, can_cid, can_stat in can_lookup.get(row["dupe_cluster_id"], []):
            can_local, can_dom = _split(can_email)
            # Only consider merge candidates with exact domain match
            if my_dom != can_dom:
                continue

            dist = levenshtein(my_local, can_local)
            if dist > max_email_dist:
                continue  # too many edits -> not the same mailbox

            # A single edit in the leading characters is treated as a
            # different person, whatever kind of edit it is.
            if dist == 1 and _first_difference(my_local, can_local) < protected_prefix:
                continue

            # Passed all checks -> keep the highest-ranked target
            if best_match is None or can_tag > best_match[1]:
                best_match = (can_idx, can_tag, can_cid, can_stat)

        # Apply merge or inactive
        if best_match:
            _, _, target_cid, target_stat = best_match
            df.at[idx, "canonical_contact_id"] = target_cid
            df.at[idx, "resolution_status"] = (
                "merge_into_privileged"
                if target_stat == "keep_privileged" else
                "merge"
            )
        else:
            df.at[idx, "resolution_status"] = "inactive"
            df.at[idx, "canonical_contact_id"] = None

    return df




In [127]:
# %%
# -----------------------------------------------------------------------------
# STEP 6 – Export Results
# -----------------------------------------------------------------------------
from pathlib import Path
import pandas as pd

def export_dedupe_results(df: pd.DataFrame, out_path: str | Path = "output/deduped.xlsx"):
    """Write the dedupe outcome to an Excel workbook with three sheets.

    * master_contacts - every row, ordered by cluster with canonical rows
      first inside each cluster
    * change_log      - one row per merged record, with "Contact Id"
      renamed to old_contact_id
    * needs_review    - rows whose status is "needs_review"

    The parent directory of `out_path` is created when missing.
    """
    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    report_cols = [
        "Account Name", "Full Name", "Email", "Contact Id",
        "canonical_contact_id", "resolution_status",
        "dupe_cluster_id", "is_canonical", "hier_tag"
    ]
    log_cols = [
        "dupe_cluster_id", "Contact Id",
        "canonical_contact_id", "resolution_status", "hier_tag"
    ]

    sheets = {}
    sheets["master_contacts"] = df[report_cols].sort_values(
        by=["dupe_cluster_id", "is_canonical"],
        ascending=[True, False]
    )

    status = df["resolution_status"]
    was_merged = status.isin({"merge", "merge_into_privileged"})
    sheets["change_log"] = df.loc[was_merged, log_cols].rename(
        columns={"Contact Id": "old_contact_id"}
    )
    sheets["needs_review"] = df[status == "needs_review"][report_cols]

    # Sheets are written in the order they were added above.
    with pd.ExcelWriter(target, engine="openpyxl") as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    logger.info("Wrote dedupe workbook to %s", target.resolve())


In [128]:
# %%
from openai import OpenAI, OpenAIError
import logging

# NOTE(review): `client` and `logger` are already created in the imports cell
# near the top of the notebook; these two lines rebind the same names and are
# only needed when this cell is run on its own.
client = OpenAI()
logger = logging.getLogger(__name__)

def review_contacts_with_llm(df: pd.DataFrame,
                             sample_size: int = 10,
                             name_sim_th: int = 95,
                             email_dist_th: int = 1,
                             random_state: Optional[int] = None) -> str:
    """
    Ask the LLM to look over a random sample of rows for misclassifications.

    Parameters
    ----------
    df : pd.DataFrame
        Needs "Account Name", "Full Name" and "Email"; `dupe_cluster_id` and
        `resolution_status` are included when present.
    sample_size : int
        Maximum number of rows sent to the model.
    name_sim_th, email_dist_th : int
        Thresholds quoted to the model as the pipeline's rules.
    random_state : int, optional
        Seed for the row sample, so a review can be reproduced.

    Returns
    -------
    str
        The model's answer, or "" when there is nothing to sample, the model
        returns no text, or the API call fails (rate limit, quota, ...).

    NOTE(review): the sampled rows include real account names, contact names
    and e-mail addresses and are sent to an external service — confirm this
    is permitted for the data set before running.
    """
    n_rows = min(sample_size, len(df))
    if n_rows <= 0:
        # Nothing to show the model: skip the (billable) API call entirely.
        return ""

    samples = df.sample(n_rows, random_state=random_state).to_dict("records")
    examples_text = "\n".join(
        f"- Account: {r['Account Name']}, Name: {r['Full Name']}, Email: {r['Email']}, "
        f"Cluster: {r.get('dupe_cluster_id','?')}, Status: {r.get('resolution_status','?')}"
        for r in samples
    )
    system = (
        "You are a data engineer reviewing a Python deduplication pipeline. "
        f"Rules: name-sim ≥ {name_sim_th}, email-dist ≤ {email_dist_th}. "
        "Identify potential misclassifications or edge cases."
    )
    user = f"Here are sample records:\n{examples_text}\n\nWhat issues do you see?"

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user",   "content": user}
            ],
            temperature=0.2,
            max_tokens=500
        )
        # `content` may be None; honour the declared `str` return type.
        return resp.choices[0].message.content or ""

    except OpenAIError as e:
        # Best effort by design: the review is advisory, so an API failure
        # must not abort the pipeline.
        logger.warning("LLM review skipped: %s", str(e))
        return ""



In [129]:
# %%
# ── Cell 2: main() definition ──
def main(in_path: str | Path = "../data/duplicate_contacts_small.xlsx",
         out_path: str | Path = "output/deduped_contacts.xlsx",
         llm_sample_size: int = 8):
    """Run the full dedupe pipeline: load, tag, cluster, resolve, review, export.

    Parameters
    ----------
    in_path : str | Path
        Contact export workbook to read.
    out_path : str | Path
        Workbook to write the results to.
    llm_sample_size : int
        Number of rows shown to the LLM reviewer.

    The defaults reproduce the previously hard-coded run, so `main()` behaves
    as before.
    """
    df = load_contacts(in_path)
    df = add_comparison_tag(df)
    df = add_duplicate_cluster_ids(df)
    df = assign_canonical_records(df)
    df = apply_email_merge_or_inactivate(df)

    # LLM review (advisory only; an empty string means it was skipped)
    feedback = review_contacts_with_llm(df, sample_size=llm_sample_size)
    print("\n=== LLM REVIEW ===\n", feedback)

    export_dedupe_results(df, out_path)

# NOTE(review): logging is already configured by the basicConfig call in the
# imports cell; a repeated basicConfig has no effect once the root logger has
# handlers, so this line only matters if that cell was not run.
logging.basicConfig(level=logging.INFO)


# %%
# Execute the pipeline end to end; prints the LLM feedback and writes the
# result workbook.
main()


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.454922 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.783267 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:__main__:Wrote dedupe workbook to C:\Users\Elioa\OneDrive\Projects\data-dedup-pilot\notebooks\output\deduped_contacts.xlsx



=== LLM REVIEW ===
 
