In [82]:
# %% Cell 0 – Make the OpenAI API key available to the client
import os
from getpass import getpass

# SECURITY: a literal API key used to be pasted into this cell. Any key that has
# ever been saved in a notebook must be treated as leaked: revoke it in the
# OpenAI dashboard and issue a new one. Secrets must never live in source.
#
# The key is taken from the environment (set OPENAI_API_KEY before launching
# Jupyter, or load it from an untracked .env file). Only when it is absent do we
# fall back to a hidden interactive prompt, so the value never reaches the
# notebook file or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [83]:


# %% Cell 1 – Imports, constants, and client setup
import logging                              # Standard Python logging
import pandas as pd                         # DataFrames & Excel I/O
from pathlib import Path                    # Filesystem paths
from datetime import datetime               # Working with dates
from typing import Optional, Tuple, Dict, List
from difflib import SequenceMatcher         # Name‐similarity metric
from itertools import combinations          # Pairwise loops for clustering
from openai import OpenAI, OpenAIError      # OpenAI v1 client & errors
from rapidfuzz import fuzz, distance

# Instantiate the OpenAI client (will read OPENAI_API_KEY from the environment).
# NOTE(review): none of the pipeline cells visible in this part of the notebook
# reference `client`; confirm it is still needed before keeping the dependency
# on an API key for a run of the dedupe pipeline.
client = OpenAI()

# Emit INFO-and-above records through the root handler, and give this
# notebook its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ConnectLink status code → tier used inside hier_tag.
# Tiers are kept as strings so the composed tag can be compared lexicographically.
CONNECT_TIER = {
    "A": "3",
    "I": "2",
    "U": "2",
    "":  "1",   # blank status
}

In [106]:
# %% Cell 1 – Ingestion for your exact headers
import pandas as pd
from pathlib import Path

def load_contacts(file_path: str | Path) -> pd.DataFrame:
    """
    1) Read the sheet as-is.
    2) Rename exactly your 14 incoming headers to the 12 the pipeline expects.
    3) Synthesize Full Name from First + Last Name.
    4) Validate the required set.
    5) Normalize only those required fields; preserve all others.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    # 1) Read everything (dates as strings for now)
    df = pd.read_excel(path, engine="openpyxl", dtype=str)

    # 2) Exact rename map
    rename_map = {
        "Account Name: Acct_ID_18": "Account Name",
        "Contact_id_18":            "Contact Id",
        "Primary Contact Any":      "Primary Contact",
        "Agile Contact Email":      "Connect Link Email",
        "# of Cases":               "# of cases",
        "# of Opps":                "# of opps"
        # Leave "First Name","Last Name","Email","Admin Role","Active Contact",
        # "ConnectLink Status","Last Activity","Created Date" unchanged.
    }
    df = df.rename(columns=rename_map)

    # 3) Build Full Name if missing
    if "Full Name" not in df.columns:
        if {"First Name","Last Name"}.issubset(df.columns):
            df["Full Name"] = (
                df["First Name"].fillna("").str.strip() + " " +
                df["Last Name"].fillna("").str.strip()
            ).str.strip()
        else:
            raise ValueError("Missing both Full Name and First Name + Last Name")

    # 4) Coerce the two date columns, if present
    for dt in ["Last Activity","Created Date"]:
        if dt in df.columns:
            df[dt] = pd.to_datetime(df[dt], errors="coerce")

    # 5) Validate the canonical 12 exist
    required = [
        "Account Name", "Full Name", "Email", "Contact Id",
        "Admin Role", "Primary Contact", "Active Contact",
        "ConnectLink Status", "Connect Link Email",
        "# of cases", "# of opps", "Last Activity", "Created Date"
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # 6) Normalize only those 12:
    #    - Fill text→"" and trim spaces
    #    - Map Primary Contact → boolean
    for c in required:
        if df[c].dtype == object:
            df[c] = (
                df[c]
                  .fillna("")
                  .astype(str)
                  .str.strip()
                  .str.replace(r"\s+", " ", regex=True)
            )
    df["Primary Contact"] = df["Primary Contact"].map(
        lambda x: str(x).lower() in {"true","1","yes"}
    )

    return df


In [107]:
# %% Cell 2 – Revised STEP 2: build hier_tag with combined email bit, blank‐dates=best
import pandas as pd
from datetime import datetime

def add_comparison_tag(df: pd.DataFrame, today: datetime|None=None) -> pd.DataFrame:
    """
    Add the two columns later stages rank contacts by.

    Adds:
      • is_privileged – True when "Admin Role" is owner/admin (case-insensitive)
      • hier_tag      – "primary|active|connect|opps|activity|email|created",
                        or the literal "PRIV" for privileged rows

    Tag fields:
      - primary  : 1 if "Primary Contact" is truthy, else 0
      - active   : 1 if "Active Contact" equals "active" (case-insensitive)
      - connect  : CONNECT_TIER of "ConnectLink Status" (unknown → "1");
                   a "U" status with zero opps and activity tier "3" is demoted to "1"
      - opps     : "Z" (0), "L" (1–3), "H" (4+) from "# of opps"
      - activity : "1" (≤365 days or blank), "2" (≤912 days), "3" (older)
      - email    : 1 if either Email or Connect Link Email is non-blank
      - created  : days since "Created Date", clipped to 0–99999, zero-padded
                   to 5 digits (blank date → 00000)

    Parameters
    ----------
    df : pd.DataFrame
        Contacts as produced by the ingestion step. Not modified.
    today : datetime | None
        Reference date for the age calculations; defaults to today at midnight.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with "is_privileged" and "hier_tag" appended and the
        two date columns coerced to datetime.

    NOTE(review): assign_canonical_records picks winners by sorting hier_tag
    in DESCENDING order. Under that sort "Z" (zero opps) outranks "L"/"H", and
    activity tier "3" (stalest) outranks "1" (most recent / blank), which looks
    inverted relative to the primary/active/connect/email fields where the
    larger value is the better one. The encoding is left as-is here because the
    tag strings are exported; confirm the intended ordering before changing it.
    """
    if today is None:
        today = pd.Timestamp.today().normalize()
    df = df.copy()

    # ensure dates are datetime (unparseable values become NaT)
    df["Last Activity"] = pd.to_datetime(df["Last Activity"], errors="coerce")
    df["Created Date"]  = pd.to_datetime(df["Created Date"],  errors="coerce")

    # privileged
    df["is_privileged"] = df["Admin Role"].str.lower().str.strip().isin({"owner", "admin"})

    # bits
    df["primary_bit"] = df["Primary Contact"].astype(bool).astype(int)
    df["active_bit"]  = (df["Active Contact"].str.lower().str.strip() == "active").astype(int)

    # Normalise the status once so the tier lookup and the demotion rule below
    # agree on what counts as "U" (the original compared an unstripped value).
    connect_status = df["ConnectLink Status"].str.upper().str.strip()
    df["connect_tier"] = connect_status.map(CONNECT_TIER).fillna("1")

    # opps bucket
    # The column is read as text, so blanks arrive as "" and counts may look
    # like "3.0"; a bare astype(int) raises on both. Coerce numerically, treat
    # anything unparseable as 0, and floor negatives at 0 so every row falls
    # inside a bucket.
    opps = (
        pd.to_numeric(df["# of opps"], errors="coerce")
          .fillna(0)
          .clip(lower=0)
          .astype(int)
    )
    df["opps_bucket"] = pd.cut(opps, [-1, 0, 3, float("inf")], labels=["Z", "L", "H"]).astype(str)

    # activity tier (blank→tier 1)
    days = (today - df["Last Activity"]).dt.days
    df["activity_tier"] = pd.cut(
        days, [-float("inf"), 365, 912, float("inf")], labels=["1", "2", "3"]
    ).astype(str)
    df.loc[days.isna(), "activity_tier"] = "1"

    # U-tier demotion: unused ConnectLink + no opps + stale activity
    mask_demote = (
        (connect_status == "U") &
        (df["opps_bucket"] == "Z") &
        (df["activity_tier"].isin({"3"}))
    )
    df.loc[mask_demote, "connect_tier"] = "1"

    # combined email presence bit
    df["email_bit"] = (
        df["Email"].astype(str).str.strip().ne("") |
        df["Connect Link Email"].astype(str).str.strip().ne("")
    ).astype(int)

    # created rank: age in days, fixed width so string comparison is numeric
    crdays = (today - df["Created Date"]).dt.days.fillna(0).clip(0, 99999).astype(int)
    df["created_rank"] = crdays.astype(str).str.zfill(5)

    # compose tag
    df["hier_tag"] = (
        df["primary_bit"].astype(str) + "|" +
        df["active_bit"].astype(str) + "|" +
        df["connect_tier"] + "|" +
        df["opps_bucket"] + "|" +
        df["activity_tier"] + "|" +
        df["email_bit"].astype(str) + "|" +
        df["created_rank"]
    )

    # privileged override
    df.loc[df["is_privileged"], "hier_tag"] = "PRIV"

    # the intermediate fields only exist to build the tag
    return df.drop(columns=[
        "primary_bit", "active_bit", "connect_tier",
        "opps_bucket", "activity_tier", "email_bit", "created_rank"
    ])


In [110]:
# %% Cell X – Helpers for clustering (override your old _prep_normalised_fields)
import pandas as pd

def _prep_normalised_fields(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add exactly these helper columns:
      • email_norm   – primary Email lower/trimmed
      • connect_norm – Agile/ConnectLink Email lower/trimmed
      • name_norm    – Full Name lower, letters+spaces only
      • sfi_key      – surname + '_' + first initial
    """
    df = df.copy()
    # 1) primary email normalized
    df["email_norm"]   = df["Email"].astype(str).str.lower().str.strip()
    # 2) agile/contact‐link email normalized
    df["connect_norm"] = df["Connect Link Email"].astype(str).str.lower().str.strip()
    # 3) full name cleaned
    df["name_norm"]    = (
        df["Full Name"]
          .astype(str)
          .str.lower()
          .str.replace(r"[^a-z ]","",regex=True)
          .str.strip()
    )
    # 4) surname+initial blocking key
    def make_sfi(s: str) -> str:
        parts = s.split()
        return f"{parts[-1]}_{parts[0][0]}" if len(parts) >= 2 else ""
    df["sfi_key"]      = df["name_norm"].apply(make_sfi)
    return df



In [None]:
# %% Cell Y – Revised STEP 4: duplicate clustering in your exact order
import pandas as pd
from rapidfuzz import fuzz, distance
from itertools import combinations

def _union_exact_matches(uf, grp: pd.DataFrame, column: str) -> None:
    """Union all rows of ``grp`` that share the same non-blank value in ``column``."""
    for key, block in grp.groupby(column):
        # A blank key means "value unknown", not "values equal". Without this
        # guard every contact in the account that lacks the field (e.g. no
        # ConnectLink e-mail, or a single-word name with an empty sfi_key)
        # would be chained into one cluster.
        if not str(key).strip():
            continue
        ids = list(block.index)
        for other in ids[1:]:
            uf.union(ids[0], other)


def add_duplicate_cluster_ids(
    df_in: pd.DataFrame,
    name_sim_threshold: int = 95,
    email_edit_distance: int = 1
) -> pd.DataFrame:
    """
    Assign a ``dupe_cluster_id`` to every contact.

    Within each "Account Name", rows are linked when any rule fires; linked
    rows (transitively) form one cluster:
      1) exact primary email
      2) exact agile / ConnectLink email
      3) exact normalised full name
      4) same surname + first initial
      5) same email domain and local parts within ``email_edit_distance`` edits
      6) fuzzy name: rapidfuzz ``token_sort_ratio`` >= ``name_sim_threshold``
    Blank values never count as a match for any rule.

    Parameters
    ----------
    df_in : pd.DataFrame
        Contacts with "Account Name", "Email", "Connect Link Email" and
        "Full Name". Not modified.
    name_sim_threshold : int
        Minimum token_sort_ratio (0–100) for rule 6.
    email_edit_distance : int
        Maximum Levenshtein distance between local parts for rule 5.

    Returns
    -------
    pd.DataFrame
        A copy of ``df_in`` with "dupe_cluster_id" ("C00001", "C00002", …,
        numbered in row order of first appearance). Helper columns used for
        matching are not included.

    Notes
    -----
    Relies on a ``UnionFind`` class (``union`` / ``find``) that is not defined
    in the visible part of this notebook — make sure the cell defining it has
    been run. Row labels are used as union-find keys, so the index is assumed
    to be unique.
    """
    df = _prep_normalised_fields(df_in)
    uf = UnionFind()

    for _, grp in df.groupby("Account Name"):
        idxs = list(grp.index)

        # 1)–4) exact-key blocks: primary email, agile email, full name,
        #        surname-first-initial
        for column in ("email_norm", "connect_norm", "name_norm", "sfi_key"):
            _union_exact_matches(uf, grp, column)

        # 5) near-identical local part on the same domain.
        #    Addresses lacking either a local part or a domain are skipped:
        #    two blank e-mails are distance 0 apart but say nothing about identity.
        pieces = grp["email_norm"].str.split("@")
        local = pieces.str[0].fillna("")
        domain = pieces.str[1].fillna("")
        usable = local.ne("") & domain.ne("")
        for _, same_domain in grp[usable].groupby(domain[usable]):
            for i, j in combinations(list(same_domain.index), 2):
                if distance.Levenshtein.distance(local.at[i], local.at[j]) <= email_edit_distance:
                    uf.union(i, j)

        # 6) fuzzy full name, only for pairs not already linked
        names = grp["name_norm"]
        for i, j in combinations(idxs, 2):
            if uf.find(i) == uf.find(j):
                continue
            n1, n2 = names.at[i], names.at[j]
            if not n1 or not n2:
                continue  # a missing name cannot be "similar" to anything
            if fuzz.token_sort_ratio(n1, n2) >= name_sim_threshold:
                uf.union(i, j)

    # assign stable cluster IDs in order of first appearance
    root2cid = {}
    clusters = []
    for i in df.index:
        root = uf.find(i)
        if root not in root2cid:
            root2cid[root] = f"C{len(root2cid) + 1:05d}"
        clusters.append(root2cid[root])

    # Return a new frame instead of writing into the caller's, matching the
    # other pipeline stages (all of which work on a copy).
    out = df_in.copy()
    out["dupe_cluster_id"] = clusters
    return out

In [150]:
def assign_canonical_records(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Decide, for every contact, whether it is kept, merged or inactivated.

    Works on a copy of ``df_in`` and adds ``email_norm``, ``name_norm``,
    ``is_canonical``, ``canonical_contact_id`` and ``resolution_status``.
    Requires "Account Name", "Full Name", "Email", "Contact Id",
    "Created Date", "is_privileged", "dupe_cluster_id" and "hier_tag".

    PASS 1 – per account, for each normalised name shared by several rows that
             carry more than one distinct e-mail: privileged rows become
             "keep_privileged"; every other row becomes "inactive" (if the
             group has no privileged row, all of its rows end up inactive).
    PASS 2 – per dupe cluster, for rows PASS 1 left unresolved, in order:
             a) non-privileged rows sharing a non-blank e-mail: earliest
                created is "keep_email_dedup", the rest "merge_email_dedup";
             b) if privileged rows remain: all are "keep_privileged", others
                "merge_into_privileged" (target = earliest-created privileged);
             c) a lone remaining row is "single_record";
             d) otherwise the highest hier_tag wins ("keep", losers "merge");
                on a tie all tied rows are "keep_tie" and the rest merge into
                the first tied row.
    """
    df = df_in.copy()

    # matching keys
    df["email_norm"] = df["Email"].astype(str).str.lower().str.strip()
    df["name_norm"] = (
        df["Full Name"]
          .astype(str)
          .str.lower()
          .str.replace(r"[^a-z ]", "", regex=True)
          .str.strip()
    )

    # output columns start out "undecided"
    df["is_canonical"]         = False
    df["canonical_contact_id"] = None
    df["resolution_status"]    = None

    def mark(label, canonical, target, status):
        # record one row's verdict
        df.at[label, "is_canonical"] = canonical
        df.at[label, "canonical_contact_id"] = target
        df.at[label, "resolution_status"] = status

    def earliest_created(rows: pd.DataFrame) -> pd.Series:
        # row with the smallest "Created Date"
        return rows.sort_values("Created Date").iloc[0]

    # -----------------------------
    # PASS 1: same name, different e-mails
    # -----------------------------
    for _acct, account_labels in df.groupby("Account Name").groups.items():
        account_rows = df.loc[account_labels]
        for _name, same_name in account_rows.groupby("name_norm").groups.items():
            if len(same_name) <= 1:
                continue
            if df.loc[same_name, "email_norm"].nunique() <= 1:
                continue

            namesakes = df.loc[same_name]
            owners = namesakes[df.loc[same_name, "is_privileged"]].index.tolist()

            for label in owners:
                mark(label, True, df.at[label, "Contact Id"], "keep_privileged")
            for label in same_name:
                if label not in owners:
                    mark(label, False, None, "inactive")

    # -----------------------------
    # PASS 2: resolve what is left, cluster by cluster
    # -----------------------------
    for _cluster, members in df.groupby("dupe_cluster_id"):
        unresolved = members[members["resolution_status"].isna()]
        if unresolved.empty:
            continue

        # a) collapse non-privileged rows that share a non-blank e-mail
        regular = unresolved[~unresolved["is_privileged"]]
        for address, sharing in regular.groupby("email_norm"):
            if not address or len(sharing) <= 1:
                continue
            keeper = earliest_created(sharing)
            keeper_id = keeper["Contact Id"]
            mark(keeper.name, True, keeper_id, "keep_email_dedup")
            for label in sharing.index.difference([keeper.name]):
                mark(label, False, keeper_id, "merge_email_dedup")

        # re-read the cluster from df to see what step (a) settled
        current = df.loc[members.index]
        settled = current[df.loc[members.index, "resolution_status"].notna()].index
        pending = members.index.difference(settled)
        if pending.empty:
            continue
        sub = df.loc[pending]

        # b) privileged rows always survive; everyone else folds into the oldest
        owners = sub[sub["is_privileged"]]
        if not owners.empty:
            anchor_id = earliest_created(owners)["Contact Id"]
            for label in owners.index:
                mark(label, True, df.at[label, "Contact Id"], "keep_privileged")
            for label in sub.index.difference(owners.index):
                mark(label, False, anchor_id, "merge_into_privileged")
            continue

        # c) nothing to compete against
        if len(sub) == 1:
            only = sub.index[0]
            mark(only, True, df.at[only, "Contact Id"], "single_record")
            continue

        # d) rank by hier_tag, highest first
        ranked = sub.sort_values("hier_tag", ascending=False)
        leaders = ranked[ranked["hier_tag"] == ranked.iloc[0]["hier_tag"]]
        merge_target = leaders.iloc[0]["Contact Id"]

        if len(leaders) == 1:
            mark(leaders.index[0], True, merge_target, "keep")
        else:
            # tie → every tied row stays canonical under its own id
            for label in leaders.index:
                mark(label, True, df.at[label, "Contact Id"], "keep_tie")

        # everyone below the top tag merges into the first leader
        for label in ranked.index.difference(leaders.index):
            mark(label, False, merge_target, "merge")

    return df



In [151]:
# %% Cell 7 – Simplified STEP 5a: allow any one‑char email diffs
def apply_email_merge_or_inactivate(df: pd.DataFrame,
                                    max_email_dist: int = 1) -> pd.DataFrame:
    """
    Merge rules for non-canonical rows:
      1) blank email anywhere → wildcard merge
      2) exact match         → merge
      3) Levenshtein ≤ 1     → merge (no forbidden‐index checks)
      4) else                → inactive

    Owner/Admin (is_privileged) always stay canonical.
    """
    df = df.copy()

    # 0) Protect privileged rows
    priv_mask = df["is_privileged"].fillna(False)
    df.loc[priv_mask, "is_canonical"]         = True
    df.loc[priv_mask, "resolution_status"]    = "keep_privileged"
    df.loc[priv_mask, "canonical_contact_id"] = df.loc[priv_mask, "Contact Id"]

    # normalize email
    df["email_norm"] = df["Email"].astype(str).str.lower().str.strip()

    # target only non‑canonical, non‑privileged
    mask_nc = (~df["is_canonical"].fillna(False)) & (~priv_mask)
    df.loc[mask_nc, ["resolution_status","canonical_contact_id"]] = [None, None]

    # build lookup of canonicals per cluster
    can_lookup = {}
    for cid, sub in df[df["is_canonical"]].groupby("dupe_cluster_id"):
        can_lookup[cid] = [
            (idx,
             sub.at[idx, "email_norm"],
             sub.at[idx, "hier_tag"],
             sub.at[idx, "Contact Id"],
             sub.at[idx, "resolution_status"])
            for idx in sub.index
        ]

    # inline Levenshtein (same DP as before)
    def levenshtein(a: str, b: str) -> int:
        m, n = len(a), len(b)
        if m < n:
            return levenshtein(b, a)
        if n == 0:
            return m
        prev = list(range(n+1))
        for i, ca in enumerate(a, start=1):
            curr = [i] + [0]*n
            for j, cb in enumerate(b, start=1):
                ins = curr[j-1] + 1
                dele= prev[j]   + 1
                rep = prev[j-1] + (ca != cb)
                curr[j] = min(ins, dele, rep)
            prev = curr
        return prev[n]

    # process each non‑canonical row
    for idx, row in df[mask_nc].iterrows():
        me = row["email_norm"] or ""
        candidates = can_lookup.get(row["dupe_cluster_id"], [])
        best = None

        # CASE 1: blank wildcard
        if me == "":
            best = max(candidates, key=lambda x: x[2], default=None)
        else:
            for can_idx, ce, tag, cid_val, stat in candidates:
                # wildcard if canonical blank
                if ce == "":
                    best = (can_idx, ce, tag, cid_val, stat)
                    break
                # exact match
                if me == ce:
                    best = (can_idx, ce, tag, cid_val, stat)
                    break
                # one‐char off anywhere
                if levenshtein(me, ce) <= max_email_dist:
                    best = (can_idx, ce, tag, cid_val, stat)
                    break

        # apply merge or inactive
        if best:
            _, _, _, tgt_cid, tgt_stat = best
            df.at[idx, "canonical_contact_id"] = tgt_cid
            df.at[idx, "resolution_status"]    = (
                "merge_into_privileged" if tgt_stat == "keep_privileged" else "merge"
            )
        else:
            df.at[idx, "canonical_contact_id"] = None
            df.at[idx, "resolution_status"]    = "inactive"

    return df


In [152]:
# -----------------------------------------------------------------------------
# STEP 6 – Export Results (preserve all original columns + add dedupe columns)
# -----------------------------------------------------------------------------
from pathlib import Path
import pandas as pd

def export_dedupe_results(df: pd.DataFrame,
                          out_path: str | Path = "output/deduped.xlsx"):
    """
    Write the dedupe results to an Excel workbook with three sheets.

      • master_contacts – every row and column of ``df``; the dedupe output
        fields are moved to the end, all other columns keep their order. Rows
        are sorted by cluster, canonical rows first.
      • change_log      – one row per record whose status is "merge" or
        "merge_into_privileged" (old id → canonical id).
      • needs_review    – rows whose status is "needs_review".

    Parameters
    ----------
    df : pd.DataFrame
        Fully processed contacts; must contain "Contact Id" and the five
        dedupe columns listed below.
    out_path : str | Path
        Destination .xlsx file; parent directories are created as needed.

    Raises
    ------
    KeyError
        If any column needed for the export is missing from ``df``.
    """
    dedupe_cols = [
        "canonical_contact_id",
        "resolution_status",
        "dupe_cluster_id",
        "is_canonical",
        "hier_tag"
    ]

    # Fail early with a readable message (and before touching the filesystem)
    # rather than deep inside a column selection.
    missing = [c for c in ["Contact Id", *dedupe_cols] if c not in df.columns]
    if missing:
        raise KeyError(f"Cannot export dedupe results; missing columns: {missing}")

    path = Path(out_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # 1) Master column order: everything that is not a dedupe field, in its
    #    current order, followed by the dedupe fields. (Starting from
    #    list(df.columns) and appending "those not already present" never
    #    appended anything, so the fields were not actually moved to the end.)
    leading_cols = [c for c in df.columns if c not in dedupe_cols]
    final_master_cols = leading_cols + dedupe_cols

    # 2) Build master sheet
    master_sheet = df[final_master_cols].sort_values(
        by=["dupe_cluster_id", "is_canonical"],
        ascending=[True, False]
    )

    # 3) Change‑log – one row per merged record
    merge_mask = df["resolution_status"].isin({"merge", "merge_into_privileged"})
    change_log = (
        df.loc[merge_mask, ["dupe_cluster_id", "Contact Id",
                            "canonical_contact_id", "resolution_status", "hier_tag"]]
          .rename(columns={"Contact Id": "old_contact_id"})
    )

    # 4) Needs‑review – only those flagged for manual check
    review_sheet = df.loc[df["resolution_status"] == "needs_review", final_master_cols]

    # 5) Write to Excel
    with pd.ExcelWriter(path, engine="openpyxl") as xl:
        master_sheet.to_excel(xl, sheet_name="master_contacts", index=False)
        change_log.to_excel(xl, sheet_name="change_log",     index=False)
        review_sheet.to_excel(xl, sheet_name="needs_review", index=False)

    logger.info("Wrote dedupe workbook to %s", path.resolve())


In [153]:
# %% Cell 9 – Invoke the pipeline
# Simply call `main()` to run all steps
def main(
    input_path: str | Path = "../data/Duplicate_Contact_Scrub.xlsx",
    output_path: str | Path = "output/deduped_contacts.xlsx",
) -> None:
    """
    Run the whole dedupe pipeline: load → tag → cluster → pick canonicals →
    merge/inactivate by e-mail → export.

    Parameters
    ----------
    input_path : str | Path
        Contact export workbook to read. Defaults to the pilot data file.
    output_path : str | Path
        Workbook to write the results to.
    """
    contacts = load_contacts(input_path)
    tagged = add_comparison_tag(contacts)
    clustered = add_duplicate_cluster_ids(tagged)
    resolved = assign_canonical_records(clustered)
    finalised = apply_email_merge_or_inactivate(resolved)
    export_dedupe_results(finalised, output_path)


In [154]:
# Run the dedupe pipeline end to end; the only visible result is the log line
# reporting where the workbook was written.
main()

INFO:__main__:Wrote dedupe workbook to C:\Users\Elioa\OneDrive\Projects\data-dedup-pilot\notebooks\output\deduped_contacts.xlsx
