## 第 1 步：导入依赖与工具函数

In [15]:
# ---------- 依赖库导入 ---------- #
import re, time
import pandas as pd
from unidecode import unidecode
from datasketch import MinHash, MinHashLSH
from joblib import Parallel, delayed
from rapidfuzz import fuzz, distance
import jellyfish
from tqdm import tqdm
import matplotlib.pyplot as plt



## 第 2 步：标准化函数 & 阻塞函数定义

In [3]:
# ---------- 数据标准化 ---------- #
def read_df(path):
    return pd.read_csv(path)

def normalize(txt: str) -> str:
    txt = unidecode(txt or "").lower()
    txt = re.sub(r"[^a-z0-9\s]", " ", txt)
    return " ".join(txt.split())

def standardize(df, col):
    tqdm.pandas(desc=f"Normalize {col}")
    df["norm"] = df[col].progress_apply(normalize)
    return df

# ---------- 阻塞方法 ---------- #
def minhash_2g(s):
    m = MinHash(num_perm=128)
    for g in [s[i:i+2] for i in range(len(s)-1)] or [s]:
        m.update(g.encode())
    return m

def block_key_B(s): return f"{s[0]}_{len(s)//4}"
def block_key_C(s): return f"{jellyfish.soundex(s)}_{s[:3]}"

def add_blocks(df):
    tqdm.pandas(desc="Blocking keys")
    df["mh"]   = df["norm"].progress_apply(minhash_2g)
    df["keyB"] = df["norm"].apply(block_key_B)
    df["keyC"] = df["norm"].apply(block_key_C)
    return df


## 第 3 步：构建候选对生成逻辑

In [4]:
# ---------- 候选对生成 ---------- #
def lsh_build(df, tag, thr=0.65):
    lsh = MinHashLSH(threshold=thr, num_perm=128)
    for i, row in df.iterrows():
        lsh.insert(f"{tag}_{i}", row.mh)
    return lsh

def pairs_from_lsh(query, ref, lsh, tag):
    out = []
    for qi, q in tqdm(query.iterrows(), total=len(query), desc=f"L‑query {tag}"):
        for k in lsh.query(q.mh):
            ri = int(k.split('_')[1])
            out.append((qi, ri))
    return out

def pairs_from_key(query, ref, col, tag):
    grp = ref.groupby(col)
    out = []
    for qi, q in tqdm(query.iterrows(), total=len(query), desc=f"Key‑query {tag}:{col}"):
        if q[col] in grp.groups:
            out.extend([(qi, ri) for ri in grp.get_group(q[col]).index])
    return out


## 第 4 步：特征构建 & Top-N 策略

In [5]:
# ---------- 特征构建函数 ---------- #
def build_feature_row(q, r):
    a, b = q.norm, r.norm
    tokA, tokB = set(a.split()), set(b.split())
    feat = dict(
        jw          = fuzz.WRatio(a, b) / 100,
        lev_ratio   = 1 - distance.Levenshtein.normalized_distance(a, b),
        token_set   = fuzz.token_set_ratio(a, b) / 100,
        short_prefix= int(a[:4] == b[:4] and abs(len(a)-len(b)) <= 3),
        soundex_eq  = int(jellyfish.soundex(a) == jellyfish.soundex(b)),
        jaccard     = len(tokA & tokB) / len(tokA | tokB or {""})
    )
    feat.update(id_left=q.ID, id_right=r.ID)
    return feat

def generate_pairs(query, ref, tag, top_n=100):
    lsh = lsh_build(ref, f"{tag}A")
    idx = set(pairs_from_lsh(query, ref, lsh, f"{tag}A") +
              pairs_from_key(query, ref, "keyB", f"{tag}B") +
              pairs_from_key(query, ref, "keyC", f"{tag}C"))
    
    grouped = {}
    for i, j in idx:
        grouped.setdefault(i, []).append(j)

    reduced_pairs = []
    for i, js in grouped.items():
        js_sorted = sorted(js, key=lambda j: fuzz.WRatio(query.loc[i].norm, ref.loc[j].norm), reverse=True)
        for j in js_sorted[:top_n]:
            reduced_pairs.append((i, j))

    feats = Parallel(n_jobs=-1)(
        delayed(build_feature_row)(query.loc[i], ref.loc[j])
        for i, j in tqdm(reduced_pairs, desc=f"Building {tag}")
    )
    return pd.DataFrame(feats)


## 第 5 步：数据标注与采样

In [6]:
# ---------- 标注与采样 ---------- #
def label(df):
    df["Y"] = (df.id_left == df.id_right).astype(int)
    return df

def downsample(df, r=4):
    pos = df[df.Y == 1]
    neg = df[df.Y == 0].sample(min(len(df[df.Y == 0]), r * len(pos)), random_state=42)
    return pd.concat([pos, neg]).sample(frac=1, random_state=42).reset_index(drop=True)


## 第 6 步：加载数据 + 特征提取 + 保存中间文件

In [7]:
# ---------- 加载并保存候选对 ---------- #
P = add_blocks(standardize(read_df("primary.csv"), "NAME"))
A = add_blocks(standardize(read_df("alternate.csv"), "NAME"))
T = add_blocks(standardize(read_df("test_01.csv"), "VARIANT"))

pairs_dedup = pd.concat([
    label(generate_pairs(P, P, "P-P")),
    label(generate_pairs(A, A, "A-A")),
    label(generate_pairs(A, P, "A-P"))
]).drop_duplicates()
pairs_map = label(generate_pairs(T, pd.concat([P, A]).reset_index(drop=True), "T-PA"))

# 保存中间文件，便于重复调参无需重跑前几步
pairs_dedup.to_csv("pairs_dedup.csv", index=False)
pairs_map.to_csv("pairs_map.csv", index=False)


Normalize NAME: 100%|██████████| 16041/16041 [00:00<00:00, 426300.21it/s]
Blocking keys: 100%|██████████| 16041/16041 [00:18<00:00, 866.40it/s]
Normalize NAME: 100%|██████████| 19525/19525 [00:00<00:00, 318656.58it/s]
Blocking keys: 100%|██████████| 19525/19525 [00:22<00:00, 863.96it/s]
Normalize VARIANT: 100%|██████████| 16041/16041 [00:00<00:00, 342141.87it/s]
Blocking keys: 100%|██████████| 16041/16041 [00:20<00:00, 778.60it/s]
L‑query P-PA: 100%|██████████| 16041/16041 [00:01<00:00, 11219.08it/s]
Key‑query P-PB:keyB: 100%|██████████| 16041/16041 [00:03<00:00, 4368.53it/s]
Key‑query P-PC:keyC: 100%|██████████| 16041/16041 [00:03<00:00, 4195.41it/s]
Building P-P: 100%|██████████| 1227137/1227137 [09:23<00:00, 2176.06it/s]
L‑query A-AA: 100%|██████████| 19525/19525 [00:01<00:00, 12105.68it/s]
Key‑query A-AB:keyB: 100%|██████████| 19525/19525 [00:04<00:00, 4070.39it/s]
Key‑query A-AC:keyC: 100%|██████████| 19525/19525 [00:03<00:00, 5913.22it/s]
Building A-A: 100%|██████████| 1560830/15

## 第 7 步：加载缓存文件 + 拟合模型（XGBoost）

In [None]:
# ---------- 加载训练数据 ---------- #
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

pairs_dedup = pd.read_csv("pairs_dedup.csv")
pairs_map   = pd.read_csv("pairs_map.csv")

def downsample(df, r=4):
    pos = df[df.Y == 1]
    neg = df[df.Y == 0].sample(min(len(df[df.Y == 0]), r * len(pos)), random_state=42)
    return pd.concat([pos, neg]).sample(frac=1, random_state=42).reset_index(drop=True)

train_df = downsample(pairs_dedup, r=4)
X, y = train_df.drop(columns=["id_left", "id_right", "Y"]), train_df.Y
val_cut = int(0.2 * len(X))
Xv, yv = X.iloc[:val_cut], y.iloc[:val_cut]
Xt, yt = X.iloc[val_cut:], y.iloc[val_cut:]

# ---------- 模型训练 (Random Forest) ---------- #
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,        
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
).fit(Xt, yt)

# ---------- 模型预测 & 阈值评估 ---------- #
Xm, ym = pairs_map.drop(columns=["id_left", "id_right", "Y"]), pairs_map.Y
prob = clf.predict_proba(Xm)[:, 1]

# 多阈值尝试
for t in [0.8, 0.85, 0.89, 0.9, 0.91, 0.92, 0.95]:
    ypred = (prob >= t).astype(int)
    p, r, f, _ = precision_recall_fscore_support(ym, ypred, average='binary', zero_division=0)
    print(f"Threshold {t:.2f} → Precision {p:.3f} Recall {r:.3f} F1 {f:.3f}")


Threshold 0.80 → Precision 0.487 Recall 0.816 F1 0.610
Threshold 0.85 → Precision 0.637 Recall 0.774 F1 0.699
Threshold 0.89 → Precision 0.825 Recall 0.722 F1 0.770
Threshold 0.90 → Precision 0.864 Recall 0.712 F1 0.780
Threshold 0.91 → Precision 0.892 Recall 0.699 F1 0.784
Threshold 0.92 → Precision 0.924 Recall 0.680 F1 0.783
Threshold 0.95 → Precision 0.950 Recall 0.606 F1 0.740


## 第 8 步：测试评估指标输出

In [11]:
# ---------- 评估预测 ---------- #
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    roc_auc_score,
    average_precision_score
)

thr = 0.91
Xm, ym = pairs_map.drop(columns=["id_left", "id_right", "Y"]), pairs_map.Y
prob = clf.predict_proba(Xm)[:, 1]
ypred = (prob >= thr).astype(int)

p, r, f, _ = precision_recall_fscore_support(ym, ypred, average='binary')
acc = accuracy_score(ym, ypred)
auc = roc_auc_score(ym, prob)
pr_auc = average_precision_score(ym, prob)

print("\n=== Final Mapping Metrics ===")
print(f"Precision {p:.3f} Recall {r:.3f} F1 {f:.3f}")
print(f"Accuracy {acc:.3f} ROC-AUC {auc:.3f} PR-AUC {pr_auc:.3f}")



=== Final Mapping Metrics ===
Precision 0.910 Recall 0.697 F1 0.790
Accuracy 0.995 ROC-AUC 0.990 PR-AUC 0.815


## 第 9 步：Top-1 Accuracy 输出

In [14]:
# ---------- Top‑1 精度输出 ---------- #
pairs_map["prob"] = prob

# 获取每个测试样本（以 id_right 为单位）预测概率最高的候选对
top1 = pairs_map.sort_values("prob", ascending=False).groupby("id_right").head(1)

# Top‑1 Accuracy：每个测试样本的第一候选是否预测为 Y=1（即匹配成功）
top1_acc = top1.eval("Y == 1").mean()
print(f"\nTop‑1 Accuracy: {top1_acc:.3%}")

# 输出运行时间（确保 start 已定义）
# print(f"Total runtime: {time.time() - start:.1f} s")

# 输出分类报告（Precision, Recall, F1 按类别划分）
from sklearn.metrics import classification_report
print("\n=== Classification Report ===")
print(classification_report(ym, ypred, digits=3))



Top‑1 Accuracy: 84.055%

=== Classification Report ===
              precision    recall  f1-score   support

           0      0.996     0.999     0.997   1390884
           1      0.910     0.697     0.790     20336

    accuracy                          0.995   1411220
   macro avg      0.953     0.848     0.893   1411220
weighted avg      0.994     0.995     0.994   1411220

