In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Step 1: Load the data
df = pd.read_csv('dblp_names.csv', on_bad_lines='warn')
print(f"Initial records: {len(df)}")
df.head(10)

Initial records: 45156


Skipping line 281: expected 2 fields, saw 3
Skipping line 512: expected 2 fields, saw 3
Skipping line 839: expected 2 fields, saw 3
Skipping line 854: expected 2 fields, saw 3
Skipping line 1997: expected 2 fields, saw 3
Skipping line 2631: expected 2 fields, saw 3
Skipping line 2642: expected 2 fields, saw 3
Skipping line 2656: expected 2 fields, saw 3
Skipping line 2658: expected 2 fields, saw 3
Skipping line 2659: expected 2 fields, saw 3
Skipping line 6310: expected 2 fields, saw 3
Skipping line 10753: expected 2 fields, saw 3
Skipping line 11044: expected 2 fields, saw 3
Skipping line 18510: expected 2 fields, saw 3
Skipping line 31242: expected 2 fields, saw 3
Skipping line 41259: expected 2 fields, saw 3
Skipping line 41633: expected 2 fields, saw 3
Skipping line 43561: expected 2 fields, saw 3
Skipping line 43569: expected 2 fields, saw 3

  df = pd.read_csv('dblp_names.csv', on_bad_lines='warn')


Unnamed: 0,# OLD_NAME,NEW_NAME
0,Bamshad Mobasherm,Bamshad Mobasher
1,Carole Delporte-Gellet,Carole Delporte-Gallet
2,Ronald Presott Loui,Ronald Prescott Loui
3,Eve Riskin,Eve A. Riskin
4,Doouglas W. Clark,Douglas W. Clark
5,P. R. S. Visser,Pepijn R. S. Visser
6,Pepijn Visser,Pepijn R. S. Visser
7,A. Pietracaprina,Andrea Pietracaprina
8,M. R. Swanson,Mark R. Swanson
9,Matilde Celma-Giménez,Matilde Celma


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45156 entries, 0 to 45155
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   # OLD_NAME  45156 non-null  object
 1    NEW_NAME   45156 non-null  object
dtypes: object(2)
memory usage: 705.7+ KB


In [4]:
all_names = set(df['# OLD_NAME']) | set(df[' NEW_NAME'])
all_names

{'Jean-Pierre Dautricourt',
 'W. L. F. Degen',
 'A. Guénoche',
 'Sung-Eon Cho',
 'Tom Meitzler',
 'Tomasz Luczak',
 'Pavel Tichý',
 'Xuegang Chen',
 'Habib Mehrez',
 'Louis C. Quintas',
 'Manuel J. Maña-López',
 'W. Hesse',
 'Neal R. Harvey',
 'Won-Gyum Kim',
 'Baba Vemuri',
 'Susan McRoy',
 'Martha Crosby',
 'György Vaszil',
 'Debra A. Hope',
 'Saied Hosseini Khayat',
 'Gustovo Alonso',
 'Jonathan F. Bard',
 'Sverre Holm',
 'Chong-You Zheng',
 'Jürg Kohlhas',
 'Umesh Vazirani',
 'Bob Gerber',
 'Krishan K. Sanbani',
 'Zhang Longbo',
 'Tarantilis D. Tarantilis',
 'Mark G. Slack',
 'Lawrence B. Wolff',
 'Oleg T. Balovnev',
 'Paolo Bernardi',
 'Siand Wun Song',
 'Ugo De Carlini',
 'Samee Ullah Khan',
 'E. Okamoto',
 'Saleh Alshebeili',
 'C. Higgins',
 'Ozalp Babaoglu',
 'Trent Jaegar',
 'Tamás Horváth',
 'Arye Nehorai',
 'Keno Albrecht',
 'Chunkai Zhang',
 'Alexander E. Emanuel',
 'Wing-lok Yeung',
 'Laura Mota-Herranz',
 'Jane-Jing Liang',
 'Axel T. Brünger',
 'M. Mazo',
 'Matthew R. Rud

In [5]:
# --- Create name pairs from CSV ---
pairs = list(zip(df['# OLD_NAME'], df[' NEW_NAME']))
pairs[:10]

[('Bamshad Mobasherm', 'Bamshad Mobasher'),
 ('Carole Delporte-Gellet', 'Carole Delporte-Gallet'),
 ('Ronald Presott Loui', 'Ronald Prescott Loui'),
 ('Eve Riskin', 'Eve A. Riskin'),
 ('Doouglas W. Clark', 'Douglas W. Clark'),
 ('P. R. S. Visser', 'Pepijn R. S. Visser'),
 ('Pepijn Visser', 'Pepijn R. S. Visser'),
 ('A. Pietracaprina', 'Andrea Pietracaprina'),
 ('M. R. Swanson', 'Mark R. Swanson'),
 ('Matilde Celma-Giménez', 'Matilde Celma')]

In [6]:
print(f"Total unique mentions: {len(all_names)}")
print(f"Total known pairs: {len(pairs)}")

Total unique mentions: 80279
Total known pairs: 45156


In [7]:
# Strategy S1: Extract surname (last part after space)
def extract_surname(name):
    return name.strip().split()[-1]

# Strategy S2: Extract initials (first letter of each word, uppercased)
def extract_initials(name):
    return ''.join(part[0].upper() for part in name.strip().split())

In [8]:
# Extract surnames from names
surnames = [extract_surname(name) for name in all_names]

# Display a few samples
print("🔹 Sample extracted surnames:")
for surname in list(surnames)[:10]:  # Show only first 10
    print("-", surname)

🔹 Sample extracted surnames:
- Dautricourt
- Degen
- Guénoche
- Cho
- Meitzler
- Luczak
- Tichý
- Chen
- Mehrez
- Quintas


In [9]:
# Extract initials from names
initials = [extract_initials(name) for name in all_names]

# Display a few samples
print("🔹 Sample extracted initials:")
for initial in list(initials)[:10]:  # Show only first 10
    print("-", initial)

🔹 Sample extracted initials:
- JD
- WLFD
- AG
- SC
- TM
- TL
- PT
- XC
- HM
- LCQ


In [10]:
# Build blocks using a chosen strategy function
def build_blocks(names, strategy_func):
    blocks = defaultdict(list)
    for name in names:
        key = strategy_func(name)
        blocks[key].append(name)
    return blocks

# Count pairwise comparisons required within blocks
def count_pairwise_comparisons(blocks):
    return sum(len(block) * (len(block) - 1) // 2 for block in blocks.values())

# Compute recall: how many known pairs fall into same block
def compute_recall(pairs, blocks):
    block_lookup = defaultdict(set)
    for key, block_names in blocks.items():
        for name in block_names:
            block_lookup[name].add(key)

    same_block_count = sum(1 for a, b in pairs if block_lookup[a] & block_lookup[b])
    return same_block_count / len(pairs)

# Compute savings: how many pairwise comparisons are avoided
def compute_saving(num_mentions, blocked_comparisons):
    total_possible = num_mentions * (num_mentions - 1) / 2
    return 1 - (blocked_comparisons / total_possible)

In [11]:
# Apply Strategy S1
s1_blocks = build_blocks(all_names, extract_surname)
s1_comparisons = count_pairwise_comparisons(s1_blocks)
s1_recall = compute_recall(pairs, s1_blocks)
s1_saving = compute_saving(len(all_names), s1_comparisons)

print(f"🔷 Strategy S1: Surname Blocking")
print(f"Recall: {s1_recall:.4f}")
print(f"Saving: {s1_saving:.4f}")
s1_blocks

🔷 Strategy S1: Surname Blocking
Recall: 0.7632
Saving: 0.9996


defaultdict(list,
            {'Dautricourt': ['Jean-Pierre Dautricourt', 'J. P. Dautricourt'],
             'Degen': ['W. L. F. Degen', 'Wendelin L. F. Degen'],
             'Guénoche': ['A. Guénoche'],
             'Cho': ['Sung-Eon Cho',
              'Soen-Ku Cho',
              'Ikhwan Cho',
              'Ho-sik Cho',
              'Yong-Yun Cho',
              'Yong Soo Cho',
              'Jung-Wan Cho',
              'Hyun-sang Cho',
              'KwangMoon Cho',
              'Nam Ik Cho',
              'Hwan Gue Cho',
              'Kyoung-Rok Cho',
              'TaeHo Cho',
              'W. Cho',
              'Jun-Ki Cho',
              'Dong-Ho Cho',
              'Grun Rae Cho',
              'Yong-Soo Cho',
              'Yong J. Cho',
              'S. H. Cho',
              'S.-B. Cho',
              'SeongJe Cho',
              'Jaeweon Cho',
              'Wonjoon Cho',
              'Sung-Bae Cho',
              'Hyung-Sang Cho',
              'Seon-ku Cho',
   

In [12]:
# Apply Strategy S2
s2_blocks = build_blocks(all_names, extract_initials)
s2_comparisons = count_pairwise_comparisons(s2_blocks)
s2_recall = compute_recall(pairs, s2_blocks)
s2_saving = compute_saving(len(all_names), s2_comparisons)

print(f"🔷 Strategy S2: Initials Blocking")
print(f"Recall: {s2_recall:.4f}")
print(f"Saving: {s2_saving:.4f}")

🔷 Strategy S2: Initials Blocking
Recall: 0.6529
Saving: 0.9988
