In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install networkx rapidfuzz python-Levenshtein


Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting Levenshtein==0.27.3 (from python-Levenshtein)
  Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_levenshtein-0.27.3-py3-none-any.whl (9.5 kB)
Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully 

In [3]:
#deduplicate pipeline
#install libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_recall_fscore_support
from rapidfuzz import fuzz
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from itertools import combinations
from collections import defaultdict
import math
import re

In [10]:
#Helper functions
def normalize_text(s):
  if pd.isna(s): return ""
  s = str(s).lower()
  s = re.sub(r'[^a-z0-9]+','',s)
  s = re.sub(r'\s+','',s).strip()
  return s

In [11]:
def jaccard_tokens(a,b):
  sa = set(a.split())
  sb = set(b.split())
  if not sa and not sb: return 1.0
  if not sa or not sb: return 0.0
  return len(sa&sb)/len(sa|sb)


In [12]:
#creating sample dataset

def make_demo():
  data = pd.read_csv("/content/drive/MyDrive/DS_Datasets/Buy.csv")
  #data = [
  #    {"id": "1", "title": "Learning from Data", "authors": "Y. Lecun", "year": "2013", "price": "" },
   #     {"id": "2", "title": "Learning from Data", "authors": "Yann LeCun", "year": "2013", "price": "" },
    #    {"id": "3", "title": "Deep Learning", "authors": "I. Goodfellow", "year": "2016", "price": "" },
     #   {"id": "4", "title": "Learning from Data: a practical approach", "authors": "Y. Le Cun", "year": "2013", "price": "" }
  #]
  return pd.DataFrame(data)

In [13]:
df = make_demo()

In [14]:
df.head()

Unnamed: 0,id,name,description,manufacturer,price
0,10011646,Linksys EtherFast EZXS88W Ethernet Switch - EZ...,Linksys EtherFast 8-Port 10/100 Switch (New/Wo...,LINKSYS,
1,10140760,Linksys EtherFast EZXS55W Ethernet Switch,5 x 10/100Base-TX LAN,LINKSYS,
2,10221960,Netgear ProSafe FS105 Ethernet Switch - FS105NA,NETGEAR FS105 Prosafe 5 Port 10/100 Desktop Sw...,Netgear,
3,10246269,Belkin Pro Series High Integrity VGA/SVGA Moni...,1 x HD-15 - 1 x HD-15 - 10ft - Beige,Belkin,
4,10315184,Netgear ProSafe JFS516 Ethernet Switch,Netgear ProSafe 16 Port 10/100 Rackmount Switc...,Netgear,


In [15]:

#df['title_norm'] = df['title'].map(normalize_text)
df['title_norm'] = df['name'].map(normalize_text)

#df['authors_norm'] = df['authors'].map(normalize_text)
df['authors_norm'] = df['description'].map(normalize_text)

In [16]:
df.head()

Unnamed: 0,id,name,description,manufacturer,price,title_norm,authors_norm
0,10011646,Linksys EtherFast EZXS88W Ethernet Switch - EZ...,Linksys EtherFast 8-Port 10/100 Switch (New/Wo...,LINKSYS,,linksysetherfastezxs88wethernetswitchezxs88w,linksysetherfast8port10100switchnewworkgroup
1,10140760,Linksys EtherFast EZXS55W Ethernet Switch,5 x 10/100Base-TX LAN,LINKSYS,,linksysetherfastezxs55wethernetswitch,5x10100basetxlan
2,10221960,Netgear ProSafe FS105 Ethernet Switch - FS105NA,NETGEAR FS105 Prosafe 5 Port 10/100 Desktop Sw...,Netgear,,netgearprosafefs105ethernetswitchfs105na,netgearfs105prosafe5port10100desktopswitch
3,10246269,Belkin Pro Series High Integrity VGA/SVGA Moni...,1 x HD-15 - 1 x HD-15 - 10ft - Beige,Belkin,,belkinproserieshighintegrityvgasvgamonitorexte...,1xhd151xhd1510ftbeige
4,10315184,Netgear ProSafe JFS516 Ethernet Switch,Netgear ProSafe 16 Port 10/100 Rackmount Switc...,Netgear,,netgearprosafejfs516ethernetswitch,netgearprosafe16port10100rackmountswitchjfs516na


In [17]:
# blocking simple token-block by first word of title
blocks = defaultdict(list)
for idx, row in df.iterrows():
  key = (row['title_norm'].split()[0] if row['title_norm'] else "")
  blocks[key].append(row['id'])

In [18]:
blocks

defaultdict(list,
            {'linksysetherfastezxs88wethernetswitchezxs88w': [10011646],
             'linksysetherfastezxs55wethernetswitch': [10140760],
             'netgearprosafefs105ethernetswitchfs105na': [10221960],
             'belkinproserieshighintegrityvgasvgamonitorextensioncablef3h98210': [10246269],
             'netgearprosafejfs516ethernetswitch': [10315184],
             'laciepocketfloppydiskdrive706018': [10316920],
             'canonkp36ipprintcartridgepaperkit7737a001': [10326220],
             'kensingtonorbitopticaltrackballusbwps2adapter64327': [10333368],
             'linksysetherfastef4116ethernetswitch': [10333846],
             'linksysetherfastef4124ethernetswitch': [10333848],
             'linksysinstantgigabiteg005wethernetswitch': [10343605],
             'linksysgwirelessgwet54gwirelessbridge': [10346525],
             'tripplitepowerverter375wattultracompactinverterpv375': [10351869],
             'netgearprosafegs105ethernetswitchgs105na': [103

In [19]:
# Produce candidate pairs with in each block
pairs = []
for key,ids in blocks.items():
  for a, b in combinations(ids, 2):
    pairs.append((a, b))


In [20]:
pairs

[(205561996, 205562000),
 (205985718, 205985719),
 (208114672, 208114673),
 (208114672, 208114674),
 (208114672, 208114675),
 (208114673, 208114674),
 (208114673, 208114675),
 (208114674, 208114675),
 (208117929, 208117930),
 (208117929, 208117931),
 (208117929, 208117932),
 (208117929, 208117933),
 (208117930, 208117931),
 (208117930, 208117932),
 (208117930, 208117933),
 (208117931, 208117932),
 (208117931, 208117933),
 (208117932, 208117933),
 (208117936, 208117937),
 (208117936, 208117938),
 (208117937, 208117938),
 (208156877, 208156878),
 (208156877, 208156879),
 (208156878, 208156879)]

In [21]:
#map id -> row for quick loop
rows = {r['id']: r for r in df.to_dict('records')}

In [22]:
rows

{10011646: {'id': 10011646,
  'name': 'Linksys EtherFast EZXS88W Ethernet Switch - EZXS88W',
  'description': 'Linksys EtherFast 8-Port 10/100 Switch (New/Workgroup)',
  'manufacturer': 'LINKSYS',
  'price': nan,
  'title_norm': 'linksysetherfastezxs88wethernetswitchezxs88w',
  'authors_norm': 'linksysetherfast8port10100switchnewworkgroup'},
 10140760: {'id': 10140760,
  'name': 'Linksys EtherFast EZXS55W Ethernet Switch',
  'description': '5 x 10/100Base-TX LAN',
  'manufacturer': 'LINKSYS',
  'price': nan,
  'title_norm': 'linksysetherfastezxs55wethernetswitch',
  'authors_norm': '5x10100basetxlan'},
 10221960: {'id': 10221960,
  'name': 'Netgear ProSafe FS105 Ethernet Switch - FS105NA',
  'description': 'NETGEAR FS105 Prosafe 5 Port 10/100 Desktop Switch',
  'manufacturer': 'Netgear',
  'price': nan,
  'title_norm': 'netgearprosafefs105ethernetswitchfs105na',
  'authors_norm': 'netgearfs105prosafe5port10100desktopswitch'},
 10246269: {'id': 10246269,
  'name': 'Belkin Pro Series Hig

In [23]:
# ---- Feature generation for each pair ----
def pair_features(a_id, b_id):
    a = rows[a_id]; b = rows[b_id]
    t1 = a['title_norm']; t2 = b['title_norm']
    au1 = a['authors_norm']; au2 = b['authors_norm']
    # Token Jaccard
    f_jacc_title = jaccard_tokens(t1, t2)
    f_jacc_auth = jaccard_tokens(au1, au2)
    # Fuzzy ratios
    f_fuzz_ratio = fuzz.ratio(t1, t2) / 100.0
    f_partial = fuzz.partial_ratio(t1, t2) / 100.0
    # Numeric difference (year)
    try:
        yr1 = int(a.get('year') or 0)
        yr2 = int(b.get('year') or 0)
        f_year_diff = abs(yr1 - yr2)
    except:
        f_year_diff = 999
    # TF-IDF char n-gram (we can compute vectorized outside for scale; using simple char-based sim here)
    # As an approximation: cosine on simple token counts
    vec = TfidfVectorizer(analyzer='char', ngram_range=(2,4))
    # Fit on the two strings only (for demo). In practice fit on entire column once.
    X = vec.fit_transform([t1, t2])
    f_cosine = float(cosine_similarity(X[0], X[1])[0,0])
    return {
        'id_a': a_id, 'id_b': b_id,
        'jacc_title': f_jacc_title,
        'jacc_auth': f_jacc_auth,
        'fuzz_ratio': f_fuzz_ratio,
        'fuzz_partial': f_partial,
        'year_diff': f_year_diff,
        'cosine_char': f_cosine
    }


In [24]:
feature_rows = [pair_features(a,b) for a,b in pairs]
feat_df = pd.DataFrame(feature_rows)
print('Feature_DF')
print(feat_df)

Feature_DF
         id_a       id_b  jacc_title  jacc_auth  fuzz_ratio  fuzz_partial  \
0   205561996  205562000         1.0        0.0         1.0           1.0   
1   205985718  205985719         1.0        1.0         1.0           1.0   
2   208114672  208114673         1.0        1.0         1.0           1.0   
3   208114672  208114674         1.0        1.0         1.0           1.0   
4   208114672  208114675         1.0        1.0         1.0           1.0   
5   208114673  208114674         1.0        1.0         1.0           1.0   
6   208114673  208114675         1.0        1.0         1.0           1.0   
7   208114674  208114675         1.0        1.0         1.0           1.0   
8   208117929  208117930         1.0        1.0         1.0           1.0   
9   208117929  208117931         1.0        1.0         1.0           1.0   
10  208117929  208117932         1.0        1.0         1.0           1.0   
11  208117929  208117933         1.0        1.0         1.0      

In [25]:
# ---- Labels (for supervised datasets) ----
# If dataset provides pair labels, use them. For demo we create labels manually:
# Let's mark pairs (1,2) and (1,4) as matches in demo
labels = []
for r in feature_rows:
  a,b = r['id_a'],r['id_b']
  match = 1 if ( (a=='1' and b in ('2','4')) or (a=='2' and b=='4') ) else 0
  labels.append(match)
feat_df['label'] = labels


In [26]:
feat_df

Unnamed: 0,id_a,id_b,jacc_title,jacc_auth,fuzz_ratio,fuzz_partial,year_diff,cosine_char,label
0,205561996,205562000,1.0,0.0,1.0,1.0,0,1.0,0
1,205985718,205985719,1.0,1.0,1.0,1.0,0,1.0,0
2,208114672,208114673,1.0,1.0,1.0,1.0,0,1.0,0
3,208114672,208114674,1.0,1.0,1.0,1.0,0,1.0,0
4,208114672,208114675,1.0,1.0,1.0,1.0,0,1.0,0
5,208114673,208114674,1.0,1.0,1.0,1.0,0,1.0,0
6,208114673,208114675,1.0,1.0,1.0,1.0,0,1.0,0
7,208114674,208114675,1.0,1.0,1.0,1.0,0,1.0,0
8,208117929,208117930,1.0,1.0,1.0,1.0,0,1.0,0
9,208117929,208117931,1.0,1.0,1.0,1.0,0,1.0,0


In [28]:
#Train/Test
X = feat_df.drop(columns=['id_a','id_b','label'])
y = feat_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.4)
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
#if len(y.unique()) > 1:
 #   X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.4)
  #  clf = RandomForestClassifier(n_estimators=200, random_state=42)
   # clf.fit(X_train, y_train)
    #y_pred = clf.predict(X_test)
    #print(classification_report(y_test, y_pred))
#else:
 # print("Not enough label variety in demo. Use a real ER dataset with pair labels")


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [29]:
# In production: compute features for all blocked candidate pairs and use clf.predict_proba to score.
# For demo, we'll assume threshold 0.5 on clf if present.
if 'clf' in locals():
  probs = clf.predict_proba(X)[:,1]
  feat_df['score'] = probs
else:
  feat_df['score'] = 0.0
  TH = 0.5
  matches = feat_df[feat_df['score'] >= TH]

IndexError: index 1 is out of bounds for axis 1 with size 1

In [23]:
# Build graph and find connected components
G = nx.Graph()
G.add_nodes_from(df['id'].tolist())
for _,row in matches.iterrows():
  G.add_edge(row['id_a'],row['id_b'])
components = list(nx.connected_components(G))
print("Clusters(connected components):",components)

#Deduplication reduction metric
orig_count = df.shape[0]
unique_after = len(components)
reduction = 1 - (unique_after/orig_count)
print(f"Records original:{orig_count}, unique after dedupe:{unique_after}, reduction:{reduction:.2%}")

#save dedup mapping
cluster_map = {}
for i, comp in enumerate(components):
  for rid in comp:
    cluster_map[id] = f"cluster{i}"
df['cluster'] = df['id'].map(cluster_map)
print(df)

Clusters(connected components): [{'1'}, {'2'}, {'3'}, {'4'}]
Records original:4, unique after dedupe:4, reduction:0.00%
  id                                     title        authors  year price  \
0  1                        Learning from Data       Y. Lecun  2013         
1  2                        Learning from Data     Yann LeCun  2013         
2  3                             Deep Learning  I. Goodfellow  2016         
3  4  Learning from Data: a practical approach      Y. Le Cun  2013         

                           title_norm authors_norm cluster  
0                    learningfromdata       ylecun     NaN  
1                    learningfromdata    yannlecun     NaN  
2                        deeplearning  igoodfellow     NaN  
3  learningfromdataapracticalapproach       ylecun     NaN  
