<a href="https://www.kaggle.com/code/andrapsrin/datasets-join-evaluation?scriptVersionId=170946806" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np 
import pandas as pd 

!pip install recordlinkage

Collecting recordlinkage
  Downloading recordlinkage-0.16-py3-none-any.whl.metadata (8.1 kB)
Collecting jellyfish>=1 (from recordlinkage)
  Downloading jellyfish-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Downloading recordlinkage-0.16-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.9/926.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jellyfish-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jellyfish, recordlinkage
Successfully installed jellyfish-1.0.3 recordlinkage-0.16


In [2]:
# Test entries for the facebook_website dataset (columns carry the `_f_w` suffix).
fd_wd_csv = '/kaggle/input/accuracy-test/test_fd_wd.csv'
test_fd_wd = pd.read_csv(fd_wd_csv)
test_fd_wd.head()

Unnamed: 0,city_f_w,country_f_w,name_f_w,region_f_w,domain_f_w
0,baton rouge,united states,lsu,louisiana,lsu.edu
1,letchworth,united kingdom,kwik fit,england,kwik-fit.com
2,scarborough,canada,guild inn estate,ontario,guildinnestate.com
3,laval,canada,signature boisbriand,quebec,developpementsignature.com
4,corinth,united states,mrhc,mississippi,mrhc.org


In [3]:
# Test entries for the google dataset (columns carry the `_g` suffix).
# The file name below is spelled 'tets_gd.csv' in the input path; keep it as-is.
gd_csv = '/kaggle/input/accuracy-test/tets_gd.csv'
test_gd = pd.read_csv(gd_csv)
test_gd.head()

Unnamed: 0,city_g,country_g,name_g,region_g,domain_g
0,gosford,australia,spotlight west gosford,new south wales,spotlightstores.com
1,st catharines,canada,heritage christian book store,ontario,bookmanager.com
2,maumee,united states,maumee kindercare,ohio,kindercare.com
3,houston,united states,energy resourcing,texas,energyresourcing.com
4,sanford,united states,scfd station 34,florida,seminolecountyfl.gov


In [4]:
# Ground truth: the expected joined dataset, built by hand by the author from
# the test data (test_fd_wd + test_gd).
ground_truth_csv = '/kaggle/input/accuracy-test/true_pred.csv'
true_pred = pd.read_csv(ground_truth_csv)
true_pred.head()

Unnamed: 0,city,country,name,region,domain
0,gosford,australia,spotlight west gosford,new south wales,spotlightstores.com
1,st catharines,canada,heritage christian book store,ontario,bookmanager.com
2,maumee,united states,maumee kindercare,ohio,kindercare.com
3,houston,united states,energy resourcing,texas,energyresourcing.com
4,sanford,united states,scfd station 34,florida,seminolecountyfl.gov


In [5]:
# index setup for merging

import recordlinkage

# Expose each frame's row position as a named index (index_f_w / index_g) so
# the two sides of a candidate pair can be told apart later.
test_fd_wd = (
    test_fd_wd
    .reset_index()
    .rename(columns={'index': 'index_f_w'})
    .set_index('index_f_w')
)
test_gd = (
    test_gd
    .reset_index()
    .rename(columns={'index': 'index_g'})
    .set_index('index_g')
)

# Indexer that proposes row pairs, blocking on 'domain_f_w' and 'domain_g'.
indexer = recordlinkage.Index()
indexer.block(left_on='domain_f_w', right_on='domain_g')

# Candidate links between the two test frames.
candidates = indexer.index(test_fd_wd, test_gd)

print(f"Number of potential matches: {len(candidates)}")

Number of potential matches: 14


In [6]:
# Score every candidate pair attribute by attribute.
compare = recordlinkage.Compare()

# Exact-match comparisons; a missing value on either side is scored as 1.
for field in ('country', 'city', 'region'):
    compare.exact(f'{field}_f_w', f'{field}_g', label=f'{field}_score', missing_value=1)

# Company names are compared with Jaro-Winkler similarity at a 0.85 threshold.
name_comparison = dict(
    method='jarowinkler',
    threshold=0.85,
    label='name_score',
    missing_value=1,
)
compare.string('name_f_w', 'name_g', **name_comparison)

features = compare.compute(candidates, test_fd_wd, test_gd)

features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,country_score,city_score,region_score,name_score
index_f_w,index_g,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6,1,1,1,0.0
1,5,1,0,1,1.0
2,7,1,1,1,0.0
3,9,1,0,1,1.0
4,10,1,1,1,0.0


In [7]:
# filter and merge
def count_drop_duplicates(dataFrame):
    """Report fully duplicated rows and return the frame without them.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to inspect. It is never modified in place.

    Returns
    -------
    pandas.DataFrame
        ``dataFrame`` itself when it holds no duplicated rows, otherwise a new
        frame with the duplicated rows removed.
    """
    # rows that repeat an earlier row
    duplicates = dataFrame[dataFrame.duplicated()]
    if duplicates.empty:
        print("No duplicates found in final dataset.")
        return dataFrame

    print("Duplicates in final dataset:")
    print(duplicates)

    # drop_duplicates() returns a new frame, so it has to be handed back to the
    # caller; rebinding the local name alone would silently discard the result.
    deduplicated = dataFrame.drop_duplicates()

    print(f"\nNumber of duplicates dropped in final dataset: {len(duplicates)}")
    return deduplicated


# Keep only candidate pairs whose four scores sum to at least 4, then turn the
# (index_f_w, index_g) pair index into ordinary columns for merging.
filtered = features[features.sum(axis=1) >= 4].reset_index()

# Outer merges keep rows from either dataset that have no accepted match.
merged = pd.merge(filtered, test_fd_wd, left_on='index_f_w', right_index=True, how='outer')
merged = pd.merge(merged, test_gd, left_on='index_g', right_index=True, how='outer')

# Assign the result so that dropped duplicates actually leave `merged`.
merged = count_drop_duplicates(merged)

No duplicates found in final dataset.


In [8]:
# drop index and score columns
merged = merged.drop(columns=['index_f_w', 'index_g', 'country_score', 'region_score',
                              'name_score', 'city_score'])

# Collapse each pair of source columns (_f_w and _g) into one column:
# keep the facebook/website value, falling back to the google value when it is null.
# The tuple order fixes the order of the new columns.
common_fields = ('city', 'country', 'region', 'domain', 'name')
for field in common_fields:
    merged[field] = merged[f'{field}_f_w'].combine_first(merged[f'{field}_g'])

# drop the per-source columns (_f_w and _g) now that they are merged
source_columns = [f'{field}{suffix}' for suffix in ('_f_w', '_g') for field in common_fields]
merged = merged.drop(columns=source_columns)

# renumber rows 0..n-1, discarding the index left over from the merges
merged = merged.reset_index(drop=True)

merged.head(50)

Unnamed: 0,city,country,region,domain,name
0,gosford,australia,new south wales,spotlightstores.com,spotlight west gosford
1,st catharines,canada,ontario,bookmanager.com,heritage christian book store
2,maumee,united states,ohio,kindercare.com,maumee kindercare
3,houston,united states,texas,energyresourcing.com,energy resourcing
4,sanford,united states,florida,seminolecountyfl.gov,scfd station 34
5,bradford,united kingdom,england,kwik-fit.com,kwik fit bradford thornbury
6,baton rouge,united states,louisiana,lsu.edu,louisiana state university
7,scarborough,canada,ontario,guildinnestate.com,the guild inn estate
8,stratford,canada,ontario,jarfh.com,james a rutherford funeral home
9,boisbriand,canada,quebec,developpementsignature.com,signature boisbriand


In [9]:
# Put the ground-truth columns in the same order as the merged output.
column_order = ['city', 'country', 'region', 'domain', 'name']
true_pred = true_pred[column_order]
true_pred.head(50)

Unnamed: 0,city,country,region,domain,name
0,gosford,australia,new south wales,spotlightstores.com,spotlight west gosford
1,st catharines,canada,ontario,bookmanager.com,heritage christian book store
2,maumee,united states,ohio,kindercare.com,maumee kindercare
3,houston,united states,texas,energyresourcing.com,energy resourcing
4,sanford,united states,florida,seminolecountyfl.gov,scfd station 34
5,bradford,united kingdom,england,kwik-fit.com,kwik fit bradford thornbury
6,baton rouge,united states,louisiana,lsu.edu,louisiana state university
7,stratford,canada,ontario,jarfh.com,james a rutherford funeral home
8,boisbriand,canada,quebec,developpementsignature.com,signature boisbriand
9,corinth,united states,mississippi,mrhc.org,magnolia regional health center emergency room


In [10]:
# Evaluation of joining algorithm

ground_truth_joined_dataset = true_pred
join_algorithm_final_dataset = merged

# Turn every row into a hashable tuple so the two frames can be compared with
# set operations. Tuples are positional, so both frames must list their columns
# in the same order.
# NOTE(review): rows containing missing values may not compare equal reliably — confirm.
ground_truth_set = set(map(tuple, ground_truth_joined_dataset.to_numpy()))
algorithm_output_set = set(map(tuple, join_algorithm_final_dataset.to_numpy()))

# true positives: rows present in both sets
# false positives: rows only the algorithm produced
# false negatives: ground-truth rows the algorithm did not produce
tp = len(ground_truth_set & algorithm_output_set)
fp = len(algorithm_output_set - ground_truth_set)
fn = len(ground_truth_set - algorithm_output_set)

print("True positive count: ", tp)
print("False positive count: ", fp)
print("False negative count: ", fn)

# compute precision, recall and F1 score (0 when a denominator would be zero)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Round to the nearest percent. int() truncates, so floating-point error such as
# 0.57 * 100 == 56.99999999999999 would be reported one point too low.
print(f"\nPrecision Score: {round(precision * 100)}%")
print(f"Recall Score: {round(recall * 100)}%")
print(f"F1 Score: {round(f1 * 100)}%")

True positive count:  41
False positive count:  4
False negative count:  0

Precision Score: 91%
Recall Score: 100%
F1 Score: 95%
