# Entity Resolution - Simple Example

## Loading and Splitting the Dataset

In [58]:
import pandas as pd
data = pd.read_csv("./data/restaurants.csv")

In [59]:
data.index = ["rec_" + str(i) for i in data.index] # Change index
data.index.name = "rec"

In [60]:
data.head(3)

Unnamed: 0_level_0,name,address,city,phone,category,cluster
rec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rec_0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,310/246-1501,american,0
rec_1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,310-246-1501,steakhouses,0
rec_2,art's delicatessen,12224 ventura blvd.,studio city,818/762-1221,american,1


In [61]:
X = data.drop("cluster", axis=1)
y = data["cluster"].to_frame()

## Blocking
- Attribute Equialent (phone)
- Prefix (name)
- Suffix (name)
- Soundex (address)

In [62]:
import jellyfish

def feature_generation(df):
    df = df.copy()
    df["phone"] = df["phone"].str.replace(r'\D', '') # Remove punctuation from 'phone'
    df["name_prefix"] = df["name"].str[0:3]
    df["name_suffix"] = df["name"].str[-3:]
    df["address_soundex"] = df["address"].apply(lambda x: jellyfish.soundex(x))
    return df
    

In [63]:
from recordlinkage.index import Block

def count_candidate_pairs(attr, df):
    blk_phone = Block(attr)
    candidate_pairs = blk_phone.index(df)
    print(f"{attr}: {candidate_pairs.shape} pairs")
    

In [64]:
X_df_blocking = feature_generation(X)

for attr in ["phone", "name_prefix", "name_suffix", "address_soundex"]:
    count_candidate_pairs(attr, X_df_blocking)

phone: (122,) pairs
name_prefix: (1675,) pairs
name_suffix: (2764,) pairs
address_soundex: (2989,) pairs


  df["phone"] = df["phone"].str.replace(r'\D', '') # Remove punctuation from 'phone'


In [141]:
import recordlinkage as rl

indexer = rl.Index(
    [
        Block("phone"),
        Block("name_prefix"),
        Block("name_suffix"),
        Block("address_soundex"),
    ]
)

In [142]:
candidate_pairs = indexer.index(X_df_blocking)

In [143]:
# print(f"Number of candidate pairs {candidate_pairs.shape[0]}")

## Comparing
- name (Jaro-Winkler)
- address (Jaccard)
- city (Jaro-Winkler)
- phone (Jaro-Winkler)
- category (Jaro-Winkler)

In [180]:
from recordlinkage.compare import Exact, String
from recordlinkage.datasets import load_febrl1

comparer = rl.Compare(
    [
        # String("name", "name", method="jarowinkler", label="name"),
        # String("address", "address", method="jarowinkler", label="address"),
        # String("city", "city", method="jarowinkler", label="city"),
    #     String("phone", "phone", method="jarowinkler", label="phone"),
    #     String("category", "category", method="jarowinkler", label="category"),
    ]
)

In [181]:
comparison_vector = comparer.compute(candidate_pairs, X)

In [182]:
len(comparison_vector)

7177

In [183]:
comparison_vector.sample(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,city
rec_1,rec_2,Unnamed: 2_level_1
rec_722,rec_373,0.923077
rec_596,rec_207,1.0
rec_748,rec_353,0.472222
rec_835,rec_197,1.0
rec_389,rec_293,1.0
rec_352,rec_211,0.535256
rec_788,rec_442,0.0
rec_768,rec_47,0.292023
rec_423,rec_211,0.535256
rec_738,rec_441,0.923077


## Train Classifier

In [184]:
ecm = rl.ECMClassifier(binarize=0.1)

y_pred = ecm.fit_predict(comparison_vector)

In [185]:
len(y_pred)

0

### Prepare target values
RecordLinkage 

In [83]:
import numpy as np
        
def generate_true_matches(df):
    true_matches = []
    for _, group in df.groupby("cluster"): # Groups record matches
        if group.shape[0] >1:
            index = group.index
            combinations = np.array(np.meshgrid(index, index)).T.reshape(-1, 2) # Generate every combination of pairs
            unique_combinations = combinations[combinations[:, 0] > combinations[:, 1]] # Select only unique combinations
            true_match = unique_combinations.tolist()
            true_matches.append(true_match)
    true_matches_np = np.array(true_matches).reshape(-1,2).T
    return pd.MultiIndex.from_arrays(true_matches_np, names=('rec_1', 'rec_2'))

In [84]:
y_multi_index = generate_true_matches(y)
y_multi_index[0:10]

MultiIndex([( 'rec_1',  'rec_0'),
            ( 'rec_3',  'rec_2'),
            ( 'rec_5',  'rec_4'),
            ( 'rec_7',  'rec_6'),
            ( 'rec_9',  'rec_8'),
            ('rec_11', 'rec_10'),
            ('rec_13', 'rec_12'),
            ('rec_15', 'rec_14'),
            ('rec_17', 'rec_16'),
            ('rec_19', 'rec_18')],
           names=['rec_1', 'rec_2'])

In [46]:
y_pred

MultiIndex([(  'rec_rec_rec_1',   'rec_rec_rec_0'),
            ('rec_rec_rec_100',  'rec_rec_rec_90'),
            ('rec_rec_rec_100',  'rec_rec_rec_91'),
            ('rec_rec_rec_101', 'rec_rec_rec_100'),
            ('rec_rec_rec_101',  'rec_rec_rec_90'),
            ('rec_rec_rec_101',  'rec_rec_rec_91'),
            ('rec_rec_rec_103', 'rec_rec_rec_102'),
            ('rec_rec_rec_103',  'rec_rec_rec_87'),
            ('rec_rec_rec_104',  'rec_rec_rec_48'),
            ('rec_rec_rec_104',  'rec_rec_rec_49'),
            ...
            ( 'rec_rec_rec_96',  'rec_rec_rec_32'),
            ( 'rec_rec_rec_96',  'rec_rec_rec_33'),
            ( 'rec_rec_rec_97',  'rec_rec_rec_32'),
            ( 'rec_rec_rec_97',  'rec_rec_rec_33'),
            ( 'rec_rec_rec_97',  'rec_rec_rec_96'),
            ( 'rec_rec_rec_98',  'rec_rec_rec_56'),
            ( 'rec_rec_rec_98',  'rec_rec_rec_57'),
            ( 'rec_rec_rec_99',  'rec_rec_rec_56'),
            ( 'rec_rec_rec_99',  'rec_rec_rec_57

In [47]:
### Evaluate

In [48]:
tot_pairs = (len(X)*(len(X)-1))/2
cm = rl.confusion_matrix(y_multi_index, y_pred, total=tot_pairs)

In [49]:
pd.DataFrame(cm, columns=["Predicted Positives", "Predicted Negatives"], index=["True Positives", "True Negatives"])

Unnamed: 0,Predicted Positives,Predicted Negatives
True Positives,112,0
True Negatives,7065,365639
