# Entity Resolution - Simple Example

## Loading and Splitting the Dataset

In [31]:
import pandas as pd
# Load the restaurant records from a CSV file located relative to the notebook.
data = pd.read_csv("./data/restaurants.csv")

In [32]:
# Preview the first three rows of the loaded table.
data.head(3)

Unnamed: 0,name,address,city,phone,category,cluster
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,310/246-1501,american,0
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,310-246-1501,steakhouses,0
2,art's delicatessen,12224 ventura blvd.,studio city,818/762-1221,american,1


In [65]:
# Split the table: X holds every column except the label, y holds only the
# "cluster" label, kept as a one-column DataFrame.
X = data.drop(columns="cluster")
y = data[["cluster"]]

## Blocking
- Attribute Equivalence (phone)
- Prefix (name)
- Suffix (name)
- Soundex (address)

In [66]:
import jellyfish

def feature_generation(df):
    """Return a copy of ``df`` with the extra columns used as blocking keys.

    The input frame is not modified. The copy differs as follows:

    - ``phone``: every non-digit character is removed.
    - ``name_prefix``: first three characters of ``name``.
    - ``name_suffix``: last three characters of ``name``.
    - ``address_soundex``: ``jellyfish.soundex`` of ``address``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``phone``, ``name`` and ``address``.

    Returns
    -------
    pandas.DataFrame
        The augmented copy.
    """
    df = df.copy()
    # regex=True is spelled out on purpose: when it is omitted, newer pandas
    # treats r'\D' as a literal string, nothing is stripped, and differently
    # punctuated phone numbers no longer block together.
    df["phone"] = df["phone"].str.replace(r'\D', '', regex=True)
    df["name_prefix"] = df["name"].str[0:3]
    df["name_suffix"] = df["name"].str[-3:]
    # NOTE(review): assumes every address is a string (no missing values) -- confirm.
    df["address_soundex"] = df["address"].apply(jellyfish.soundex)
    return df
    

In [67]:
from recordlinkage.index import Block

def count_candidate_pairs(attr, df):
    """Print the shape of the candidate-pair index obtained by blocking on ``attr``.

    Parameters
    ----------
    attr : str
        Name of the column of ``df`` used as the blocking key.
    df : pandas.DataFrame
        Records to index.
    """
    pairs = Block(attr).index(df)
    print(f"{attr}: {pairs.shape} pairs")
    

In [68]:
# Derive the blocking-key columns once, then report the number of candidate
# pairs each key produces when it is used on its own.
X_df_blocking = feature_generation(X)

for attr in ("phone", "name_prefix", "name_suffix", "address_soundex"):
    count_candidate_pairs(attr=attr, df=X_df_blocking)

phone: (122,) pairs
name_prefix: (1675,) pairs
name_suffix: (2764,) pairs
address_soundex: (2989,) pairs


  df["phone"] = df["phone"].str.replace(r'\D', '') # Remove punctuation from 'phone'


In [69]:
import recordlinkage as rl

# Combine one Block rule per blocking key into a single index object.
indexer = rl.Index(
    [
        Block(key)
        for key in ("phone", "name_prefix", "name_suffix", "address_soundex")
    ]
)

In [70]:
# Build the candidate pairs from the frame that carries the blocking-key columns.
candidate_pairs = indexer.index(X_df_blocking)

In [71]:
# Report how many candidate pairs the combined index produced.
print(f"Number of candidate pairs {len(candidate_pairs)}")

Number of candidate pairs 7177


## Comparing
- name (Jaro-Winkler)
- address (Jaro-Winkler)
- city (Jaro-Winkler)
- phone (Jaro-Winkler)
- category (Jaro-Winkler)

In [72]:
from recordlinkage.compare import Exact, String
from recordlinkage.datasets import load_febrl1

# Score every compared column with the same Jaro-Winkler string similarity;
# each result column is labelled with the name of the column it compares.
comparer = rl.Compare(
    [
        String(column, column, method="jarowinkler", label=column)
        for column in ("name", "address", "city", "phone", "category")
    ]
)

In [73]:
# Compute the similarity features for every candidate pair.
# NOTE(review): the features are computed on the raw `X`, so `phone` is compared
# with its original punctuation, whereas blocking used the digit-only `phone`
# from `X_df_blocking` -- confirm this is intended.
comparison_vector = comparer.compute(candidate_pairs, X)

In [74]:
# Preview the similarity features of the first three candidate pairs.
comparison_vector.head(3)

Unnamed: 0,Unnamed: 1,name,address,city,phone,category
1,0,1.0,0.991304,1.0,0.918687,0.310606
3,2,0.911111,1.0,1.0,0.918687,0.55
5,4,0.589744,1.0,1.0,0.918687,1.0


## Train Classifier

In [99]:
# NOTE(review): binarize=0.1 is a hard-coded value; presumably it is the cut-off
# used to turn the similarity scores into 0/1 features -- confirm against the
# ECMClassifier documentation and consider naming it as a constant.
cl = rl.ECMClassifier(binarize=0.1)

# Only the comparison vectors are passed in; no match labels are supplied here.
y_pred = cl.fit_predict(comparison_vector)

In [None]:
### Prepare target values
# RecordLinkage: the true matches are built below as a pandas MultiIndex of record pairs.

In [91]:
import numpy as np
        
def generate_true_matches(df):
    """Build the index of true matching record pairs from cluster labels.

    Records that share a value in the ``cluster`` column are treated as the
    same entity; every unordered pair inside such a group becomes one match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cluster`` column; its index supplies the record labels.

    Returns
    -------
    pandas.MultiIndex
        Two levels named ``rec_1`` and ``rec_2``, one entry per matching pair,
        with ``rec_1 < rec_2``. Empty when no cluster has more than one record.
    """
    pair_blocks = []
    for _, group in df.groupby("cluster"):  # one group per entity
        if group.shape[0] > 1:
            index = group.index
            # Every (a, b) combination of the group's labels, one pair per row.
            combinations = np.array(np.meshgrid(index, index)).T.reshape(-1, 2)
            # Keeping only a < b drops self-pairs and mirrored duplicates.
            pair_blocks.append(combinations[combinations[:, 0] < combinations[:, 1]])

    # Groups contribute different numbers of pairs (1 for two records, 3 for
    # three, ...), so the per-group blocks are stacked row-wise instead of
    # being packed into one nested list, which would be ragged.
    if pair_blocks:
        pairs = np.concatenate(pair_blocks, axis=0)
    else:
        pairs = np.empty((0, 2), dtype=int)
    return pd.MultiIndex.from_arrays([pairs[:, 0], pairs[:, 1]], names=('rec_1', 'rec_2'))

In [92]:
# Derive the true matching pairs from the cluster labels and preview the first three.
train_true_matches = generate_true_matches(df=y)
train_true_matches[:3]

MultiIndex([(0, 1),
            (2, 3),
            (4, 5)],
           names=['rec_1', 'rec_2'])