### Preprocessing

In [1]:
# `clean` is imported from `recordlinkage.preprocessing`, the same module the
# Febrl1 section of this notebook uses, instead of the legacy `standardise` path.
from recordlinkage.preprocessing import clean
import pandas as pd

df = pd.read_csv('../Data/names_adress.csv')

# Default cleaning. Judging by the displayed result, this lower-cases the text,
# strips punctuation and drops bracketed text such as "(Joey)".
# NOTE(review): non-ASCII letters are dropped as well ("Mönica" -> "mnica");
# consider the `strip_accents` option if that is not wanted - TODO confirm.
df["name_clean_default"] = clean(df["name"])

# Clean the `occupation` column but keep the text that sits inside brackets
# (`remove_brackets=False`). The bracket characters themselves are still removed
# by the `replace_by_none` pattern, so "buyer (fashion)" becomes "buyer fashion".
df["occupation_clean"] = clean(df["occupation"],
                               replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                               remove_brackets=False)

In [2]:
# Show the frame so the raw `name` / `occupation` columns can be compared with
# their cleaned counterparts.
df

Unnamed: 0,name,phone_number,occupation,address,name_clean_default,occupation_clean
0,1. Rachel Green,1(613)555 0149,buyer (fashion),"90 Bedford Street, Apt 20",1 rachel green,buyer fashion
1,2. Ross Geller,+1-613-555-0138,paleontogist,"100 Grove Street, Apartment 16",2 ross geller,paleontogist
2,3. Mönica Geller,16135550185,Chef,"90 Bedford Street, Apt 20",3 mnica geller,chef
3,4. Chandler BING,1 613 555 0161,???,"90 Bedford Street, Apt 19",4 chandler bing,
4,5. Pheobe Buffay,1(613)5550114,musician,"5 Morton Street, Apt. 14",5 pheobe buffay,musician
5,6. Joseph (Joey) Tribbiani,1(613)555-0148,actor,"90 Bedford Street, Apt 19",6 joseph tribbiani,actor


### Indexing

In [3]:
import pandas as pd
from recordlinkage import index

# Two short lists of first names used as toy input for the indexing examples
names_1 = ['alfred', 'bob', 'calvin', 'hobbes', 'rusty']
names_2 = ['alfred', 'danny', 'callum', 'hobie', 'rusty']

# Wrap each list in a one-column DataFrame (column name: 'names')
df_a = pd.DataFrame(names_1, columns=['names'])
df_b = pd.DataFrame(names_2, columns=['names'])

In [7]:
# Draw as many random candidate pairs as there are rows in `df_a`.
# A fixed `random_state` makes the sampled pairs identical on every
# Restart & Run All; without it each execution yields a different sample.
# The same pair may be drawn more than once (the saved output contains a
# repeated pair), i.e. sampling is with replacement unless `replace=False`.
indexer = index.Random(n=df_a.shape[0], random_state=42)

In [9]:
# Generate candidate pairs between `df_a` and `df_b`; the result is displayed
# as a pandas MultiIndex of index-label pairs.
indexer.index(df_a, df_b)

MultiIndex([(2, 2),
            (0, 3),
            (3, 0),
            (3, 1),
            (3, 1)],
           )

### Febrl1 dataset

In [53]:
from recordlinkage import datasets

In [54]:
# Load the Febrl1 example dataset shipped with recordlinkage.
# NOTE: this rebinds `df`, which held the names/addresses frame in the
# Preprocessing section; that frame is no longer reachable after this cell.
df = datasets.load_febrl1()

In [55]:
# List the column labels of the Febrl1 frame as a plain Python list
list(df.columns)

['given_name',
 'surname',
 'street_number',
 'address_1',
 'address_2',
 'suburb',
 'postcode',
 'state',
 'date_of_birth',
 'soc_sec_id']

In [59]:
from recordlinkage import preprocessing
# Add a Soundex encoding of the surname; it is used as the blocking key in the
# next step.
df['phonetic_surname'] = preprocessing.phonetic(
    df['surname'],
    method='soundex',
    concat=True,
)

In [60]:
# Blocking: only records that share the same `phonetic_surname` value become
# candidate pairs.
indexer = index.Block(left_on='phonetic_surname')

# A single frame is passed, so pairs are formed between records of `df` itself.
candidates = indexer.index(df)

In [65]:
from recordlinkage import Compare
# Configure one comparison per attribute; each `label` becomes a column of the
# feature table computed from this object.
comp = Compare()

# Fuzzy string similarity for the name fields
comp.string(
    left_on='given_name',
    right_on='given_name',
    method='jarowinkler',
    label='given_name',
)
comp.string(
    left_on='surname',
    right_on='surname',
    method='levenshtein',
    label='surname',
)

# Exact agreement on the state
comp.exact(left_on='state', right_on='state', label='state')

# Fuzzy string similarity for the first address line
comp.string(
    left_on='address_1',
    right_on='address_1',
    method='levenshtein',
    label='address_1',
)

<Compare>

In [66]:
# Run every configured comparison on the candidate pairs. Only `df` is passed
# because the candidate pairs were built from that single frame.
features = comp.compute(candidates, df)

In [67]:
# Inspect the comparison table: one row per candidate pair, one column per
# labelled comparison.
features

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,state,address_1
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rec-264-dup-0,rec-223-org,0.000000,0.833333,0,0.368421
rec-264-org,rec-223-org,0.000000,0.833333,0,0.333333
rec-264-org,rec-264-dup-0,0.000000,1.000000,1,0.947368
rec-419-org,rec-122-org,0.595238,1.000000,0,0.500000
rec-276-org,rec-122-org,0.619048,1.000000,0,0.500000
...,...,...,...,...,...
rec-16-dup-0,rec-16-org,1.000000,1.000000,1,0.818182
rec-318-org,rec-318-dup-0,1.000000,1.000000,1,0.923077
rec-236-org,rec-236-dup-0,1.000000,1.000000,1,1.000000
rec-299-dup-0,rec-299-org,1.000000,0.875000,1,1.000000


In [68]:
# Total similarity per candidate pair = sum of the individual comparison columns.
# Any existing `score` column is dropped before summing so that re-running this
# cell does not add the previous total into the new one (the cell is idempotent).
features['score'] = features.drop(columns='score', errors='ignore').sum(axis=1)

In [69]:
# Display the feature table again, now including the summed `score` column.
features

Unnamed: 0_level_0,Unnamed: 1_level_0,given_name,surname,state,address_1,score
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rec-264-dup-0,rec-223-org,0.000000,0.833333,0,0.368421,1.201754
rec-264-org,rec-223-org,0.000000,0.833333,0,0.333333,1.166667
rec-264-org,rec-264-dup-0,0.000000,1.000000,1,0.947368,2.947368
rec-419-org,rec-122-org,0.595238,1.000000,0,0.500000,2.095238
rec-276-org,rec-122-org,0.619048,1.000000,0,0.500000,2.119048
...,...,...,...,...,...,...
rec-16-dup-0,rec-16-org,1.000000,1.000000,1,0.818182,3.818182
rec-318-org,rec-318-dup-0,1.000000,1.000000,1,0.923077,3.923077
rec-236-org,rec-236-dup-0,1.000000,1.000000,1,1.000000,4.000000
rec-299-dup-0,rec-299-org,1.000000,0.875000,1,1.000000,3.875000
