In [26]:
import pandas as pd 
import numpy as np
from recordlinkage.preprocessing import clean
from recordlinkage.preprocessing import phonetic
from recordlinkage.preprocessing import phonenumbers
import re

In [12]:
df=pd.read_csv("names_adress.csv")

In [13]:
df.head()

Unnamed: 0,name,phone_number,occupation,address
0,1. Rachel Green,1(613)555 0149,buyer (fashion),"90 Bedford Street, Apt 20"
1,2. Ross Geller,+1-613-555-0138,paleontogist,"100 Grove Street, Apartment 16"
2,3. Mönica Geller,16135550185,Chef,"90 Bedford Street, Apt 20"
3,4. Chandler BING,1 613 555 0161,???,"90 Bedford Street, Apt 19"
4,5. Pheobe Buffay,1(613)5550114,musician,"5 Morton Street, Apt. 14"


In [14]:
df["name"]=clean(df["name"])
df["occupation"]=clean(df["occupation"])
df["phone_number"]=phonenumbers(df["phone_number"])

In [20]:
df[["street","house_no"]]=df["address"].str.split(",",expand=True)
df.drop(columns="address",inplace=True)

In [24]:
# Lets replace the empty cell with NaN
df.replace("",np.NaN,inplace=True)

In [27]:
# Lets clean the name using regular expression:
df["name"]=df["name"].apply(lambda x: re.sub(r"\d\s","",x))

In [33]:
df[["first_name","surname"]]=df["name"].str.split(" ",expand=True)
df.drop(columns="name",inplace=True)

In [34]:
df

Unnamed: 0,phone_number,occupation,street,house_no,first_name,surname
0,16135550149,buyer,90 Bedford Street,Apt 20,rachel,green
1,16135550138,paleontogist,100 Grove Street,Apartment 16,ross,geller
2,16135550185,chef,90 Bedford Street,Apt 20,mnica,geller
3,16135550161,,90 Bedford Street,Apt 19,chandler,bing
4,16135550114,musician,5 Morton Street,Apt. 14,pheobe,buffay
5,16135550148,actor,90 Bedford Street,Apt 19,joseph,tribbiani


In [37]:
df["phonetische_kodierung"]=phonetic(df["surname"],"soundex")

In [42]:
df

Unnamed: 0,phone_number,occupation,street,house_no,first_name,surname,phonetische_kodierung
0,16135550149,buyer,90 Bedford Street,Apt 20,rachel,green,G650
1,16135550138,paleontogist,100 Grove Street,Apartment 16,ross,geller,G460
2,16135550185,chef,90 Bedford Street,Apt 20,mnica,geller,G460
3,16135550161,,90 Bedford Street,Apt 19,chandler,bing,B520
4,16135550114,musician,5 Morton Street,Apt. 14,pheobe,buffay,B100
5,16135550148,actor,90 Bedford Street,Apt 19,joseph,tribbiani,T615


##### ``Indexing:``

In [87]:
from recordlinkage import index,datasets,compare
from recordlinkage.preprocessing import phonetic

In [88]:
df=datasets.load_febrl1()

In [89]:
print(df.shape)
df.head()

(1000, 10)


Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-223-org,,waller,6.0,tullaroop street,willaroo,st james,4011,wa,19081209,6988048
rec-122-org,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219,7364009
rec-373-org,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210,2635962
rec-10-dup-0,kayla,harrington,,maltby circuit,coaling,coolaroo,3465,nsw,19150612,9004242
rec-227-org,luke,purdon,23.0,ramsay place,mirani,garbutt,2260,vic,19831024,8099933


In [90]:
df["phonetic_surname"]=phonetic(df["surname"],"soundex",concat=True)
df.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,phonetic_surname
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
rec-223-org,,waller,6.0,tullaroop street,willaroo,st james,4011,wa,19081209,6988048,W460
rec-122-org,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219,7364009,B600
rec-373-org,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210,2635962,S536
rec-10-dup-0,kayla,harrington,,maltby circuit,coaling,coolaroo,3465,nsw,19150612,9004242,H652
rec-227-org,luke,purdon,23.0,ramsay place,mirani,garbutt,2260,vic,19831024,8099933,P635


In [92]:
# This will make me indices where the similarity of the surname are cause I did phonetic on it
indexer=index.Block("phonetic_surname")
canditat=indexer.index(df)
canditat

MultiIndex([('rec-264-dup-0',   'rec-223-org'),
            (  'rec-264-org',   'rec-223-org'),
            (  'rec-264-org', 'rec-264-dup-0'),
            (  'rec-419-org',   'rec-122-org'),
            (  'rec-276-org',   'rec-122-org'),
            (  'rec-276-org',   'rec-419-org'),
            ('rec-276-dup-0',   'rec-122-org'),
            ('rec-276-dup-0',   'rec-419-org'),
            ('rec-276-dup-0',   'rec-276-org'),
            ( 'rec-40-dup-0',   'rec-122-org'),
            ...
            ( 'rec-87-dup-0',    'rec-87-org'),
            (  'rec-104-org', 'rec-104-dup-0'),
            ('rec-310-dup-0',   'rec-310-org'),
            ('rec-298-dup-0',   'rec-298-org'),
            ('rec-345-dup-0',   'rec-345-org'),
            ( 'rec-16-dup-0',    'rec-16-org'),
            (  'rec-318-org', 'rec-318-dup-0'),
            (  'rec-236-org', 'rec-236-dup-0'),
            ('rec-299-dup-0',   'rec-299-org'),
            (  'rec-132-org', 'rec-132-dup-0')],
           names=['rec_