# Comparing strings

In [7]:
from thefuzz import fuzz
from thefuzz import process
import pandas as pd

In [3]:
# Score a misspelled word against its correct spelling with thefuzz's WRatio scorer.
fuzz.WRatio("Reeding", "Reading")

86

In [9]:
# Query that each candidate label is scored against.
string = 'Houston Rockets vs Los Angeles Lakers'

# Candidates are held in a Series, so each hit also carries its index label
# as a third tuple element.
choice = pd.Series([
    'Rockets vs Lakers',
    'Lakers vs Rockets',
    'Houston vs Los Angeles',
    'Heat vs Bulls',
])

# Return at most the two best-scoring candidates.
process.extract(query=string, choices=choice, limit=2)


[('Rockets vs Lakers', 86, 0), ('Lakers vs Rockets', 86, 1)]

In [16]:
# Read the uncleaned restaurant listing from disk.
restaurants = pd.read_csv(filepath_or_buffer='data/restaurants_L2_dirty.csv')

# Collect the distinct values of the 'type' column so spelling variants can be inspected.
unique_types = pd.unique(restaurants['type'])
unique_types

array(['american', 'californian', 'japanese', 'cajun/creole', 'hot dogs',
       'diners', 'delis', 'hamburgers', 'seafood', 'italian',
       'coffee shops', 'russian', 'steakhouses', 'mexican/tex-mex',
       'noodle shops', 'mexican', 'middle eastern', 'asian', 'vietnamese',
       'health food', 'american ( new )', 'pacific new wave',
       'indonesian', 'eclectic', 'chicken', 'fast food', 'southern/soul',
       'coffeebar', 'continental', 'french ( new )', 'desserts',
       'chinese', 'pizza'], dtype=object)

In [20]:
# Rank every distinct cuisine label against each target category.
# limit=len(unique_types) asks for as many results as there are labels, so the
# full ranking is printed rather than a top-N cut.
for target_type in ('asian', 'american', 'italian'):
    print(process.extract(target_type, unique_types, limit=len(unique_types)))

[('asian', 100), ('indonesian', 80), ('californian', 68), ('italian', 67), ('russian', 67), ('american', 62), ('japanese', 54), ('mexican/tex-mex', 54), ('american ( new )', 54), ('mexican', 50), ('fast food', 45), ('middle eastern', 43), ('steakhouses', 40), ('pacific new wave', 40), ('pizza', 40), ('diners', 36), ('cajun/creole', 36), ('vietnamese', 36), ('continental', 36), ('seafood', 33), ('chicken', 33), ('chinese', 33), ('hot dogs', 30), ('hamburgers', 30), ('coffee shops', 30), ('noodle shops', 30), ('southern/soul', 30), ('desserts', 30), ('eclectic', 26), ('coffeebar', 26), ('health food', 22), ('french ( new )', 22), ('delis', 20)]
[('american', 100), ('american ( new )', 90), ('mexican', 80), ('mexican/tex-mex', 72), ('asian', 62), ('italian', 53), ('russian', 53), ('californian', 53), ('middle eastern', 51), ('southern/soul', 47), ('pacific new wave', 45), ('hamburgers', 44), ('indonesian', 44), ('cajun/creole', 42), ('chicken', 40), ('pizza', 40), ('japanese', 38), ('ecle

In [29]:

# Canonical cuisine labels that near-identical spellings are collapsed onto.
categories = ['italian', 'american', 'asian']

# Minimum match score for a value to be treated as a variant of a category.
MIN_SIMILARITY = 86

for cuisine in categories:
    # Score each distinct value once; scoring every row repeats the same
    # comparison for every duplicate label.
    distinct_types = restaurants['type'].unique()
    matches = process.extract(cuisine, distinct_types, limit=len(distinct_types))

    # Each match is a tuple whose first item is the label and second the score.
    variants = [match[0] for match in matches if match[1] >= MIN_SIMILARITY]

    # Single vectorised write per category instead of one write per matching row.
    restaurants.loc[restaurants['type'].isin(variants), 'type'] = cuisine

# Show the remaining distinct labels to confirm the variants were merged.
restaurants['type'].unique()

array(['american', 'californian', 'japanese', 'cajun/creole', 'hot dogs',
       'diners', 'delis', 'hamburgers', 'seafood', 'italian',
       'coffee shops', 'russian', 'steakhouses', 'mexican/tex-mex',
       'noodle shops', 'mexican', 'middle eastern', 'asian', 'vietnamese',
       'health food', 'pacific new wave', 'indonesian', 'eclectic',
       'chicken', 'fast food', 'southern/soul', 'coffeebar',
       'continental', 'french ( new )', 'desserts', 'chinese', 'pizza'],
      dtype=object)

# Generating pairs

In [34]:
import recordlinkage

# Second restaurant listing, to be linked against `restaurants`.
restaurants_new = pd.read_csv(filepath_or_buffer='data/restaurants_new.csv')

In [35]:
# Build the candidate-pair generator and register a blocking rule on 'type'.
indexer = recordlinkage.Index()
indexer.block(left_on='type')

# Candidate pairs: first element indexes `restaurants`, second `restaurants_new`.
pairs = indexer.index(restaurants, restaurants_new)

In [37]:
# Display the candidate (restaurants, restaurants_new) index pairs produced by the indexer.
pairs

MultiIndex([( 0,  0),
            ( 0,  1),
            ( 0,  6),
            ( 0, 12),
            ( 0, 19),
            ( 0, 20),
            ( 0, 25),
            ( 0, 27),
            ( 0, 30),
            ( 0, 44),
            ...
            (81, 12),
            (81, 19),
            (81, 20),
            (81, 25),
            (81, 27),
            (81, 30),
            (81, 44),
            (81, 51),
            (81, 53),
            (81, 64)],
           length=384)

In [39]:
# Comparison spec that is later evaluated for each candidate pair.
comp_cl = recordlinkage.Compare()

# 'city' and 'type' are compared for exact equality, each under its own label.
for column in ('city', 'type'):
    comp_cl.exact(column, column, label=column)

# 'name' is compared as a string with a 0.8 threshold.
# Kept as the cell's final expression so the Compare object is echoed.
comp_cl.string('name', 'name', label='name', threshold=0.8)

<Compare>

In [40]:
# Evaluate the configured comparisons for every candidate pair; the result has
# one row per pair and one column per comparison label.
potential_matches = comp_cl.compute(pairs, x=restaurants, x_link=restaurants_new)
print(potential_matches)

       city  type  name
0  0      1     1   1.0
   1      0     1   0.0
   6      0     1   0.0
   12     0     1   0.0
   19     0     1   0.0
...     ...   ...   ...
81 30     0     1   0.0
   44     0     1   0.0
   51     0     1   0.0
   53     0     1   0.0
   64     0     1   1.0

[384 rows x 3 columns]


# Linking DataFrames

In [41]:
# Keep only the pairs whose comparison columns (city, type, name) sum to at least 3.
matches = potential_matches.loc[potential_matches.sum(axis=1).ge(3)]
print(matches)

       city  type  name
0  0      1     1   1.0
1  1      1     1   1.0
2  2      1     1   1.0
3  3      1     1   1.0
4  4      1     1   1.0
77 60     1     1   1.0
78 61     1     1   1.0
79 62     1     1   1.0
80 63     1     1   1.0


In [43]:
# The second index level holds the row labels from `restaurants_new`
# (the right-hand frame of each pair).
matching_indices = matches.index.get_level_values(level=1)
matching_indices

Index([0, 1, 2, 3, 4, 60, 61, 62, 63], dtype='int64')

In [51]:
# Rows of `restaurants_new` that were NOT matched to an existing restaurant.
non_dup = restaurants_new.loc[~restaurants_new.index.isin(matching_indices)]

# Stack the original listing with the unmatched new rows, then drop the
# 'Unnamed: 0' column.
# NOTE(review): concat keeps each frame's own index labels, so labels from
# `non_dup` can repeat labels already present in `restaurants` — confirm that
# is acceptable for downstream label-based lookups.
full_restaurants = pd.concat([restaurants, non_dup]).drop(columns=['Unnamed: 0'])
full_restaurants

Unnamed: 0,name,addr,city,phone,type
0,kokomo,6333 w. third st.,la,2139330773,american
1,feenix,8358 sunset blvd. west,hollywood,2138486677,american
2,parkway,510 s. arroyo pkwy .,pasadena,8187951001,californian
3,r-23,923 e. third st.,los angeles,2136877178,japanese
4,gumbo,6333 w. third st.,la,2139330358,cajun/creole
...,...,...,...,...,...
56,eggslut,317 s. broadway,los angeles,2136250292,breakfast
57,baco mercat,408 s. main st.,los angeles,2136878808,spanish
58,hungry cat,7200 melrose ave.,los angeles,3239510926,seafood
59,sushi zo,334 s. main st.,los angeles,2139359265,japanese
