# Capstone - Restaurant Bubble - YELP API

In [1]:
import os
os.chdir('../')

In [2]:
from urllib import urlencode
from datetime import datetime
from sklearn.externals import joblib
import lib.yelp_api as yp
import lib.county_data as ctd
import pandas as pd
import requests
import json
import re

### I. Split County Data

I'm going to split my data into 10 separate groups, so I can call on the Yelp API without getting in trouble (limit is 25,000).

In [3]:
rest_df = joblib.load('data/rest_group_df.pkl')

In [4]:
rest_df.head()

Unnamed: 0,facility,address,city,zip_cd,num_at_address,max_seats,phone,risk_level,last_inspect_date,inspect_score
0,#1 CAFE,2080 CENTURY PARK E,LOS ANGELES,90067,1,30,+1nan,3,08/18/2016,90.0
1,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,LOS ANGELES,90044,1,30,+12132358779,3,08/10/2016,92.0
2,#1 DONUT,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,+1nan,2,05/17/2016,98.0
3,#1 DONUTS,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,+16266957898,2,07/26/2016,91.0
4,#1 JUICY DUMPLING,140 W VALLEY BLVD,SAN GABRIEL,91776,12,60,+16262164207,3,11/02/2016,92.0


In [5]:
split_num = len(rest_df['facility'])/10

In [6]:
for i in range(10):
    split_df = rest_df.iloc[i * split_num:(i + 1) * split_num]
    joblib.dump(split_df, 'data/rest_group_{}_df.pkl'.format(i))

In [7]:
rest_group_0_df = joblib.load('data/rest_group_0_df.pkl')
rest_group_1_df = joblib.load('data/rest_group_1_df.pkl')
rest_group_2_df = joblib.load('data/rest_group_2_df.pkl')
rest_group_3_df = joblib.load('data/rest_group_3_df.pkl')
rest_group_4_df = joblib.load('data/rest_group_4_df.pkl')
rest_group_5_df = joblib.load('data/rest_group_5_df.pkl')
rest_group_6_df = joblib.load('data/rest_group_6_df.pkl')
rest_group_7_df = joblib.load('data/rest_group_7_df.pkl')
rest_group_8_df = joblib.load('data/rest_group_8_df.pkl')
rest_group_9_df = joblib.load('data/rest_group_9_df.pkl')

## <span style="color:teal"> YELP API REQUEST - BUSINESSES </span>

### II. Get initial Yelp data using Business API

Get my parameters ready for my Yelp search.

In [8]:
bearer_token = yp.obtain_bearer_token()

In [9]:
url_search = 'https://api.yelp.com/v3/businesses/search'
headers = {'Authorization': 'Bearer {}'.format(bearer_token)}

In [10]:
current_df = rest_group_0_df

In [11]:
current_df.head()

Unnamed: 0,facility,address,city,zip_cd,num_at_address,max_seats,phone,risk_level,last_inspect_date,inspect_score
0,#1 CAFE,2080 CENTURY PARK E,LOS ANGELES,90067,1,30,+1nan,3,08/18/2016,90.0
1,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,LOS ANGELES,90044,1,30,+12132358779,3,08/10/2016,92.0
2,#1 DONUT,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,+1nan,2,05/17/2016,98.0
3,#1 DONUTS,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,+16266957898,2,07/26/2016,91.0
4,#1 JUICY DUMPLING,140 W VALLEY BLVD,SAN GABRIEL,91776,12,60,+16262164207,3,11/02/2016,92.0


In [12]:
current_df['zip_cd'] = current_df['zip_cd'].astype('str')

In [13]:
terms = current_df['facility'].values
locations = (current_df['address'].values + ', ' + current_df['city'].values + ' ' 
             + current_df['zip_cd'].values) + ' CA'

In [14]:
term_loc = zip(terms, locations)

In [15]:
len(term_loc)

3216

### <span style="color:purple"> Making the Request for Business IDs through name and location </span>

- Run through the list of terms and locations to search Yelp, with a limit of 30 returns.
- Remove any searches that came up 'blank' (or, failed) so we can iterate through the dictionary and not worry about empty lists
- Pickle the dictionary so we can maintain the request data in its' purest format as we work with it and make changes

In [17]:
r_dict = {}
unsearchables = []

for i in term_loc:
    url_params = {'location': i[1],
                  'term': i[0],
                  'limit': 30,
                  'locale': 'en_US'}
    try:
        r_dict[i] = dict(requests.request('GET', url_search, 
                                          headers=headers, 
                                          params=url_params).json())['businesses']
    except:
        unsearchables.append(url_params)

In [18]:
yp_search_failures = []
for key, value in r_dict.items():
    if value == []:
        yp_search_failures.append(key)
        r_dict.pop(key, value)

In [20]:
len(yp_search_failures)

298

In [21]:
joblib.dump(r_dict, 'data/yelp_search_dict_v2.pkl')

['data/yelp_search_dict_v2.pkl']

### III. Sift through data from Yelp API and create a meaningful `DataFrame`

Using Yelp's requests, I'm going to search the restaurant name `(terms)` and addresses `(locations)`. I then grab the json object, and sift through to receive the information I want. I'm grabbing the following information:
- business id (a unique id we can use to search Yelp later)
- business name
- price category (options are \$, \$\$, \$\$\$, \$\$\$\$)
- category type (I'm thinking this will help categorize the type of food served, and help determine saturation)
- closed (returns true if closed)
- address, city, zip to cross-check with LA county data
- long, lat (for some nice graphics later)

In [22]:
r_dict = joblib.load('data/yelp_search_dict_v2.pkl')

In [23]:
rest_dict_format = {}

for rest, value in r_dict.items():
    search_name = rest[0]
    search_address = rest[1].split(',')[0]
    for i, x in enumerate(value):
        max_items = i + 1
    for i in range(max_items):
        values = value[i]
        bus_id = values['id']
        name = values['name']
        try:
            price = values['price']
        except:
            price = None
        try:    
            cat_1 = values['categories'][0]['alias']
        except:
            cat_1 = None
        try:
            cat_2 = values['categories'][1]['alias']
        except:
            cat_2 = cat_1
        closed = values['is_closed']
        address = values['location']['address1']
        city = values['location']['city']
        zip_code = values['location']['zip_code']
        latitude = values['coordinates']['latitude']
        longitude = values['coordinates']['longitude']
        indiv_rest_list = [search_name, search_address, bus_id, name, 
                           price, cat_1, cat_2, closed, address, city, 
                           zip_code, latitude, longitude]
        rest_dict_format[bus_id] = indiv_rest_list

Below, I'll maintain a list of addresses that did not come through the search as I sift through the addresses and check Yelp's accuracy of pulling the correct addresses requested.

In [24]:
unknown_addresses = {}

for key, attribute in rest_dict_format.items():
    search_address = attribute[1].split()[0]
    yelp_address = attribute[8]
    try:
        split_address = yelp_address.split()[0]
        if split_address == search_address:
            attribute.append(1)
        else:
            attribute.append(0)
    except:
        unknown_addresses[key] = attribute

In [26]:
yelp_df = pd.DataFrame(rest_dict_format.values(), index = rest_dict_format.keys(), 
                       columns = ['search_name', 'search_address', 'bus_id', 'name', 
                                  'price', 'cat_1', 'cat_2', 'closed','address', 
                                  'city', 'zip_code', 'latitude', 'longitude', 'match'])

### IV. Check Yelp's accuracy in finding the correct restaurants with addresses

In [28]:
yelp_df_match = yelp_df[yelp_df['match'] == 1.0]

In [29]:
match_name = yelp_df_match['search_name'].values
match_address = yelp_df_match['search_address'].values

In [30]:
name_add_searched = set(zip(match_name,match_address))

In [31]:
yelp_matches = len(name_add_searched)

In [32]:
yelp_df_mismatch = yelp_df[yelp_df['match'] == 0.0]

In [33]:
def indicate_match(x, y):
    if x in match_name and y in match_address:
        return 1
    else:
        return 0
    
yelp_df_mismatch['filter'] = yelp_df_mismatch.apply(lambda col: indicate_match(col['search_name'],
                                                                               col['search_address']),axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [77]:
yelp_df_mismatch[yelp_df_mismatch['filter'] == 0].sort_values('search_name')

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude,match,filter
panda-express-inglewood,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,panda-express-inglewood,Panda Express,$,chinese,hotdogs,False,3351 W Century Blvd Space 101,Inglewood,90303,33.945930,-118.331108,0.0,0
canton-kitchen-inglewood,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,canton-kitchen-inglewood,Canton Kitchen,$,chinese,chinese,False,402 W Manchester Blvd,Inglewood,90301,33.961553,-118.362511,0.0,0
china-gate-los-angeles,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,china-gate-los-angeles,China Gate,$,chinese,chinese,False,11814 Wilmington Ave,Los Angeles,90059,33.925688,-118.238958,0.0,0
louisiana-famous-fried-chicken-los-angeles-11,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,louisiana-famous-fried-chicken-los-angeles-11,Louisiana Famous Fried Chicken,$,hotdogs,chickenshop,False,4400 S Central Ave,Los Angeles,90011,34.003772,-118.256118,0.0,0
chubby-rice-hawthorne,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,chubby-rice-hawthorne,Chubby Rice,$$,chinese,asianfusion,False,12836 Inglewood Ave,Hawthorne,90250,33.916135,-118.361230,0.0,0
subway-los-angeles-136,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,subway-los-angeles-136,Subway,$,sandwiches,hotdogs,False,12730 S Figueroa St,Los Angeles,90061,33.916744,-118.282356,0.0,0
serees-coffee-shop-los-angeles,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,serees-coffee-shop-los-angeles,Seree's Coffee Shop,$,breakfast_brunch,chinese,False,2800 S Grand Ave,Los Angeles,90007,34.024440,-118.273155,0.0,0
louisiana-famous-fried-chicken-los-angeles-8,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,louisiana-famous-fried-chicken-los-angeles-8,Louisiana Famous Fried Chicken,$,hotdogs,hotdogs,False,12730 S Figueroa,Los Angeles,90061,33.916740,-118.282364,0.0,0
hong-kong-express-los-angeles-16,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,hong-kong-express-los-angeles-16,Hong Kong Express,$,chinese,chinese,False,3330 W Florence Ave,Los Angeles,90043,33.974232,-118.330498,0.0,0
louisiana-fried-chicken-los-angeles-6,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,louisiana-fried-chicken-los-angeles-6,Louisiana Fried Chicken,$,cajun,hotdogs,False,1401 W Manchester Ave,Los Angeles,90047,33.960312,-118.300644,0.0,0


In [34]:
name_re_search = yelp_df_mismatch[yelp_df_mismatch['filter'] == 0]['search_name'].values
address_re_search = yelp_df_mismatch[yelp_df_mismatch['filter'] == 0]['search_address'].values

In [35]:
name_add_re_search = set(zip(name_re_search,address_re_search))

In [36]:
yelp_mis_matches = len(name_add_re_search)

In [37]:
'Yelp accurately returned {}% of our search.'.format(round(yelp_matches*100./(yelp_matches + 
                                                                 yelp_mis_matches),2))

'Yelp accurately returned 40.02% of our search.'

### <span style="color:purple"> Making the Request for Business IDs through a phone number </span>

- Let's not give up on Yelp yet--I've heard their phone search can be fairly accurate, and we actually have that information!
- Using the county data, I'll grab the phone of those mismatches and run it through Yelp again

In [38]:
name_add_re_search_list = []

for i in name_add_re_search:
    name = i[0]
    address = i[1]
    name_add_re_search_list.append((name, address)) 

In [39]:
search_2_df = pd.DataFrame(name_add_re_search_list, columns = ['facility', 'address'])

In [40]:
search_2_join_df = pd.merge(search_2_df, current_df, how='inner')

In [41]:
phone_search_df = search_2_join_df[['facility', 'address', 'phone']]

In [42]:
phone_search_df_ = phone_search_df.groupby(['facility', 'address']).agg({'phone': 'count'})

In [43]:
phone_search_df_[phone_search_df_['phone'] > 1.]

Unnamed: 0_level_0,Unnamed: 1_level_0,phone
facility,address,Unnamed: 2_level_1
BELWOOD BAKERY,8735 TAMPA AVE,2


In [44]:
phone_search_group = phone_search_df.groupby(['facility', 'address'], as_index=False).agg({'phone': 'max'})

In [45]:
facil = phone_search_group['facility'].values
add = phone_search_group['address'].values
phone = phone_search_group['phone'].values

In [46]:
search_tups = zip(facil,add,phone)

In [47]:
search_tups_non_nan = [i for i in search_tups if i[2] != '+1nan']

In [48]:
('We are only able to search {}% of the remaining data \
given that we only have a portion of the phone numbers').format(round(((len(search_tups_non_nan))*100)
                                                                    / len(search_tups)),2)

'We are only able to search 51.0% of the remaining data given that we only have a portion of the phone numbers'

In [49]:
# url_phone = 'https://api.yelp.com/v3/businesses/search/phone'

In [50]:
# r_phone_dict = {}
# unsearchables_phone = []

# for i in search_tups_non_nan:
#     url_params = {'phone': i[2]}
#     for u_ in url_params:
#         try:
#             r_phone_dict[i[0],i[1]] = dict(requests.request('GET', url_phone, 
#                                                             headers=headers, 
#                                                             params=url_params).json())['businesses']
#         except:
#             unsearchables_phone.append(i)

In [51]:
# yp_phone_search_failures = []

# for key, value in r_phone_dict.items():
#     if value == []:
#         yp_phone_search_failures.append(key)
#         r_phone_dict.pop(key, value)

In [52]:
# len(yp_phone_search_failures)

285

In [53]:
# joblib.dump(r_phone_dict, 'data/yelp_search_phone_dict.pkl')

['data/yelp_search_phone_dict.pkl']

In [54]:
r_phone_dict = joblib.load('data/yelp_search_phone_dict.pkl')

In [55]:
rest_phone_dict_format = {}

for rest, value in r_phone_dict.items():
    search_name = rest[0]
    search_address = rest[1].split(',')[0]
    for i, x in enumerate(value):
        max_items = i + 1
    for i in range(max_items):
        values = value[i]
        bus_id = values['id']
        name = values['name']
        try:
            price = values['price']
        except:
            price = None
        try:    
            cat_1 = values['categories'][0]['alias']
        except:
            cat_1 = None
        try:
            cat_2 = values['categories'][1]['alias']
        except:
            cat_2 = cat_1
        closed = values['is_closed']
        address = values['location']['address1']
        city = values['location']['city']
        zip_code = values['location']['zip_code']
        latitude = values['coordinates']['latitude']
        longitude = values['coordinates']['longitude']
        indiv_rest_list = [search_name, search_address, bus_id, name, 
                           price, cat_1, cat_2, closed, address, city, 
                           zip_code, latitude, longitude]
        rest_phone_dict_format[bus_id] = indiv_rest_list

In [57]:
unknown_phones = {}

for key, attribute in rest_phone_dict_format.items():
    search_address = attribute[1].split()[0]
    yelp_address = attribute[8]
    try:
        split_address = yelp_address.split()[0]
        if split_address == search_address:
            attribute.append(1)
        else:
            attribute.append(0)
    except:
        unknown_phones[key] = attribute

In [58]:
yelp_phone_df = pd.DataFrame(rest_phone_dict_format.values(), index = rest_phone_dict_format.keys(), 
                       columns = ['search_name', 'search_address', 'bus_id', 'name', 
                                  'price', 'cat_1', 'cat_2', 'closed','address', 
                                  'city', 'zip_code', 'latitude', 'longitude', 'match'])

In [89]:
yelp_phone_df

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude,match
340-restauraunt-and-nightclub-pomona,340 RESTAURANT & NIGHTCLUB,340 S THOMAS ST,340-restauraunt-and-nightclub-pomona,340 Restauraunt & Nightclub,$$,danceclubs,gaybars,False,340 S Thomas St,Pomona,91766,34.056374,-117.750870,1
bites-los-angeles-2,BITES I,10960 WILSHIRE BLVD,bites-los-angeles-2,Bites,$,sandwiches,salad,False,10960 Wilshire Blvd,Los Angeles,90024,34.057552,-118.446152,1
alex-in-the-kitchen-burgers-canoga-park,ALEX IN THE KITCHEN FAMOUS BURGER,21008 SHERMAN WAY,alex-in-the-kitchen-burgers-canoga-park,Alex In the Kitchen Burgers,$,mexican,burgers,True,21008 Sherman Way,Canoga Park,91303,34.200945,-118.590947,1
abc-seafood-los-angeles,ABC RESTAURANT,708 NEW HIGH ST,abc-seafood-los-angeles,ABC Seafood,$$,seafood,seafood,False,205 Ord St,Los Angeles,90012,34.060047,-118.238029,0
wienerschnitzel-torrance-3,BABE'S SOUTHERN GRILL,4509 TORRANCE BLVD,wienerschnitzel-torrance-3,Wienerschnitzel,$,hotdog,hotdog,True,4509 Torrance Blvd,Torrance,90503,33.837704,-118.361511,1
bottlefish-los-angeles-2,BOTTLE FISH,11677 SAN VICENTE BLVD,bottlefish-los-angeles-2,Bottlefish,$$$,bars,seafood,False,11677 San Vicente Blvd,Los Angeles,90049,34.054000,-118.464759,1
novel-cafe-santa-monica-4,BOWL KITCHEN,2127 LINCOLN BLVD,novel-cafe-santa-monica-4,Novel Cafe,$$,sandwiches,coffee,True,2127 Lincoln Blvd,Santa Monica,90405,34.010490,-118.480309,1
brü-haus-los-angeles-2,BRU HAUS,11831 WILSHIRE BLVD,brü-haus-los-angeles-2,BRÜ HAUS,$$,gastropubs,lounges,False,11831 Wilshire Blvd,Los Angeles,90025,34.047770,-118.463540,1
21-bistro-monrovia,21 BISTRO,110 E LIME AVE,21-bistro-monrovia,21 Bistro,$$,french,newamerican,True,110 E Lime St,Monrovia,91016,34.148030,-118.000615,1
atmosphere-cafe-mar-vista-los-angeles,ATMOSPHERE CAFE,12034 VENICE BLVD,atmosphere-cafe-mar-vista-los-angeles,Atmosphere Cafe Mar Vista,$$,salad,breakfast_brunch,False,12034 Venice Blvd,Los Angeles,90066,34.005493,-118.429707,1


### IV. Check Yelp's accuracy in finding the correct restaurants with phone

In [59]:
yelp_phone_df_match = yelp_phone_df[yelp_phone_df['match'] == 1.0]

In [60]:
match_phone_name = yelp_phone_df_match['search_name'].values
match_phone_address = yelp_phone_df_match['search_address'].values

In [61]:
name_add_phone_searched = set(zip(match_phone_name, match_phone_address))

In [62]:
yelp_phone_matches = len(name_add_phone_searched)

In [70]:
yelp_phone_matches

425

In [63]:
yelp_phone_df_mismatch = yelp_phone_df[yelp_phone_df['match'] == 0.0]

In [64]:
def indicate_match(x, y):
    if x in match_phone_name and y in match_phone_address:
        return 1
    else:
        return 0
    
yelp_phone_df_mismatch['filter'] = yelp_phone_df_mismatch.apply(lambda col: indicate_match(col['search_name'],
                                                                               col['search_address']),axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [78]:
name_msmtch = yelp_phone_df_mismatch[yelp_phone_df_mismatch['filter'] == 0]['search_name'].values
address_msmtch = yelp_phone_df_mismatch[yelp_phone_df_mismatch['filter'] == 0]['search_address'].values

In [79]:
name_add_msmtch = set(zip(name_msmtch, address_msmtch))

In [80]:
yelp_phone_mis_matches = len(name_add_msmtch)

In [81]:
'Yelp accurately returned {}% of our search.'.format(round(yelp_phone_matches*100./(yelp_phone_matches + 
                                                                 yelp_phone_mis_matches),2))

'Yelp accurately returned 87.81% of our search.'

In [86]:
yelp_phone_df_mismatch[yelp_phone_df_mismatch['filter'] == 0]

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude,match,filter
abc-seafood-los-angeles,ABC RESTAURANT,708 NEW HIGH ST,abc-seafood-los-angeles,ABC Seafood,$$,seafood,seafood,False,205 Ord St,Los Angeles,90012,34.060047,-118.238029,0,0
baskin-robbins-west-covina-2,BASKIN ROBBINS,1014 PLAZA DR,baskin-robbins-west-covina-2,Baskin Robbins,$$,icecream,icecream,False,1200 Plaza Dr,West Covina,91790,34.069751,-117.927649,0,0
borneo-kalimantan-cuisine-alhambra-2,BORNEO KITCHEN,989 S GLENDORA AVE,borneo-kalimantan-cuisine-alhambra-2,Borneo Kalimantan Cuisine,$,indonesian,singaporean,False,19 South Garfield Ave,Alhambra,91801,34.094620,-118.126816,0,0
pizza-rustica-los-angeles-2,9TH STREET & PASTA,227 E 9TH ST,pizza-rustica-los-angeles-2,Pizza Rustica,$,italian,pizza,True,231 E 9th St,Los Angeles,90014,34.040507,-118.253774,0,0
8-esquinas-grill-el-monte,8 ESQUINAS,10612 VALLEY MALL,8-esquinas-grill-el-monte,8 Esquinas Grill,$,mexican,mexican,False,10624-10630 Valley Mall,El Monte,91731,34.076408,-118.040275,0,0
antojitos-alondra-norwalk,ANTOJITOS JAZMIN,15716 1/2 PIONEER BLVD,antojitos-alondra-norwalk,Antojitos Alondra,$,mexican,mexican,False,157161 1/2 Pionner Blvd,Norwalk,90650,33.918652,-118.081886,0,0
a-divine-h2o-los-angeles-2,A DIVINE H2O,14427 VENTURA BLVD,a-divine-h2o-los-angeles-2,A Divine H2O,$$,waterdelivery,juicebars,False,11701 Wilshire Blvd,Los Angeles,90025,34.049442,-118.461472,0,0
bruddahs-hawaiian-foods-gardena,BRUDDAH'S BAR & GRILL,1430 W REDONDO BEACH BLVD,bruddahs-hawaiian-foods-gardena,Bruddah's Hawaiian Foods,$,hawaiian,hawaiian,True,1033 W Gardena Blvd,Gardena,90247,33.881607,-118.292611,0,0
starbucks-torrance-19,BARNES & NOBLE BOOKSELLERS,21500 HAWTHORNE BLVD,starbucks-torrance-19,Starbucks,$,coffee,coffee,False,21400 Hawthorne Blvd,Torrance,90503,33.833519,-118.352196,0,0
university-club-los-angeles-2,BANQUET ROOM 3RD FLOOR,5151 E STATE UNIVERSITY DR,university-club-los-angeles-2,University Club,$$,tradamerican,tradamerican,False,Cal State University Los Angeles,Los Angeles,90032,34.066981,-118.168589,0,0


In [28]:
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'ñ', 'n')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'°C', ' celsius')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'Ü', 'u')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'ü', 'u')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'é', 'e')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'è', 'e')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'重', '')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'庆', '')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'小', '')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'面', '')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'ö', 'o')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'á', 'a')
yelp_final_df['name'] = yelp_final_df['name'].str.replace(u'á', 'a')

In [29]:
yelp_final_df['name'] = yelp_final_df['name'].str.replace(' & ', ' and ')
yelp_final_df['search_name'] = yelp_final_df['search_name'].str.replace(' & ', ' and ')

In [30]:
yelp_final_df['search_address'] = yelp_final_df['search_address'].str.replace(' N ', ' ')
yelp_final_df['address'] = yelp_final_df['address'].str.replace(' N ', ' ')
yelp_final_df['search_address'] = yelp_final_df['search_address'].str.replace(' E ', ' ')
yelp_final_df['address'] = yelp_final_df['address'].str.replace(' E ', ' ')
yelp_final_df['search_address'] = yelp_final_df['search_address'].str.replace(' S ', ' ')
yelp_final_df['address'] = yelp_final_df['address'].str.replace(' S ', ' ')
yelp_final_df['search_address'] = yelp_final_df['search_address'].str.replace(' W ', ' ')
yelp_final_df['address'] = yelp_final_df['address'].str.replace(' W ', ' ')

In [None]:
rest_df = joblib.load('data/rest_group_df.pkl')