# 2.00 - Group 0 - Yelp API - Search

In [1]:
import os

# Move two directory levels up from where the kernel started (presumably the
# project root, since later cells import `lib.yelp_api` and read from `data/`).
# The starting directory is remembered the first time this cell runs, which
# keeps the cell idempotent: re-running it in a live kernel no longer climbs
# two more levels on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '../../'))

In [2]:
from sklearn.externals import joblib
import lib.yelp_api as yp
import pandas as pd
import requests
import json

### <span style="color:teal"> Load in Group </span>

In [3]:
# Load the restaurant group DataFrame that was pickled to data/la_rest_group_df.pkl.
# NOTE(review): `joblib` is imported from `sklearn.externals`, which newer
# scikit-learn releases no longer ship — confirm the pinned version.
# NOTE(review): joblib.load unpickles arbitrary objects; only use on trusted files.
rest_group_df = joblib.load('data/la_rest_group_df.pkl')

In [4]:
# Show the first row to confirm the load and to see the available columns.
rest_group_df.head(1)

Unnamed: 0,facility,address,city,zip_cd,num_at_address,max_seats,phone,risk_level,last_inspect_date,inspect_score,cat,closed_est,year
0,#1 CAFE,2080 CENTURY PARK E,LOS ANGELES,90067,1,30,+1nan,3,2016-08-18,90.0,,False,2016


## <span style="color:teal"> YELP API REQUEST </span>

#### <span style="color:teal"> Functions </span>

- Create our Yelp search list
    - `yp.create_searches`
- Yelp Search
    - `yp.yelp_api_calls`
- Select necessary Yelp parameters
    - `yp.format_srch_dict`
- Check that the searched restaurant matches at least one of the (up to 30) results Yelp returned, based on the digits of the address
    - `yp.pick_correct_matches`

#### <span style="color:teal"> Process </span>

- Run through the list of terms and locations to search Yelp, with a limit of 30 returns.
- Remove any searches that came up 'blank' (or, failed) so we can iterate through the dictionary and not worry about empty lists
- Pickle the dictionary so the raw request data is preserved in its original form while we work with it and make changes

Using the Yelp API, I search by restaurant name `(terms)` and address `(locations)`. I then take the returned JSON object and extract the fields I need. I'm grabbing the following information:
- business id (a unique id we can use to search Yelp later)
- business name
- price category (options are \$, \$\$, \$\$\$, \$\$\$\$)
- category type (I'm thinking this will help categorize the type of food served, and help determine saturation)
- closed (returns true if closed)
- address, city, zip to cross-check with LA county data
- long, lat (for some nice graphics later)

### <span style="color:purple"> Phone Requests </span>

In [5]:
# Build the Yelp search list for the phone-based lookups from the restaurant DataFrame.
phone_searches = yp.create_searches(rest_group_df, 'phone')

In [6]:
# Live Yelp API call for the phone searches, left disabled; phn_srch_dict is
# instead read back with yp.load_srch_dict in a later cell.
# phn_srch_dict, unsrchbl_phn = yp.yelp_api_calls(phone_searches, 'phone')

In [7]:
# Disabled along with the phone API call it depends on.
# NOTE(review): presumably collects phone searches that came back blank/failed;
# the meaning of the leading 11 is not visible here — confirm in lib.yelp_api.
# yp_phn_srch_fail = yp.find_failed_searches(11, phn_srch_dict, 'phone')

In [8]:
# Read the phone search results back via the helper instead of re-querying Yelp
# (presumably from a previously pickled run — confirm in lib.yelp_api).
# NOTE(review): 11 is an opaque argument also passed to find_failed_searches — confirm what it selects.
phn_srch_dict = yp.load_srch_dict(11, 'phone')

In [9]:
# Display the running pipeline counts (total rows and rows with a phone number, per the output below).
yp.return_proccess_tracker()

{'0_Total DataFrame': (9094, '100%'),
 '1_Restaurants with Phone Provided': (4613, '51.0%')}

In [10]:
# Keep only the Yelp fields needed downstream from each raw phone search result.
phn_frmt_dict = yp.format_srch_dict(phn_srch_dict, 'phone')

In [11]:
# Check each search against Yelp's returned results using the digits of the address.
# NOTE(review): phn_frmt_dict is reused afterwards with a 'match' column, so this
# presumably annotates it in place; the return value looks like the unmatched
# searches — confirm in lib.yelp_api.
unknwn_phn = yp.pick_correct_matches(phn_frmt_dict, 'phone')

In [12]:
# Build a DataFrame with one row per formatted phone search result, indexed by
# the dictionary key (the sample output below shows Yelp business ids).
# The dict views are materialised with list() so the constructor receives real
# sequences on Python 3 as well as Python 2 (older pandas rejects dict views).
phn_srch_df = pd.DataFrame(list(phn_frmt_dict.values()),
                           index=list(phn_frmt_dict.keys()),
                           columns=['search_name', 'search_address', 'bus_id', 'name',
                                    'price', 'cat_1', 'cat_2', 'closed', 'address',
                                    'city', 'zip_code', 'latitude', 'longitude', 'match'])

# Sort by the searched name and show a single row as a sanity check.
phn_srch_df.sort_values('search_name').head(1)

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude,match
general-cow-korean-bbq-los-angeles,#2 MOON BBQ,478 N WESTERN AVE,general-cow-korean-bbq-los-angeles,General Cow Korean BBQ,$$,korean,korean,True,480 N Western Ave,Los Angeles,90004,34.079647,-118.309013,0.0


### <span style="color:teal"> Term/Location Requests </span>

In [13]:
# Build the term/location search list. The phone results are passed as
# optional_data — presumably so restaurants already matched by phone are left
# out (the tracker output below reports the remaining count); confirm in lib.yelp_api.
trm_loc_searches = yp.create_searches(rest_group_df, 'term_loc', optional_data=phn_srch_df)

In [14]:
# Display the pipeline counts again; the output below now includes phone matches and the restaurants still to search.
yp.return_proccess_tracker()

{'0_Total DataFrame': (9094, '100%'),
 '1_Restaurants with Phone Provided': (4613, '51.0%'),
 '3_Successful Matches from Phone Searches': (2350, '26.0%'),
 '4_Remaining Restaurants to Search with Term/Location': (6737, '74.0%')}

In [15]:
# Query Yelp for every term/location search.
# NOTE(review): unlike the phone section, this live API call is not disabled,
# and trm_loc_srch_dict is overwritten by yp.load_srch_dict two cells later —
# confirm whether this cell should be skipped on re-runs.
trm_loc_srch_dict, unsrchbl_trm_loc = yp.yelp_api_calls(trm_loc_searches, 'term_loc')

In [16]:
# NOTE(review): presumably collects term/location searches that came back
# blank/failed; the meaning of the leading 11 is not visible here — confirm in lib.yelp_api.
trm_loc_srch_fail = yp.find_failed_searches(11, trm_loc_srch_dict, 'term_loc')

In [17]:
# Rebind trm_loc_srch_dict to what the load helper returns (presumably the
# pickled search results — confirm in lib.yelp_api). This replaces the value
# returned by the API call cell.
trm_loc_srch_dict = yp.load_srch_dict(11, 'term_loc')

In [18]:
# Display the pipeline counts; the output below adds the successful term/location searches.
yp.return_proccess_tracker()

{'0_Total DataFrame': (9094, '100%'),
 '1_Restaurants with Phone Provided': (4613, '51.0%'),
 '3_Successful Matches from Phone Searches': (2350, '26.0%'),
 '4_Remaining Restaurants to Search with Term/Location': (6737, '74.0%'),
 '5_Restaurants with Successful Term/Location Searches': (5861, '64.0%')}

In [19]:
# Keep only the Yelp fields needed downstream from each raw term/location search result.
trm_loc_frmt_dict = yp.format_srch_dict(trm_loc_srch_dict, 'term_loc')

In [20]:
# Check each search against Yelp's returned results using the digits of the address.
# NOTE(review): trm_loc_frmt_dict is reused afterwards with a 'match' column, so
# this presumably annotates it in place; the return value looks like the
# unmatched searches — confirm in lib.yelp_api.
unknwn_trm_loc = yp.pick_correct_matches(trm_loc_frmt_dict, 'term_loc')

In [21]:
# Display the final pipeline counts, including term/location matches and the overall match total (output below).
yp.return_proccess_tracker()

{'0_Total DataFrame': (9094, '100%'),
 '1_Restaurants with Phone Provided': (4613, '51.0%'),
 '3_Successful Matches from Phone Searches': (2350, '26.0%'),
 '4_Remaining Restaurants to Search with Term/Location': (6737, '74.0%'),
 '5_Restaurants with Successful Term/Location Searches': (5861, '64.0%'),
 '6_Successful Matches from Term/Loc Searches': (1183, '13.0%'),
 '7_Successful Matches all Searches': (3533, '39.0%')}

In [22]:
# Build a DataFrame with one row per formatted term/location search result,
# indexed by the dictionary key (the sample output below shows Yelp business ids).
# The dict views are materialised with list() so the constructor receives real
# sequences on Python 3 as well as Python 2 (older pandas rejects dict views).
trm_loc_srch_df = pd.DataFrame(list(trm_loc_frmt_dict.values()),
                               index=list(trm_loc_frmt_dict.keys()),
                               columns=['search_name', 'search_address', 'bus_id', 'name',
                                        'price', 'cat_1', 'cat_2', 'closed', 'address',
                                        'city', 'zip_code', 'latitude', 'longitude', 'match'])

# Sort by the searched name and show a single row as a sanity check.
trm_loc_srch_df.sort_values('search_name').head(1)

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude,match
us-chinese-food-inglewood,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,us-chinese-food-inglewood,US Chinese Food,,chinese,chinese,False,3240 W Century Blvd,Inglewood,90303,33.945335,-118.328347,0.0


### <span style="color:teal"> Concatenate DataFrames </span>

In [23]:
# Stack the term/location results on top of the phone results; both frames
# share the same columns and each keeps its own index labels.
all_srch_rslts = pd.concat([trm_loc_srch_df, phn_srch_df])

In [24]:
# Count the matched Yelp listings per (search_name, search_address, name).
# The deprecated `squeeze` groupby argument is dropped: it has no effect on
# this aggregation and is no longer accepted by current pandas.
grpd_mtch_srch_rslts = (
    all_srch_rslts[all_srch_rslts['match'] == 1.0]
    .groupby(['search_name', 'search_address', 'name'], as_index=False)
    .agg({'bus_id': 'count'})
)
# Show the searches that matched more than one listing.
grpd_mtch_srch_rslts[grpd_mtch_srch_rslts['bus_id'] > 1]

Unnamed: 0,search_name,search_address,name,bus_id
318,BLACK ROSE TAVERN/ THE JOINT,8771 W PICO BLVD,The Joint,2
795,COUNT POTATO,1125 W 6TH ST,Count Potato,2
925,DONUT KING,1601 E IMPERIAL HWY,Donut King,2
1448,HOMEGIRL CAFE,130 W BRUNO ST,Homeboy Industries,2
1467,HONG KONG EXPRESS,3330 W FLORENCE AVE,Hong Kong Express,2
1477,HOP LOUIE,950 MEI LING WAY,Hop Louie,2
1635,JOANIE AND LEIGHS CAKES,2323 ROSCOMARE RD,Joanie & Leigh's Cakes,2
1751,KING EDDY SALOON,131 E 5TH ST,King Eddy Saloon,2
2307,MILENA BAKERY,5216 W SUNSET BLVD,Goldstar Bakery,2
2448,NEW WORLD BUFFET,1419 E GAGE AVE,New World Buffet,2


### <span style="color:teal"> Read in Yelp Categories </span>

In [25]:
# Load the Yelp category definitions; the 'alias' and 'parents' columns are used below.
categories = pd.read_json('data/categories.json')

In [26]:
# Pull out each Yelp category alias and its list of parent categories.
cat_alias = categories['alias'].values
cat_prnts = categories['parents'].values

# Materialise the pairs: a bare zip() is a one-shot iterator on Python 3, so a
# cell that loops over cat_list would see nothing the second time it is run.
cat_list = list(zip(cat_alias, cat_prnts))

In [27]:
# Map each category alias to its parents, skipping categories whose parent
# list is empty. A comprehension keeps the loop variables local, so the
# `cat_alias` array defined in the previous cell is no longer overwritten with
# a single string.
cat_dict = {alias: parents for alias, parents in cat_list if parents != []}

In [28]:
# Keep only the categories that list 'restaurants' among their parents.
rest_food_dict = {
    alias: parents
    for alias, parents in cat_dict.items()
    if 'restaurants' in parents
}

In [29]:
# Hand-picked category aliases to treat as food-related in addition to the
# 'restaurants' children collected above.
relevant_food_cat = [
    'coffee', 'delicatessen', 'bagels', 'milkshakebars',
    'poke', 'shavedice', 'bento', 'tea', 'bakeries', 'donuts',
    'gourmet', 'desserts', 'cupcakes', 'churros', 'empanadas',
    'smokehouse', 'pretzels', 'acaibowls', 'icecream', 'gelato',
    'ramen',
]

# Register each alias with its own ['food'] parent list.
rest_food_dict.update((alias, ['food']) for alias in relevant_food_cat)

### <span style="color:teal"> Trim DataFrame to Only Represent Restaurants and Food </span>

In [30]:
def filter_food_rest(x):
    """Return 1 when category alias `x` is a key of `rest_food_dict`, otherwise 0."""
    return int(x in rest_food_dict)

# Flag each of the two Yelp category columns as food/restaurant (1) or not (0).
for flag_col, cat_col in (('rst_fd_1', 'cat_1'), ('rst_fd_2', 'cat_2')):
    all_srch_rslts[flag_col] = all_srch_rslts[cat_col].apply(filter_food_rest)

# Combined score: 0 when neither category is food-related, up to 2 when both are.
all_srch_rslts['rst_fd'] = all_srch_rslts['rst_fd_1'] + all_srch_rslts['rst_fd_2']

In [31]:
# Keep rows that are confirmed matches and have at least one food/restaurant category.
mtch_srch_rslts = all_srch_rslts.loc[
    all_srch_rslts['match'].eq(1) & all_srch_rslts['rst_fd'].gt(0)
]

In [32]:
# Drop the helper columns (match flag and food flags), keeping the search and Yelp fields.
mtch_srch_rslts = mtch_srch_rslts.loc[:, [
    'search_name', 'search_address', 'bus_id',
    'name', 'price', 'cat_1', 'cat_2', 'closed',
    'address', 'city', 'zip_code', 'latitude',
    'longitude',
]]

In [34]:
# Summarise row count, dtypes and non-null counts of the matched results.
mtch_srch_rslts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3501 entries, puerto-angelito-los-angeles to the-big-burrito-los-angeles
Data columns (total 13 columns):
search_name       3501 non-null object
search_address    3501 non-null object
bus_id            3501 non-null object
name              3501 non-null object
price             3305 non-null object
cat_1             3501 non-null object
cat_2             3501 non-null object
closed            3501 non-null bool
address           3501 non-null object
city              3501 non-null object
zip_code          3501 non-null object
latitude          3501 non-null float64
longitude         3501 non-null float64
dtypes: bool(1), float64(2), object(10)
memory usage: 359.0+ KB


In [37]:
# Share of matched restaurants that Yelp flags as closed.
# Series.mean() on the boolean column gives the same ratio as sum/len without
# the Python 2 float coercion, and yields NaN instead of raising when the
# frame is empty.
mtch_srch_rslts['closed'].mean()

0.2262210796915167

In [35]:
# Preview the first rows of the matched food/restaurant results.
mtch_srch_rslts.head()

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude
puerto-angelito-los-angeles,PUERTO ANGELITO MEXICAN FOOD,4626 AVALON BLVD,puerto-angelito-los-angeles,Puerto Angelito,,mexican,mexican,False,4626 Avalon Blvd,Los Angeles,90011,34.001254,-118.265125
moo-greek-yogurt-los-angeles,MOO YOGURT,11753 SAN VICENTE BLVD,moo-greek-yogurt-los-angeles,Moo Greek Yogurt,$,icecream,gelato,False,11753 San Vicente Blvd,Los Angeles,90049,34.053408,-118.467877
nak-won-restaurant-los-angeles,NAKWON CATERING,4564 W BEVERLY BLVD,nak-won-restaurant-los-angeles,Nak Won Restaurant,,korean,korean,False,4564 Beverly Blvd,Los Angeles,90004,34.076215,-118.308341
little-caesars-los-angeles-23,LITTLE CAESARS,5053 E WHITTIER BLVD,little-caesars-los-angeles-23,Little Caesars,$,pizza,pizza,False,5053 Whittier Blvd,Los Angeles,90022,34.021225,-118.160461
tom-n-toms-los-angeles-12,TOM N TOM,440 S VERMONT AVE,tom-n-toms-los-angeles-12,Tom N Toms,$,cafes,cafes,False,440 S Vermont Ave,Los Angeles,90020,34.065788,-118.291077


In [41]:
# Persist the matched results for later use.
mtch_srch_rslts.to_pickle('data/mtch_srch_rslts_pd.pkl')