# Capstone - Restaurant Bubble - YELP API

In [1]:
import os
os.chdir('../')

In [2]:
from urllib import urlencode
from datetime import datetime
from sklearn.externals import joblib
import lib.yelp_api as yp
import requests
import pandas as pd
import json

### I. Split County Data

I'm going to split my data into 10 separate groups, so I can call on the Yelp API without getting in trouble (limit is 25,000).

In [3]:
rest_df = joblib.load('data/rest_group_df.pkl')

In [4]:
rest_df[rest_df['facility'].str.contains('BUDDY CHICKEN')]

Unnamed: 0,facility,address,city,zip_cd,num_at_address,max_seats,risk_level,last_inspect_date,inspect_score
3210,BUDDY CHICKEN,400 S BALDWIN AVE,ARCADIA,91007,68,30,3,12/06/2016,95.0


In [5]:
rest_df.head()

Unnamed: 0,facility,address,city,zip_cd,num_at_address,max_seats,risk_level,last_inspect_date,inspect_score
0,#1 CAFE,2080 CENTURY PARK E,LOS ANGELES,90067,1,30,3,08/18/2016,90.0
1,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,LOS ANGELES,90044,1,30,3,08/10/2016,92.0
2,#1 DONUT,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,2,05/17/2016,98.0
3,#1 DONUTS,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,2,07/26/2016,91.0
4,#1. JUICY DUMPLING,140 W VALLEY BLVD,SAN GABRIEL,91776,12,60,3,11/02/2016,92.0


In [6]:
split_num = len(rest_df['facility'])/10

In [7]:
for i in range(10):
    split_df = rest_df.iloc[i * split_num:(i + 1) * split_num]
    joblib.dump(split_df, 'data/rest_group_{}_df.pkl'.format(i))

In [8]:
rest_group_0_df = joblib.load('data/rest_group_0_df.pkl')
rest_group_1_df = joblib.load('data/rest_group_1_df.pkl')
rest_group_2_df = joblib.load('data/rest_group_2_df.pkl')
rest_group_3_df = joblib.load('data/rest_group_3_df.pkl')
rest_group_4_df = joblib.load('data/rest_group_4_df.pkl')
rest_group_5_df = joblib.load('data/rest_group_5_df.pkl')
rest_group_6_df = joblib.load('data/rest_group_6_df.pkl')
rest_group_7_df = joblib.load('data/rest_group_7_df.pkl')
rest_group_8_df = joblib.load('data/rest_group_8_df.pkl')
rest_group_9_df = joblib.load('data/rest_group_9_df.pkl')

## <span style="color:teal"> YELP API REQUEST - BUSINESSES </span>

In [9]:
bearer_token = yp.obtain_bearer_token()

### II. Get Yelp data using Business API

Using Yelp's requests, I'm going to search the restaurant name `(terms)` and addresses `(locations)`. I then grab the json object, and sift through to receive the information I want. I'm grabbing the following information:
- business id (a unique id we can use to search Yelp later)
- business name
- price category (options are \$, \$\$, \$\$\$, \$\$\$\$)
- category type (I'm thinking this will help categorize the type of food served, and help determine saturation)
- closed (returns true if closed)
- address, city, zip to cross-check with LA county data
- long, lat (for some nice graphics later)

In [10]:
url = 'https://api.yelp.com/v3/businesses/search'
headers = {'Authorization': 'Bearer {}'.format(bearer_token)}

In [11]:
current_df = rest_group_0_df

In [12]:
current_df.head()

Unnamed: 0,facility,address,city,zip_cd,num_at_address,max_seats,risk_level,last_inspect_date,inspect_score
0,#1 CAFE,2080 CENTURY PARK E,LOS ANGELES,90067,1,30,3,08/18/2016,90.0
1,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,LOS ANGELES,90044,1,30,3,08/10/2016,92.0
2,#1 DONUT,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,2,05/17/2016,98.0
3,#1 DONUTS,8509 S FIGUEROA ST,LOS ANGELES,90003,2,30,2,07/26/2016,91.0
4,#1. JUICY DUMPLING,140 W VALLEY BLVD,SAN GABRIEL,91776,12,60,3,11/02/2016,92.0


In [13]:
current_df['zip_cd'] = current_df['zip_cd'].astype('str')

In [14]:
terms = current_df['facility'].values
locations = (current_df['address'].values + ', ' + current_df['city'].values + ' ' 
             + current_df['zip_cd'].values) + ' CA'

In [15]:
term_loc = zip(terms, locations)

In [16]:
len(term_loc)

3219

### <span style="color:teal"> TESTING YELP API REQUEST - BUSINESSES </span>

In [102]:
location_test = ['3280 HELMS DR, CULVER CITY']
term_test = ['BUCATO']

In [103]:
test_term_loc = zip(location_test, term_test)

In [104]:
test_term_loc

[('3280 HELMS DR, CULVER CITY', 'BUCATO')]

In [109]:
r_test_dict = {}
unsearchables = []

for i in test_term_loc:
    url_params = {'location': i[0],
                  'term': i[1],
                  'limit': 50,
                  'locale': 'en_US',
                  'distance': 'sort_by'}
    try:
        r_test_dict[i] = dict(requests.request('GET', url, headers=headers, params=url_params).json())['businesses']
    except:
        unsearchables.append(url_params)

In [111]:
rest_test_dict_format

{u'ago-restaurant-west-hollywood': ['3280 HELMS DR, CULVER CITY',
  'BUCATO',
  u'ago-restaurant-west-hollywood',
  u'AGO Restaurant',
  u'$$$',
  u'italian',
  u'italian',
  False,
  u'8478 Melrose Ave',
  u'West Hollywood',
  u'90069',
  34.082187,
  -118.375556],
 u'angelini-osteria-los-angeles': ['3280 HELMS DR, CULVER CITY',
  'BUCATO',
  u'angelini-osteria-los-angeles',
  u'Angelini Osteria',
  u'$$$',
  u'italian',
  u'seafood',
  False,
  u'7313 Beverly Blvd',
  u'Los Angeles',
  u'90036',
  34.07642,
  -118.34909],
 u'brunello-trattoria-culver-city': ['3280 HELMS DR, CULVER CITY',
  'BUCATO',
  u'brunello-trattoria-culver-city',
  u'Brunello Trattoria',
  u'$$',
  u'italian',
  u'pizza',
  False,
  u'6001 Washington Blvd',
  u'Culver City',
  u'90232',
  34.03203,
  -118.37636],
 u'centanni-trattoria-venice': ['3280 HELMS DR, CULVER CITY',
  'BUCATO',
  u'centanni-trattoria-venice',
  u'Centanni Trattoria',
  u'$$',
  u'italian',
  u'italian',
  False,
  u'1700 Lincoln Blvd',


In [110]:
rest_test_dict_format = {}

for rest, value in r_test_dict.items():
    search_name = rest[0]
    search_address = rest[1].split(',')[0]
    for i, x in enumerate(value):
        max_items = i + 1
    for i in range(max_items):
        values = value[i]
        bus_id = values['id']
        name = values['name']
        try:
            price = values['price']
        except:
            price = None
        cat_1 = values['categories'][0]['alias']
        try:
            cat_2 = values['categories'][1]['alias']
        except:
            cat_2 = cat_1
        closed = values['is_closed']
        address = values['location']['address1']
        city = values['location']['city']
        zip_code = values['location']['zip_code']
        latitude = values['coordinates']['latitude']
        longitude = values['coordinates']['longitude']
        indiv_rest_list = [search_name, search_address, bus_id, name, 
                           price, cat_1, cat_2, closed, address, city, 
                           zip_code, latitude, longitude]
        rest_test_dict_format[bus_id] = indiv_rest_list

In [27]:
for key, attribute in rest_test_dict_format.items():
    search_address = attribute[1]
    yelp_address = attribute[8]
    split_address = yelp_address.split()
    perc_matches = []
    for word in split_address:
        word = word.upper()
        if word in search_address:
            perc_matches.append(1./len(split_address))
    total_match = sum(perc_matches)
    attribute.append(total_match)

In [28]:
yelp_test_df = pd.DataFrame(rest_test_dict_format.values(), index = rest_test_dict_format.keys(), 
                       columns = ['search_name', 'search_address', 'bus_id', 'name', 
                                  'price', 'cat_1', 'cat_2', 'closed','address', 
                                  'city', 'zip_code', 'latitude', 'longitude', 'match_perct'])

In [29]:
yelp_test_best_match_df = yelp_test_df.groupby(['search_name', 'search_address'], 
                                               as_index=False).agg({'match_perct': 'max'})
yelp_test_best_match_df

Unnamed: 0,search_name,search_address,match_perct
0,"659 S MARIPOSA AVE, LOS ANGELES 90010",BRASS MONKEY INC,0.333333


In [30]:
yelp_test_final_df = pd.merge(yelp_test_best_match_df, yelp_test_df)

In [31]:
yelp_test_final_df

Unnamed: 0,search_name,search_address,match_perct,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude
0,"659 S MARIPOSA AVE, LOS ANGELES 90010",BRASS MONKEY INC,0.333333,pizza-king-council-bluffs,Pizza King,$$,pizza,seafood,False,1101 N Broadway,Council Bluffs,51503,41.269813,-95.83631


### <span style="color:teal"> ACTUAL YELP API REQUEST - BUSINESSES </span>

In [17]:
r_dict = {}
unsearchables = []

for i in term_loc:
    url_params = {'location': i[1],
                  'term': i[0],
                  'limit': 10,
                  'locale': 'en_US'}
    try:
        r_dict[i] = dict(requests.request('GET', url, headers=headers, params=url_params).json())['businesses']
    except:
        unsearchables.append(url_params)

In [18]:
yp_search_failures = []
for key, value in r_dict.items():
    if value == []:
        yp_search_failures.append(key)
        r_dict.pop(key, value)

In [19]:
# joblib.dump(r_dict, 'data/yelp_search_dict.pkl')

['data/yelp_search_dict.pkl']

In [18]:
r_dict = joblib.load('data/yelp_search_dict.pkl')

In [19]:
rest_dict_format = {}

for rest, value in r_dict.items():
    search_name = rest[0]
    search_address = rest[1].split(',')[0]
    for i, x in enumerate(value):
        max_items = i + 1
    for i in range(max_items):
        values = value[i]
        bus_id = values['id']
        name = values['name']
        try:
            price = values['price']
        except:
            price = None
        try:    
            cat_1 = values['categories'][0]['alias']
        except:
            cat_1 = None
        try:
            cat_2 = values['categories'][1]['alias']
        except:
            cat_2 = cat_1
        closed = values['is_closed']
        address = values['location']['address1']
        city = values['location']['city']
        zip_code = values['location']['zip_code']
        latitude = values['coordinates']['latitude']
        longitude = values['coordinates']['longitude']
        indiv_rest_list = [search_name, search_address, bus_id, name, 
                           price, cat_1, cat_2, closed, address, city, 
                           zip_code, latitude, longitude]
        rest_dict_format[bus_id] = indiv_rest_list

In [20]:
unknown_addresses = {}

for key, attribute in rest_dict_format.items():
    search_address = attribute[1]
    yelp_address = attribute[8]
    try:
        split_address = yelp_address.split()
        perc_matches = []
        for word in split_address:
            word = word.upper()
            if word in search_address and len(word) > 1:
                perc_matches.append(1./len(split_address))
        total_match = sum(perc_matches)
        attribute.append(total_match)
    except:
        unknown_addresses[key] = attribute

In [21]:
yelp_df = pd.DataFrame(rest_dict_format.values(), index = rest_dict_format.keys(), 
                       columns = ['search_name', 'search_address', 'bus_id', 'name', 
                                  'price', 'cat_1', 'cat_2', 'closed','address', 
                                  'city', 'zip_code', 'latitude', 'longitude', 'match_perct'])

In [22]:
yelp_best_match_df = yelp_df.groupby(['search_name', 'search_address'], 
                                               as_index=False).agg({'match_perct': 'max'})
yelp_best_match_df.head()

Unnamed: 0,search_name,search_address,match_perct
0,#1 CAFE,2080 CENTURY PARK E,0.75
1,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.25
2,#1 DONUT,8509 S FIGUEROA ST,0.333333
3,#1 DONUTS,8509 S FIGUEROA ST,0.25
4,#1. JUICY DUMPLING,140 W VALLEY BLVD,0.75


In [23]:
yelp_final_df = pd.merge(yelp_best_match_df, yelp_df)

In [24]:
this_string = ['s', 'so', 'so', 'here']
that_string = 'here is my string'
for word in this_string:
    if word in that_string:
        print word
    

s
here


In [25]:
yelp_final_df

Unnamed: 0,search_name,search_address,match_perct,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude
0,#1 CAFE,2080 CENTURY PARK E,0.750000,one-cafe-los-angeles-2,One Cafe,$$,cafes,cafes,False,2080 Century Park E,Los Angeles,90067,34.058765,-118.412033
1,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,louisiana-fried-chicken-los-angeles-6,Louisiana Fried Chicken,$,cajun,hotdogs,False,1401 W Manchester Ave,Los Angeles,90047,33.960312,-118.300644
2,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,china-bowl-los-angeles-2,China Bowl,$,chinese,chinese,False,1810 W Slauson Ave,Los Angeles,90047,33.988170,-118.310040
3,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,louisiana-famous-fried-chicken-los-angeles-11,Louisiana Famous Fried Chicken,$,hotdogs,chickenshop,False,4400 S Central Ave,Los Angeles,90011,34.003772,-118.256118
4,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,chus-garden-inglewood,Chu's Garden,$,chinese,chinese,False,1101 S Prairie Ave,Inglewood,90301,33.948760,-118.344150
5,#1 DONUT,8509 S FIGUEROA ST,0.333333,number-one-delicious-donut-canoga-park,Number One Delicious Donut,$,donuts,donuts,False,20103 Saticoy St,Canoga Park,91306,34.208911,-118.571687
6,#1 DONUT,8509 S FIGUEROA ST,0.333333,u-s-1-donut-shop-north-hollywood,U S 1 Donut Shop,$,donuts,donuts,True,12510 Vanowen St,North Hollywood,91605,34.193610,-118.405630
7,#1 DONUT,8509 S FIGUEROA ST,0.333333,yum-rich-donut-shop-no-1-winnetka,Yum-Rich Donut Shop No 1,$,donuts,donuts,False,20129 Vanowen St,Winnetka,91306,34.194191,-118.571884
8,#1 DONUTS,8509 S FIGUEROA ST,0.250000,m-and-t-donuts-los-angeles,M & T Donuts,$,donuts,donuts,False,1601 E 103rd St,Los Angeles,90002,33.943750,-118.245540
9,#1. JUICY DUMPLING,140 W VALLEY BLVD,0.750000,long-xing-ji-san-gabriel-2,Long Xing Ji,$$,shanghainese,shanghainese,False,140 W Valley Blvd,San Gabriel,91776,34.078164,-118.102018


# old stuff

In [19]:
rest_dict_format = {}

for rest in r_dict.items():
    search_name = rest[0][0]
    search_address = rest[0][1].split(',')[0]
    values = rest[1][0]
    bus_id = values['id']
    name = values['name']
    try:
        price = values['price']
    except:
        price = None
    cat_1 = values['categories'][0]['alias']
    try:
        cat_2 = values['categories'][1]['alias']
    except:
        cat_2 = cat_1
    closed = values['is_closed']
    address = values['location']['address1']
    city = values['location']['city']
    zip_code = values['location']['zip_code']
    latitude = values['coordinates']['latitude']
    longitude = values['coordinates']['longitude']
    indiv_rest_list = [search_name, search_address, bus_id, name, 
                       price, cat_1, cat_2, closed, address, city, 
                       zip_code, latitude, longitude]
    rest_dict_format[bus_id] = indiv_rest_list

In [35]:
yelp_df = pd.DataFrame(rest_dict_format.values(), index = rest_dict_format.keys(), 
                       columns = ['search_name', 'search_address', 'bus_id', 'name', 
                                  'price', 'cat_1', 'cat_2', 'closed','address', 
                                  'city', 'zip_code', 'latitude', 'longitude'])

AssertionError: 13 columns passed, passed data had 15 columns

In [21]:
yelp_df.head()

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude
edibol-los-angeles,BOL,300 S SANTA FE AVE,edibol-los-angeles,ediBOL,$$,newamerican,breakfast_brunch,False,300 S Santa Fe Ave,Los Angeles,90013,34.044875,-118.232614
jims-fallbrook-market-woodland-hills,ALBERTSONS DELI,22840 VICTORY BLVD,jims-fallbrook-market-woodland-hills,Jim's Fallbrook Market,$$,grocery,seafoodmarkets,False,5947 Fallbrook Ave,Woodland Hills,91367,34.17877,-118.62371
brew-you-los-angeles,BREW YOU COFFEE,808 S WESTERN AVE,brew-you-los-angeles,Brew You,$,coffee,icecream,False,808 S Western Ave,Los Angeles,90005,34.05699,-118.30869
bottlefish-los-angeles-2,BOTTLE FISH,11677 SAN VICENTE BLVD,bottlefish-los-angeles-2,Bottlefish,$$$,bars,seafood,False,11677 San Vicente Blvd,Los Angeles,90049,34.054,-118.464759
blue-moon-lounge-montrose,BLUE MOON LOUNGE,3509 N VERDUGO RD,blue-moon-lounge-montrose,Blue Moon Lounge,$,lounges,sportsbars,False,3509 N Verdugo Rd,Montrose,91208,34.202984,-118.22686


In [22]:
joblib.dump(yelp_df, 'data/yelp_df.pkl')

['data/yelp_df.pkl']

## <span style="color:teal"> YELP SEARCH ERRORS </span>

Number of Yelp searches that did not produce any results

In [23]:
len(yp_search_failures)

274

In [24]:
yelp_df = joblib.load('data/yelp_df.pkl')

Naive matching percentages

In [31]:
yelp_final_df

Unnamed: 0,search_name,search_address,match_perct,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude,name_match,address_match,name_address_match
0,#1 CAFE,2080 CENTURY PARK E,0.750000,one-cafe-los-angeles-2,One Cafe,$$,cafes,cafes,False,2080 Century Park E,Los Angeles,90067,34.058765,-118.412033,,,
1,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,louisiana-fried-chicken-los-angeles-6,Louisiana Fried Chicken,$,cajun,hotdogs,False,1401 W Manchester Ave,Los Angeles,90047,33.960312,-118.300644,,,
2,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,china-bowl-los-angeles-2,China Bowl,$,chinese,chinese,False,1810 W Slauson Ave,Los Angeles,90047,33.988170,-118.310040,,,
3,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,louisiana-famous-fried-chicken-los-angeles-11,Louisiana Famous Fried Chicken,$,hotdogs,chickenshop,False,4400 S Central Ave,Los Angeles,90011,34.003772,-118.256118,,,
4,#1 CHINESE FAST FOOD,8606 S VERMONT AVE,0.250000,chus-garden-inglewood,Chu's Garden,$,chinese,chinese,False,1101 S Prairie Ave,Inglewood,90301,33.948760,-118.344150,,,
5,#1 DONUT,8509 S FIGUEROA ST,0.333333,number-one-delicious-donut-canoga-park,Number One Delicious Donut,$,donuts,donuts,False,20103 Saticoy St,Canoga Park,91306,34.208911,-118.571687,,,
6,#1 DONUT,8509 S FIGUEROA ST,0.333333,u-s-1-donut-shop-north-hollywood,U S 1 Donut Shop,$,donuts,donuts,True,12510 Vanowen St,North Hollywood,91605,34.193610,-118.405630,,,
7,#1 DONUT,8509 S FIGUEROA ST,0.333333,yum-rich-donut-shop-no-1-winnetka,Yum-Rich Donut Shop No 1,$,donuts,donuts,False,20129 Vanowen St,Winnetka,91306,34.194191,-118.571884,,,
8,#1 DONUTS,8509 S FIGUEROA ST,0.250000,m-and-t-donuts-los-angeles,M & T Donuts,$,donuts,donuts,False,1601 E 103rd St,Los Angeles,90002,33.943750,-118.245540,,,
9,#1. JUICY DUMPLING,140 W VALLEY BLVD,0.750000,long-xing-ji-san-gabriel-2,Long Xing Ji,$$,shanghainese,shanghainese,False,140 W Valley Blvd,San Gabriel,91776,34.078164,-118.102018,,,


In [32]:
def compare_search(x, y):
    try:
        x = x.lower()
        y = y.lower()
    except:
        x = x
        y = y
    if x == y:
        return 1
    else:
        return 0
    
yelp_final_df['name_match'] = yelp_final_df.apply(lambda col: compare_search(col['search_name'],
                                                                 col['name']), axis = 1)

yelp_final_df['address_match'] = yelp_final_df.apply(lambda col: compare_search(col['search_address'],
                                                                    col['address']), axis = 1)

In [33]:
yelp_final_df['name_address_match'] = yelp_final_df['name_match'] + yelp_final_df['address_match']

In [34]:
name_match = sum(yelp_final_df['name_match'])
address_match = sum(yelp_final_df['address_match'])
name_address_match = sum(yelp_final_df[yelp_final_df['name_address_match'] == 2]['name_address_match'])

In [35]:
print 'name matches: {} ({}% of the data with valid searches)'.format(name_match, 
                                                                      round((name_match*100.)/
                                                                      len(yelp_final_df['name_match']),2))
print 'address matches: {} ({}%)'.format(address_match, 
                                         round((address_match*100.)/
                                         len(yelp_final_df['address_match']),2))
print 'name and address matches: {} ({}%)'.format(name_address_match, 
                                         round((name_address_match*100.)/
                                         len(yelp_final_df['name_address_match']),2))

name matches: 666 (17.39% of the data with valid searches)
address matches: 1006 (26.27%)
name and address matches: 872 (22.77%)


In [29]:
yelp_df['name'] = yelp_df['name'].str.replace('&', 'and', case = False)
yelp_df['search_name'] = yelp_df['search_name'].str.replace('&', 'and', case = False)

In [30]:
print 'name matches: {} ({}% of the data with valid searches)'.format(name_match, 
                                                                      round((name_match*100.)/
                                                                      len(yelp_df['name_match']),2))
print 'address matches: {} ({}%)'.format(address_match, 
                                         round((address_match*100.)/
                                         len(yelp_df['address_match']),2))
print 'name and address matches: {} ({}%)'.format(name_address_match, 
                                         round((name_address_match*100.)/
                                         len(yelp_df['name_address_match']),2))

name matches: 829 (38.19% of the data with valid searches)
address matches: 1112 (51.22%)
name and address matches: 1116 (51.4%)


In [35]:
yelp_df

Unnamed: 0,search_name,search_address,bus_id,name,price,cat_1,cat_2,closed,address,city,zip_code,latitude,longitude,name_match,address_match,name_address_match
edibol-los-angeles,BOL,300 S SANTA FE AVE,edibol-los-angeles,ediBOL,$$,newamerican,breakfast_brunch,False,300 S Santa Fe Ave,Los Angeles,90013,34.044875,-118.232614,0,1,1
jims-fallbrook-market-woodland-hills,ALBERTSONS DELI,22840 VICTORY BLVD,jims-fallbrook-market-woodland-hills,Jim's Fallbrook Market,$$,grocery,seafoodmarkets,False,5947 Fallbrook Ave,Woodland Hills,91367,34.178770,-118.623710,0,0,0
brew-you-los-angeles,BREW YOU COFFEE,808 S WESTERN AVE,brew-you-los-angeles,Brew You,$,coffee,icecream,False,808 S Western Ave,Los Angeles,90005,34.056990,-118.308690,0,1,1
bottlefish-los-angeles-2,BOTTLE FISH,11677 SAN VICENTE BLVD,bottlefish-los-angeles-2,Bottlefish,$$$,bars,seafood,False,11677 San Vicente Blvd,Los Angeles,90049,34.054000,-118.464759,0,1,1
blue-moon-lounge-montrose,BLUE MOON LOUNGE,3509 N VERDUGO RD,blue-moon-lounge-montrose,Blue Moon Lounge,$,lounges,sportsbars,False,3509 N Verdugo Rd,Montrose,91208,34.202984,-118.226860,1,1,2
brü-haus-los-angeles-2,BRU HAUS,11831 WILSHIRE BLVD,brü-haus-los-angeles-2,BRÜ HAUS,$$,gastropubs,lounges,False,11831 Wilshire Blvd,Los Angeles,90025,34.047770,-118.463540,0,1,1
batterfish-los-angeles,BATTERFISH,16200 VENTURA BLVD,batterfish-los-angeles,BatterFish,$,seafood,fishnchips,True,16200 Ventura Blvd,Los Angeles,91436,34.156151,-118.485573,1,1,2
blizz-frozen-yogurt-sherman-oaks,BLIZZ YOGURT,14439 BUBANK BLVD,blizz-frozen-yogurt-sherman-oaks,Blizz Frozen Yogurt,$,icecream,juicebars,False,14439 Burbank Blvd,Sherman Oaks,91401,34.172135,-118.447564,0,0,0
710-grille-san-pedro,710 GRILLE,710 S WEYMOUTH AVE,710-grille-san-pedro,710 Grille,$$,burgers,salad,True,710 S Weymouth Ave,San Pedro,90732,33.737278,-118.309494,1,1,2
cinco-de-mayo-culver-city-2,5 DE MAYO,11204 WASHINGTON PL,cinco-de-mayo-culver-city-2,Cinco De Mayo,$,mexican,mexican,False,11204 Washington Pl,Culver City,90230,34.008403,-118.414024,0,1,1


## <span style="color:teal"> YELP API REQUEST - REVIEWS </span>

### III. Get Yelp data using Business ID and Requests

Using Yelp's requests, I'm going to search by `bus_id`, already gathered from the Yelp API in step I. I'll sort on reviews (that's the `'?sort_by=date_desc'` portion of the query), so I can gather the first and last 20 reviews.

In [71]:
recent_ratings = {}
for bus_id in yelp_df['bus_id']:
    values = []
    values.append(bus_id)
    url_recent = 'https://www.yelp.com/biz/{}?sort_by=date_desc'.format(bus_id)
    rqst = requests.request('GET', url_recent, headers=headers)
    content = rqst.content
    beg_review = content.split('<script type="application/ld+json">')[1].split(', "servesCuisine":')[0] + '}'
    json_reviews = json.loads(beg_review)
    review_rating = json_reviews['aggregateRating']['ratingValue']
    review_count = json_reviews['aggregateRating']['reviewCount']
    values.append(review_rating)
    for review in json_reviews['review']:
        values.append(review['author'])
        values.append(review['datePublished'])
        values.append(review['reviewRating']['ratingValue'])
    recent_ratings[bus_id] = values

In [72]:
columns_tuples = [('author_'+str(i), 'post_date_'+str(i), 'author_rating_'+str(i)) for i in range(20)]
individual_columns = [col for tup in columns_tuples for col in tup]

In [73]:
columns = ['bus_id', 'avg_rating'] + individual_columns
new_reviews_df = pd.DataFrame(recent_ratings.values(), 
                          index = recent_ratings.keys(), 
                          columns = columns)

In [74]:
oldest_ratings = {}
for bus_id in yelp_df['bus_id']:
    values = []
    values.append(bus_id)
    url_recent = 'https://www.yelp.com/biz/{}?sort_by=date_asc'.format(bus_id)
    rqst = requests.request('GET', url_recent, headers=headers)
    content = rqst.content
    beg_review = content.split('<script type="application/ld+json">')[1].split(', "servesCuisine":')[0] + '}'
    json_reviews = json.loads(beg_review)
    values.append(review_rating)
    for review in json_reviews['review']:
        values.append(review['author'])
        values.append(review['datePublished'])
        values.append(review['reviewRating']['ratingValue'])
    oldest_ratings[bus_id] = values

In [75]:
columns_tuples = [('author_'+str(i), 'post_date_'+str(i), 'author_rating_'+str(i)) for i in range(20,40)]
individual_columns = [col for tup in columns_tuples for col in tup]

In [76]:
columns = ['bus_id', 'avg_rating'] + individual_columns
old_reviews_df = pd.DataFrame(oldest_ratings.values(), 
                          index = oldest_ratings.keys(), 
                          columns = columns)

In [41]:
reviews_df = pd.merge(new_reviews_df, old_reviews_df, on='bus_id')

Unnamed: 0,bus_id,avg_rating_x,author_0,post_date_0,author_rating_0,author_1,post_date_1,author_rating_1,author_2,post_date_2,...,author_rating_36,author_37,post_date_37,author_rating_37,author_38,post_date_38,author_rating_38,author_39,post_date_39,author_rating_39
0,uncle-john-bbq-grill-and-poke-sushi-downey,4.614458,Weso D.,2017-02-18,5,Eddie E.,2017-02-18,5,Star C.,2017-02-14,...,5.0,Shin-woo Y.,2016-05-05,5.0,Naomi P.,2016-05-11,4.0,Ari P.,2016-05-11,5.0
1,midori-japanese-restaurant-costa-mesa,3.5,John B.,2007-03-06,2,Jennifer S.,2007-02-27,5,,,...,,,,,,,,,,
2,zuzu-chicken-glendora,4.160448,Puni A.,2017-02-10,5,Julie G.,2017-02-05,4,Jennifer A.,2017-02-01,...,4.0,Shanda T.,2010-05-25,4.0,Kianna F.,2010-06-24,3.0,Wanda W.,2010-07-02,5.0
3,tacos-la-potranka-watts-2,4.428571,Carlos P.,2016-08-06,3,Tania O.,2016-05-27,5,Angel Z.,2016-01-23,...,,,,,,,,,,
4,icuisine-pomona,3.874214,Noel M.,2017-02-04,1,Jessica C.,2017-01-23,5,Tina T.,2017-01-19,...,4.0,M i.,2013-10-09,2.0,Andrew M.,2013-11-26,5.0,Charlie C.,2013-12-12,5.0


"Yelp's default sort order shows reviews that help consumers make informed decisions. The order is determined by recency, user voting, and other review quality factors, which is why an older review may appear before a newer one. To personalize the experience for each individual user, we'll favor reviews from your friends and the users you follow. You do have the option to sort the reviews in a few other ways: by date, star rating, and those written by Elites."

In [44]:
yelp_data_reviews_df = pd.merge(yelp_df, reviews_df, on='bus_id')

In [45]:
yelp_data_reviews_df

Unnamed: 0,bus_id,name,price,avg_rating_x,review_count,cat_1,cat_2,closed,address,city,...,author_rating_36,author_37,post_date_37,author_rating_37,author_38,post_date_38,author_rating_38,author_39,post_date_39,author_rating_39
0,uncle-john-bbq-grill-and-poke-sushi-downey,Uncle John Bbq Grill & Poke Sushi,$,4.5,83,sushi,bbq,False,9952 Lakewood Blvd,Downey,...,5.0,G L.,2016-11-18,5.0,William A.,2016-11-11,5.0,William R.,2016-11-11,5.0
1,midori-japanese-restaurant-costa-mesa,Midori Japanese Restaurant,$$,3.5,2,japanese,japanese,True,2969 Fairview Rd,Costa Mesa,...,,,,,,,,,,
2,zuzu-chicken-glendora,Zuzu Chicken,$,4.0,268,greek,mediterranean,False,1808 E Rt 66,Glendora,...,2.0,Jesse V.,2016-09-03,1.0,Edgar V.,2016-08-30,5.0,E. C.,2016-08-20,5.0
3,icuisine-pomona,iCuisine,$$,4.0,159,persian,cafes,False,3131 N Garey Ave,Pomona,...,5.0,Aria K.,2016-11-01,2.0,Ali M.,2016-10-17,5.0,Alaleh K.,2016-10-16,1.0
4,tacos-la-potranka-watts-2,Tacos La Potranka,$,4.5,7,mexican,foodtrucks,False,2500 E 115th Pl,Watts,...,,,,,,,,,,


In [119]:
url = 'https://api.yelp.com/v3/businesses/search/phone'
headers = {'Authorization': 'Bearer {}'.format(bearer_token)}

In [129]:
phone_nums = ['+13108760286', '+12134484416']

In [135]:
r_test_dict = {}
unsearchables = []

for i in phone_nums:
    try:
        print requests.request('GET', url, headers=headers, params={'phone': i}).json()
    except:
        unsearchables.append(i)

{u'total': 1, u'businesses': [{u'rating': 4.0, u'is_closed': True, u'review_count': 442, u'name': u'Bucato', u'url': u'https://www.yelp.com/biz/bucato-culver-city?adjust_creative=BJ8h7Y2C0GFj40Qr6iy95g&utm_campaign=yelp_api_v3&utm_medium=api_v3_phone_search&utm_source=BJ8h7Y2C0GFj40Qr6iy95g', u'price': u'$$$', u'coordinates': {u'latitude': 34.0299262734668, u'longitude': -118.383927742011}, u'phone': u'+13108760286', u'image_url': u'https://s3-media3.fl.yelpcdn.com/bphoto/QF0KfwjmIbIykliGQzE02w/o.jpg', u'location': {u'city': u'Culver City', u'display_address': [u'3280 Helms Ave', u'Culver City, CA 90034'], u'country': u'US', u'address2': u'', u'address3': u'', u'state': u'CA', u'address1': u'3280 Helms Ave', u'zip_code': u'90034'}, u'display_phone': u'(310) 876-0286', u'id': u'bucato-culver-city', u'categories': [{u'alias': u'italian', u'title': u'Italian'}]}]}
{u'total': 0, u'businesses': []}


In [131]:
unsearchables

['+13108760286', '+12134484416']

In [134]:
requests.request('GET', url, headers=headers, params={'phone': '+12134484416'})

<Response [200]>