In [1]:
import requests
import os
import json
import pandas as pd

In [2]:
from yelp_config import api_key

In [3]:
# Load the Yelp review export; the path is relative to the notebook's directory.
reviews_csv = '../data/yelp_academic_dataset_review.csv'
raw = pd.read_csv(reviews_csv)

In [4]:
# Discard every review row that has a missing value in any column
# (how='any' is dropna's default, so it is left implicit here).
raw = raw.dropna()

In [5]:
# Preview the first five cleaned review rows.
raw.head(n=5)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0.0,x7mDIiDB3jEiPGPHOmDzyw,2.0,The pizza was okay. Not the best I've had. I p...,0.0,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13,0.0,dDl8zu1vWPdKGihJrwQbpw,5.0,I love this place! My fiance And I go here atl...,0.0,msQe1u7Z_XuqjGoqhB0J5g
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23,1.0,LZp4UX5zK3e-c5ZGSeo3kA,1.0,Terrible. Dry corn bread. Rib tips were all fa...,3.0,msQe1u7Z_XuqjGoqhB0J5g
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25,0.0,Er4NBWCmCD4nM8_p1GRdow,2.0,Back in 2005-2007 this place was my FAVORITE t...,2.0,msQe1u7Z_XuqjGoqhB0J5g
4,Ums3gaP2qM3W1XcA5r6SsQ,0,2014-09-05,0.0,jsDu6QEJHbwP2Blom1PLCA,5.0,Delicious healthy food. The steak is amazing. ...,0.0,msQe1u7Z_XuqjGoqhB0J5g


In [6]:
# Per-business count of non-null values in every remaining column.
busgroup = raw.groupby(by='business_id').count()

In [7]:
# Order businesses from most to fewest reviews, using the 'text' count as the review count.
busgroup = busgroup.sort_values(by='text', ascending=False)

In [8]:
# Preview the five most-reviewed businesses.
busgroup.head(n=5)

Unnamed: 0_level_0,cool,date,funny,review_id,stars,text,useful,user_id
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4JNXUYY8wbaaDmk3BPzlWw,7968,7968,7968,7968,7968,7968,7968,7968
RESDUcs7fIiihp38-d6_6g,7861,7861,7861,7861,7861,7861,7861,7861
K7lWdNUhCbcnEvI0NhGewg,6447,6447,6447,6447,6447,6447,6447,6447
cYwJA2A6I12KNkm2rtXd5g,5472,5472,5472,5472,5472,5472,5472,5472
f4x1YBxkLrZg652xt2KR5g,5382,5382,5382,5382,5382,5382,5382,5382


In [9]:
# busgroup is already sorted by descending 'text' count, so its first 21 rows
# are the 21 most-reviewed businesses; keep only that count, renamed.
top_businesses = (
    busgroup[['text']]
    .head(21)
    .rename(columns={'text': 'review_count'})
)
top_businesses

Unnamed: 0_level_0,review_count
business_id,Unnamed: 1_level_1
4JNXUYY8wbaaDmk3BPzlWw,7968
RESDUcs7fIiihp38-d6_6g,7861
K7lWdNUhCbcnEvI0NhGewg,6447
cYwJA2A6I12KNkm2rtXd5g,5472
f4x1YBxkLrZg652xt2KR5g,5382
DkYS3arLOhA8si5uUEmHOw,4981
2weQS-RnoOBhb1KsHKyoSQ,4240
5LNZ67Yw9RD6nf4_UhXOjw,4097
ujHiaprwCQ5ewziu0Vi9rw,4089
iCQpiavjjPzJ5_3gPD5Ebg,4078


In [10]:
# Authorization header passed to requests.get when querying api.yelp.com.
headers = dict(Authorization=f'bearer {api_key}')
def bus_url(bid):
    """Return the Yelp v3 business-details URL for the business id ``bid``."""
    return f'https://api.yelp.com/v3/businesses/{bid}'

In [11]:
# Business details for the top businesses are fetched from the API once and
# cached on disk, so re-running the notebook does not repeat the requests.
jsonpath = '../data/top_21_businesses.json'
if os.path.exists(jsonpath):
    # Context manager so the cache file handle is closed after loading.
    with open(jsonpath) as readfile:
        top_business_json = json.load(readfile)
else:
    top_business_json = []
    # Only the index (the business ids) is needed, so iterate it directly.
    for bid in top_businesses.index:
        response = requests.get(bus_url(bid), headers=headers, timeout=30)
        # Fail fast on HTTP errors so an error payload is never cached as
        # if it were business data.
        response.raise_for_status()
        top_business_json.append(response.json())
    # Write the cache once, after every request has succeeded; an interrupted
    # run therefore cannot leave a partial file that a later run would load
    # as the complete result.
    with open(jsonpath, 'w') as writefile:
        json.dump(top_business_json, writefile)

In [40]:
# One row per API response; 'id' is renamed so the key matches the review data's 'business_id'.
top_data_df = pd.DataFrame(top_business_json).rename(columns={'id': 'business_id'})
list(top_data_df.columns)

['alias',
 'categories',
 'coordinates',
 'display_phone',
 'hours',
 'business_id',
 'image_url',
 'is_claimed',
 'is_closed',
 'location',
 'name',
 'phone',
 'photos',
 'price',
 'rating',
 'review_count',
 'transactions',
 'url']

In [41]:
# Keep only the business attributes used in the rest of the notebook.
kept_columns = ['business_id', 'name', 'price', 'rating', 'categories']
top_data_df = top_data_df[kept_columns]

In [42]:
# Reduce each business's list of category dicts to just the 'title' strings.
# .assign builds a new frame instead of writing a column into top_data_df,
# which at this point is a column subset of a larger frame; that avoids
# pandas' SettingWithCopyWarning for this assignment.
top_data_df = top_data_df.assign(
    categories=top_data_df['categories'].map(
        lambda cats: [cat['title'] for cat in cats]
    )
)

In [43]:
# Left-join the review counts (top_businesses is indexed by business_id)
# onto the business details via the 'business_id' column.
top_data_df = top_data_df.join(top_businesses, on='business_id', how='left')

In [44]:
# Preview the first five businesses with their attributes and review counts.
top_data_df.head(n=5)

Unnamed: 0,business_id,name,price,rating,categories,review_count
0,4JNXUYY8wbaaDmk3BPzlWw,Mon Ami Gabi,$$,4.0,"[French, Steakhouses, Breakfast & Brunch]",7968
1,RESDUcs7fIiihp38-d6_6g,Bacchanal Buffet,$$$,4.0,[Buffets],7861
2,K7lWdNUhCbcnEvI0NhGewg,Wicked Spoon,$$$,3.5,"[Buffets, Breakfast & Brunch]",6447
3,cYwJA2A6I12KNkm2rtXd5g,Gordon Ramsay BurGR,$$,4.0,"[Burgers, American (Traditional)]",5472
4,f4x1YBxkLrZg652xt2KR5g,Hash House A Go Go,$$,4.0,"[American (New), Breakfast & Brunch]",5382


In [45]:
# Keep only the reviews whose business_id appears in top_data_df.
in_top_businesses = raw['business_id'].isin(top_data_df['business_id'])
all_data_top_df = raw.loc[in_top_businesses]

In [46]:
# Attach the columns of top_data_df to every matching review row
# (inner join on 'business_id', which is merge's default).
with_names_df = pd.merge(all_data_top_df, top_data_df, on='business_id')

In [47]:
# Non-null values per column, to reveal which columns have gaps.
with_names_df.notna().sum()

business_id     94943
cool            94943
date            94943
funny           94943
review_id       94943
stars           94943
text            94943
useful          94943
user_id         94943
name            94943
price           91459
rating          94943
categories      94943
review_count    94943
dtype: int64

In [48]:
# Drop every row that still has a missing value in any column.
with_names_df = with_names_df.dropna()

In [49]:
# Re-check the non-null counts: every column should now report the same total.
with_names_df.notna().sum()

business_id     91459
cool            91459
date            91459
funny           91459
review_id       91459
stars           91459
text            91459
useful          91459
user_id         91459
name            91459
price           91459
rating          91459
categories      91459
review_count    91459
dtype: int64

In [50]:
# Persist the merged reviews. The file is tab-separated despite its .csv
# extension, and index=False keeps the row index out of the file.
with_names_df.to_csv(
    '../data/top_21_businesses_pre_vader.csv',
    sep='\t',
    index=False,
)

In [51]:
# List the columns that were written to the output file.
list(with_names_df.columns)

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id',
 'name',
 'price',
 'rating',
 'categories',
 'review_count']