In [1]:
import numpy as np
import pandas as pd
import ast
import json

In [2]:
# Load the raw Yelp export and discard every row that has a missing value.
# reset_index() is called without drop=True, so the pre-dropna row labels are
# kept as a regular column named "index".
dat = (
    pd.read_csv("yelp_boston.csv")
    .dropna()
    .reset_index()
)
print(dat.shape)

(266, 12)


In [3]:
# Show the distinct neighborhood labels present after the dropna() filter.
dat["neighborhood"].unique()

array(['Financial District', 'North End', 'Waterfront', 'East Boston',
       'Downtown', 'South End', 'Beacon Hill', 'Back Bay', 'South Boston',
       'Chinatown', 'Allston/Brighton', 'Charlestown',
       'Kendall Square/MIT', 'Dorchester', 'Teele Square',
       'Jamaica Plain', 'Inman Square', 'Harvard Square', 'Fenway',
       'Mission Hill', 'Porter Square', 'North Cambridge', 'West Roxbury',
       'Coolidge Corner'], dtype=object)

In [4]:
# Number of distinct search categories, followed by the row count per category.
# dropna=False makes nunique() agree with len(unique()), which counts a missing
# value as its own entry.
search_categories = dat["search category"]
print(search_categories.nunique(dropna=False))
search_categories.value_counts()

18


search category
pizza            20
newamerican      17
sandwiches       17
italian          17
japanese         17
restaurants      16
mexican          16
vietnamese       16
bakeries         16
chinese          16
coffee           16
sushi            16
cafes            16
indpak           12
french           11
thai              9
donuts            9
ethnicmarkets     9
Name: count, dtype: int64

In [5]:
def _category_names(raw):
    """Parse one categories_json cell and keep the first element of each entry.

    String cells are evaluated as Python literals; anything else is assumed to
    be an already-parsed sequence and is used as-is.
    """
    parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return [entry[0] for entry in parsed]


def _nth_or_none(items, position):
    """Return items[position], or None when the list is too short."""
    return items[position] if position < len(items) else None


dat['categories_json'] = dat['categories_json'].apply(_category_names)

# Widest list decides how many category<N> columns are created.
max_categories = max(dat['categories_json'].apply(len))
for position in range(max_categories):
    # Columns are numbered from 1; shorter lists are padded with None.
    dat[f'category{position + 1}'] = dat['categories_json'].apply(_nth_or_none, position=position)

# The list column is fully expanded now, so it is no longer needed.
dat = dat.drop(columns=['categories_json'])

In [6]:
# Collapse the frame to one row per restaurant name: the first row seen for a
# name supplies every non-search column, and the distinct search categories that
# name appeared under are spread across three fixed columns (extra categories
# beyond the third are discarded).
# NOTE(review): rows are keyed on `name` only, so different businesses that
# share a name would be merged into a single row -- confirm that is intended.
columns_except_category = [col for col in dat.columns if col != "search category"]
search_category_columns = ["search category1", "search category2", "search category3"]

rows = []
# sort=False keeps groups in order of first appearance, which matches the row
# order obtained by iterating over dat["name"].unique().
for _name, group in dat.groupby("name", sort=False):
    categories = group["search category"].unique().tolist()[:len(search_category_columns)]
    # Pad with None so every row has exactly one value per search-category column.
    categories += [None] * (len(search_category_columns) - len(categories))
    rows.append(group.iloc[0][columns_except_category].tolist() + categories)

# Build the frame in one go instead of appending a row at a time.
new_dat = pd.DataFrame(rows, columns=columns_except_category + search_category_columns)

dat = new_dat

In [7]:
# Flatten the three search-category columns into a single Series and count how
# often each category occurs across all of them.
category_counts = (
    dat.loc[:, ["search category1", "search category2", "search category3"]]
    .stack()
    .value_counts()
)
print(category_counts)

pizza            19
japanese         17
italian          17
newamerican      17
sandwiches       17
restaurants      16
bakeries         16
mexican          16
chinese          16
sushi            16
cafes            15
coffee           15
vietnamese       14
indpak           12
french           11
thai              9
ethnicmarkets     9
donuts            8
Name: count, dtype: int64


In [8]:
# Range of the rating column, then the distinct values it takes.
ratings = dat["rating"]
print(ratings.min())
print(ratings.max())
ratings.unique()

3.5
5.0


array([4.5, 4. , 5. , 3.5])

In [9]:
def _parse_location(raw):
    """Decode a JSON string; pass any non-string value through unchanged."""
    if isinstance(raw, str):
        return json.loads(raw)
    return raw


def _format_display_address(location):
    """Join the "display_address" parts with ", ", or return "" if unavailable."""
    if isinstance(location, dict) and "display_address" in location:
        return ", ".join(location["display_address"])
    return ""


dat["location_json"] = dat["location_json"].apply(_parse_location)
dat["display_address"] = dat["location_json"].apply(_format_display_address)
# Only the formatted address is kept; the parsed location column is removed
# from this same DataFrame object.
dat.drop(columns=["location_json"], inplace=True)

In [10]:
# Display the cleaned frame (one row per restaurant name) for a visual check.
dat

Unnamed: 0,index,name,url,review_count,rating,snippet_text,neighborhood,latitude,longitude,category1,category2,category3,category4,search category1,search category2,search category3,display_address
0,0,Wheelhouse,http://www.yelp.com/biz/wheelhouse-boston-3,101,4.5,"After going to Wheelhouse, you'll never order ...",Financial District,42.357926,-71.053962,Breakfast & Brunch,Burgers,Sandwiches,,restaurants,sandwiches,,"63 Broad St, Financial District, Boston, MA 02109"
1,1,Tenoch Mexican,http://www.yelp.com/biz/tenoch-mexican-boston,121,4.5,"So it's official, folks. We've finished our sa...",North End,42.363309,-71.051800,Mexican,,,,restaurants,mexican,,"3 Lewis St, North End, Boston, MA 02110"
2,2,O Ya,http://www.yelp.com/biz/o-ya-boston,450,4.5,My husband and I came here to celebrate our 20...,Waterfront,42.351408,-71.056867,Japanese,,,,restaurants,japanese,,"9 E St Pl, Waterfront, Boston, MA 02111"
3,3,Locale,http://www.yelp.com/biz/locale-boston,121,4.5,Truly thankful that we stumbled on Locale whil...,North End,42.365085,-71.053187,Italian,Pizza,,,restaurants,pizza,italian,"352 Hanover St, North End, Boston, MA 02113"
4,4,Neptune Oyster,http://www.yelp.com/biz/neptune-oyster-boston,2487,4.5,Best lobster roll on the whole trip..... and I...,North End,42.363618,-71.056012,Seafood,Live/Raw Food,,,restaurants,,,"63 Salem St, North End, Boston, MA 02113"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,336,South Boston Lithuanian Club,http://www.yelp.com/biz/south-boston-lithuania...,13,4.5,This place is delicious!! The thought of anyth...,South Boston,42.337900,-71.048910,Polish,Ukrainian,Ethnic Food,,ethnicmarkets,,,"368 W Broadway, South Boston, Boston, MA 02127"
213,337,Sate Asian Grill,http://www.yelp.com/biz/sate-asian-grill-boston-2,19,4.0,This place is excellent. I work right down the...,Waterfront,42.351807,-71.057484,Ethnic Food,Korean,,,ethnicmarkets,,,"89 S St, Waterfront, Boston, MA 02111"
214,339,The Shops at Porter,http://www.yelp.com/biz/the-shops-at-porter-ca...,79,4.0,This is like a small (chibi) version of the Ja...,Porter Square,42.387192,-71.118778,Japanese,Ethnic Food,Shopping,,ethnicmarkets,,,"University Hall, 1815 Massachusetts Ave, Porte..."
215,340,Bazaar International Gourmet,http://www.yelp.com/biz/bazaar-international-g...,52,4.0,"The Bazaar at 1432 Beacon Street, Brookline Ma...",Coolidge Corner,42.341065,-71.126470,Ethnic Food,Grocery,,,ethnicmarkets,,,"1432 Beacon St, Coolidge Corner, Brookline, MA..."


In [11]:
# List the final column names that will be written to the JSON export.
dat.columns

Index(['index', 'name', 'url', 'review_count', 'rating', 'snippet_text',
       'neighborhood', 'latitude', 'longitude', 'category1', 'category2',
       'category3', 'category4', 'search category1', 'search category2',
       'search category3', 'display_address'],
      dtype='object')

In [12]:
# Write the same record-oriented JSON to both locations: one copy next to the
# notebook and one under src/.
for output_path in ("yelp_boston_cleaned.json", "src/yelp_boston_cleaned.json"):
    dat.to_json(output_path, orient="records", indent=4)

In [14]:
# Extremes of review_count, then its quartiles (the 1.0 quantile is the max).
review_counts = dat["review_count"]
print(review_counts.min())
print(review_counts.max())
review_counts.quantile([0.25, 0.5, 0.75, 1.0])

7
3519


0.25      53.0
0.50     118.0
0.75     310.0
1.00    3519.0
Name: review_count, dtype: float64