In [6]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import json
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

## First, load the Yelp business dataset

In [11]:
datapath = '../data/'

# The business file is parsed one JSON document per line.
# `with` guarantees the handle is closed even if a line fails to parse, and
# the explicit UTF-8 encoding avoids depending on the platform default.
with open(datapath + 'yelp_academic_dataset_business.json', 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
    buslist = [json.loads(line) for line in f if line.strip()]

# Build the frame once from the list of records and key it by business_id.
dfy = pd.DataFrame(buslist).set_index(['business_id'])

## Narrow the dataset to cities in the Greater Toronto Area (GTA)

In [12]:
# Lower-case substrings used to pick city names out of the `city` column.
# NOTE(review): matching is by substring, so e.g. 'york' matches any city name
# containing "york"; the state == 'ON' filter below narrows the result.
gta_city_keywords = ['toronto', 'etobi', 'miss', 'york', 'scar']

# Scan the distinct city names once and keep each matching name a single time
# (the earlier version repeated the 'etobi' scan and could list a city twice).
toronto_cities = [city for city in dfy['city'].unique()
                  if any(kw in city.lower() for kw in gta_city_keywords)]

# Keep only Ontario businesses whose city is one of the matched names.
df_toronto = dfy[(dfy['city'].isin(toronto_cities)) & (dfy['state'] == 'ON')].copy()
df_toronto.shape

(24453, 14)

## Remove outliers that are outside of the expected long/lat range

Drop businesses whose latitude or longitude falls outside the 1st–99th percentile range.

In [13]:
# Percentile cut-offs (in percent) applied to both latitude and longitude.
LOC_BOUND_UP = 99
LOC_BOUND_DOWN = 1

print("before:", df_toronto.shape)

latitudes = df_toronto['latitude']
longitudes = df_toronto['longitude']

# Percentiles are computed on the non-null coordinates only.
lat_low = np.percentile(latitudes.dropna(), LOC_BOUND_DOWN)
lat_high = np.percentile(latitudes.dropna(), LOC_BOUND_UP)
lon_low = np.percentile(longitudes.dropna(), LOC_BOUND_DOWN)
lon_high = np.percentile(longitudes.dropna(), LOC_BOUND_UP)

# Inclusive bounding box; rows with a missing coordinate compare False and are dropped.
inside_box = (
    (latitudes <= lat_high)
    & (latitudes >= lat_low)
    & (longitudes <= lon_high)
    & (longitudes >= lon_low)
)

df_ll = df_toronto[inside_box].copy()
print("after:", df_ll.shape)

before: (24453, 14)
after: (23482, 14)


## Remove businesses not tagged as a restaurant, bar, or bakery

In [14]:
# Lower-case substrings looked for inside each business's `categories` string.
# NOTE(review): this is a substring match, so 'bars' also matches any category
# text that merely contains "bars" - confirm that is intended.
keywords = [
    'restaurant',
    'bars',
    'bakeries'
]

# Distinct non-null category strings, computed once instead of once per keyword.
unique_categories = df_ll['categories'].dropna().unique()

# build large list of lists: one sub-list of matching category strings per keyword
all_list = [[cat for cat in unique_categories if kw in cat.lower()]
            for kw in keywords]

# merge list of lists, dropping duplicates in linear time
# (the previous `item not in master_list` check was quadratic);
# element order does not matter for the isin() lookup below.
master_list = list(dict.fromkeys(cat for sublist in all_list for cat in sublist))

# Keep only businesses whose full category string matched at least one keyword.
dft = df_ll[df_ll['categories'].isin(master_list)].copy()
dft.shape

(10914, 14)

In [15]:
# Persist the filtered frame as a pickle file for later use.
# NOTE(review): the path is relative to the notebook's working directory - confirm 'univ/' exists.
output_path = 'univ/tor_rest.p'
dft.to_pickle(output_path)