In [None]:
import json
import pandas as pd
import numpy as np

# Locations of the dataset files, relative to the notebook's working directory.
# Each file is read line by line as one JSON document per line (see load_part).
PATH_BUSINESS = "./yelp_academic_dataset_business.json"
PATH_CHECKIN = "./yelp_academic_dataset_checkin.json"
PATH_REVIEW = "./yelp_academic_dataset_review.json"
PATH_TIP = "./yelp_academic_dataset_tip.json"
PATH_USER = "./yelp_academic_dataset_user.json"

In [None]:
def load_part(path, n_lines=100000, func=lambda item, args: True, args=None):
    """Load up to ``n_lines`` records from a JSON-lines file, keeping those ``func`` accepts.

    Parameters
    ----------
    path : str
        File holding one UTF-8 encoded JSON document per line.
    n_lines : int
        Maximum number of lines to read from the start of the file.
        Zero or a negative value reads nothing.
    func : callable
        Predicate invoked as ``func(item, args)``; the decoded item is kept
        when the result is truthy. The default keeps every item.
    args : object
        Passed through unchanged as the second argument of ``func``.

    Returns
    -------
    list
        The accepted items, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    print("Now loading", path)
    data = []
    # Report progress about five times per run. Clamp the step to 1 so that
    # n_lines < 5 does not turn the modulus below into a division by zero.
    step = max(n_lines // 5, 1)
    with open(path, "rb") as f:
        for i, line in enumerate(f):
            # Check the limit *before* processing so exactly n_lines lines
            # are read (checking afterwards would read one line too many).
            if i >= n_lines:
                break
            # https://stackoverflow.com/a/26128151
            byte_to_str = line.decode('utf8')
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if byte_to_str.strip():
                item = json.loads(byte_to_str)
                if func(item, args):
                    data.append(item)

            n_read = i + 1
            if n_read % step == 0 or n_read == n_lines:
                print(f"Processed {int(n_read / n_lines * 100)}% ({n_read}/{n_lines}), {len(data)} entries so far")
    print(f"Loaded {len(data)} entries")
    return data

In [None]:
def coord_distance(coord_a, coord_b):
    """Great-circle distance in miles between two points.

    Parameters
    ----------
    coord_a, coord_b : pair
        ``(latitude, longitude)`` in degrees.

    Returns
    -------
    float
        Distance along a sphere of radius 3956 miles.
    """
    # haversine formula
    lat_a, lon_a = np.deg2rad(coord_a)
    lat_b, lon_b = np.deg2rad(coord_b)
    d_lon = lon_b - lon_a
    d_lat = lat_b - lat_a
    a = np.sin(d_lat / 2)**2 + np.cos(lat_a)*np.cos(lat_b)*np.sin(d_lon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 3956  # miles
    return c * r

In [None]:
def fn_distance(item, args):
    """Filter predicate: is the item's location closer than a radius to a centre?

    ``args`` is ``(center, radius)``; ``center`` is a coordinate pair handed to
    ``coord_distance`` and ``radius`` is compared against its result.
    The item's position is read from its "latitude" and "longitude" keys.
    """
    origin, max_distance = args
    location = item["latitude"], item["longitude"]
    return coord_distance(origin, location) < max_distance

def fn_eq(item, args):
    """Filter predicate: does ``item[field]`` equal ``target``?

    ``args`` is the pair ``(field, target)``.
    """
    key, expected = args
    actual = item[key]
    return actual == expected

def fn_lt(item, args):
    """Filter predicate: is ``item[field]`` strictly less than ``target``?

    ``args`` is the pair ``(field, target)``.
    """
    key, upper_bound = args
    actual = item[key]
    return actual < upper_bound

def fn_gt(item, args):
    """Filter predicate: is ``item[field]`` strictly greater than ``target``?

    ``args`` is the pair ``(field, target)``.
    """
    key, lower_bound = args
    actual = item[key]
    return actual > lower_bound

def fn_has(item, args):
    """Filter predicate: does ``item[field]`` contain ``substring``, ignoring case?

    ``args`` is the pair ``(field, substring)``. A field whose value is
    ``None`` never matches. A missing field still raises ``KeyError``.
    """
    field, substring = args
    value = item[field]
    # JSON null decodes to None, which has no .lower(); treat it as "no match"
    # instead of aborting the whole load with an AttributeError.
    if value is None:
        return False
    return substring.lower() in value.lower()

In [None]:
# Reference points as (latitude, longitude) in degrees, which is the order
# coord_distance unpacks its arguments in.
COORD_TAMPA = (27.9517, -82.4588)
COORD_PHILADELPHIA = (39.9526, -75.1652)

# Example queries, kept for reference. Each one assigns `data`, so enable only one.
# The radius given to fn_distance is in miles, the unit coord_distance returns.
# data = load_part(PATH_REVIEW, n_lines=100)
# data = load_part(PATH_BUSINESS, n_lines=151000, func=fn_distance, args=(COORD_PHILADELPHIA, 2))
# data = load_part(PATH_BUSINESS, n_lines=151000, func=fn_eq, args=("state", "CA"))
# data = load_part(PATH_BUSINESS, n_lines=151000, func=fn_has, args=("name", "fish"))
# data = load_part(PATH_REVIEW, func=fn_gt, args=("date", "2018-10"))
# Active query. fn_lt compares with `<`, so string dates are ordered
# lexicographically; this assumes "date" values start with "YYYY-MM" (TODO confirm).
# n_lines is left at load_part's default, so only the start of the file is scanned.
data = load_part(PATH_REVIEW, func=fn_lt, args=("date", "2005-12"))

In [None]:
# Turn the list of loaded records into a DataFrame; the bare `df` as the
# cell's last expression makes the notebook display it.
df = pd.DataFrame(data)
df