In [37]:
import pandas as pd
import numpy as np
import json
import os
import time
from tqdm import tqdm

pd.set_option('display.max_columns', 30)

CITY = 'Los Angeles'
CACHED_BYSGNN_DATA_PATH = './data/data-0-400-400-Los Angeles.pkl'

In [28]:
class stat_collector:
    def __init__(self):
        self.parquet_file_count=0
        self.data_record_count = 0
        self.memory_usage_in_GB = 0		#gives an estimate of the total RAM usage if all files were read into memory at the same time.
        self.unique_device_count = 0
        self.avg_pos_acc = 0
        self.starting_time = time.process_time()
        self.elapsed_time = time.process_time()
        self.unique_geohash_count = 0

def load_poi_db(city):
    poi_folder = "/storage/dataset/poi_haowen/CoreRecords-CORE_POI-2019_03-2020-03-25/"
    poi_columns = ["safegraph_place_id", "parent_safegraph_place_id", "location_name", "safegraph_brand_ids", "brands",
                   "top_category", "sub_category", "naics_code", "latitude", "longitude", "street_address", "city",
                   "region", "postal_code", "iso_country_code", "phone_number", "open_hours", "category_tags"]
    files = os.listdir(poi_folder)


    poi_s = stat_collector()
    poi_db = pd.DataFrame(columns=poi_columns)
    for f in files:
        if f[-3:] == 'csv' and 'brand' not in f:
            print(f)
            df = pd.read_csv(poi_folder + f)
            df = df.loc[df['city']==city]
            poi_db = pd.concat([poi_db, df], ignore_index=True, sort=False)
            poi_s.memory_usage_in_GB += df.memory_usage(deep=True).sum() / 1000000000
            poi_s.data_record_count += df.shape[0]
            poi_s.parquet_file_count += 1
    return poi_db, poi_s

In [38]:
poi_db, poi_s = load_poi_db(CITY)
cached_bysgnn_data = pd.read_pickle(CACHED_BYSGNN_DATA_PATH)
poi_db.head(3)

core_poi-part2.csv
core_poi-part5.csv
core_poi-part4.csv
core_poi-part3.csv
core_poi-part1.csv


Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,iso_country_code,phone_number,open_hours,category_tags
0,sg:0e98c66c5cdd48f6a704571cfeeec2a9,,Aviv's Guitar Lessons,,,"Sporting Goods, Hobby, and Musical Instrument ...",Musical Instrument and Supplies Stores,451140.0,34.040672,-118.318678,2526 W 18th St,Los Angeles,CA,90019,US,13234240000.0,,
1,sg:11770486b85e42cf94bceea7973bcaae,,Food,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,34.044072,-118.421335,10571 W Pico Blvd,Los Angeles,CA,90064,US,13104420000.0,"{ ""Mon"": [[""7:00"", ""19:00""]], ""Tue"": [[""7:00"",...","Sandwich Shop,Brunch"
2,sg:1e31b4bf41c04367aff87760bd6a6963,,Century 21 Real Estate,SG_BRAND_7a4fda2c61e310a6f246892a610ce350,Century 21 Real Estate,Offices of Real Estate Agents and Brokers,Offices of Real Estate Agents and Brokers,531210.0,34.050433,-118.361206,5651 W Pico Blvd Ste 203,Los Angeles,CA,90019,US,,,


In [40]:
# keep the rows from poi_db that are in cached_bysgnn_data in a new dataframe
target_poi_db = poi_db.loc[poi_db['safegraph_place_id'].isin(cached_bysgnn_data['safegraph_place_id'])]
target_poi_db.head(3)


Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,iso_country_code,phone_number,open_hours,category_tags
61,sg:14d4def7e85c4ffa857fcf69d731c3e2,,Enterprise Rent-A-Car,SG_BRAND_9f497514e036d97886867c3fe21baf6a,Enterprise Rent-A-Car,Automotive Equipment Rental and Leasing,Passenger Car Rental,532111.0,34.020413,-118.282071,620 Usc Mccarthy Way,Los Angeles,CA,90007,US,12137420000.0,"{ ""Mon"": [[""8:30"", ""17:00""]], ""Tue"": [[""8:30"",...",
138,sg:e808b43b64e840fb8ccd6e25154cdc37,,Marlborough School,,,Elementary and Secondary Schools,Elementary and Secondary Schools,611110.0,34.070075,-118.326624,250 S Rossmore Ave,Los Angeles,CA,90004,US,,,
208,sg:9520c6f2bcef47d39e6189257237665a,,Tutor Hall Cafe,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,34.020083,-118.289823,3710 McClintock Ave,Los Angeles,CA,90089,US,12137400000.0,"{ ""Mon"": [[""7:00"", ""19:00""]], ""Tue"": [[""7:00"",...",Sandwich Shop


In [41]:
from haversine import haversine

def nearest_neighbors(df, lat, lon, n=300):
    # Calculate the distance to every other POI
    df['distance'] = df.apply(lambda row: haversine((lat, lon), (row['latitude'], row['longitude'])), axis=1)
    
    # Sort dataframe by distance and take the top n rows
    neighbors = df.sort_values(by='distance').iloc[1:n+1]
    
    return neighbors

In [50]:
num_nearest_neighbors = 300
result = []

for index, row in tqdm(target_poi_db.iterrows(), total=target_poi_db.shape[0]):
    neighbors = nearest_neighbors(poi_db, row['latitude'], row['longitude'], n=num_nearest_neighbors)
    
    poi_json = {
        "info": {
            "name": row['location_name'],
            "geometry": {
                "coordinates": [row['longitude'], row['latitude']]
            },
            'safegraph_place_id': row['safegraph_place_id'],
        },
        "neighbor_info": {
            "name_list": neighbors['location_name'].tolist(),
            "geometry_list": [{"coordinates": [lon, lat]} for lat, lon in zip(neighbors['latitude'], neighbors['longitude'])]
        }
    }
    
    result.append(poi_json)
result_str = '\n'.join(json.dumps(j) for j in result)

100%|██████████| 394/394 [04:27<00:00,  1.47it/s]


In [51]:
output_path = './data/safegraph_neighborhood_data_{}_{}.json'.format(CITY, num_nearest_neighbors)
# save the result_str to output_path
with open(output_path, 'w') as f:
    f.write(result_str)