This script retrieves all shops in New York State from OpenStreetMap using Overpass QL.
The results are then filtered to include only shops in Manhattan.
https://wiki.openstreetmap.org/wiki/Key:shop

In [20]:
import requests
import pandas as pd
from geopy.geocoders import Nominatim
import folium
from folium.plugins import HeatMap
import geopandas as gpd

In [None]:
# Define the Overpass API query for all shops in New York State.
# The endpoint is addressed over HTTPS so the request is not sent in clear text.
overpass_url = "https://overpass-api.de/api/interpreter"
overpass_query = """
[out:json];
area["name"="New York"]["admin_level"="4"];
(
  node["shop"](area);
  way["shop"](area);
  relation["shop"](area);
);
out center;
"""
# Send the request to the Overpass API.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a confusing JSON decode failure.
response = requests.get(overpass_url, params={'data': overpass_query}, timeout=300)
response.raise_for_status()
data = response.json()

# Parse the results into one flat record per shop.
elements = data.get('elements', [])
shops = []
for element in elements:
    # Nodes carry their own coordinates; because the query ends with
    # `out center`, other element types carry them in a nested 'center' object.
    position = element if element['type'] == 'node' else element.get('center')
    if not position or 'lat' not in position or 'lon' not in position:
        # Without coordinates the shop cannot be geocoded or mapped later.
        continue
    tags = element.get('tags', {})
    shops.append({
        'id': element['id'],
        'lat': position['lat'],
        'lon': position['lon'],
        'name': tags.get('name', 'N/A'),
        'shop': tags.get('shop', 'N/A'),
        'city': tags.get('addr:city', 'N/A'),
    })

# Convert to a DataFrame. Naming the columns explicitly keeps the CSV header
# intact even when the query returned no shops.
df = pd.DataFrame(shops, columns=['id', 'lat', 'lon', 'name', 'shop', 'city'])

# Save to CSV.
df.to_csv('new_york_state_all_shopssss.csv', index=False)


In [None]:
# Imported here so the cell is self-contained; geopy is already a dependency
# of this notebook (Nominatim comes from it).
from geopy.extra.rate_limiter import RateLimiter

df = pd.read_csv('new_york_state_all_shopssss.csv')

# Reverse geocode every shop to get its borough.
geolocator = Nominatim(user_agent="app_name")
# The public Nominatim service permits at most one request per second, so all
# lookups go through a rate limiter. swallow_exceptions=False lets failures
# reach get_borough(), which reports them and falls back to 'N/A'.
rate_limited_reverse = RateLimiter(
    geolocator.reverse,
    min_delay_seconds=1,
    swallow_exceptions=False,
)

iteration = 0
# Print progress once per this many rows instead of once per row, which would
# flood the cell output.
progress_step = 100


def print_progress():
    """Count one processed row and periodically print the percentage done."""
    global iteration
    iteration += 1
    total = len(df)
    if total == 0:
        # Nothing to report, and it avoids dividing by zero.
        return
    if iteration % progress_step == 0 or iteration == total:
        prog = (iteration / total) * 100
        print(f"progress: {prog:.2f}%")


def get_borough(lat, lon):
    """Return the borough-level name for a coordinate, or 'N/A' if unknown.

    The name is taken from the 'suburb' field of the reverse-geocoded address,
    falling back to 'city_district'. Lookup errors are printed and produce
    'N/A' so that one failed request does not abort the whole run.
    """
    print_progress()
    try:
        location = rate_limited_reverse((lat, lon), timeout=10)
    except Exception as e:
        # Deliberately broad: this is a best-effort pass over many rows.
        print(f"Error geocoding {lat}, {lon}: {e}")
        return 'N/A'
    if not location:
        return 'N/A'
    address = location.raw.get('address', {})
    return address.get('suburb', address.get('city_district', 'N/A'))


# Apply the reverse geocoding to fill in the borough column.
df['borough'] = df.apply(lambda row: get_borough(row['lat'], row['lon']), axis=1)

# Save the full, geocoded DataFrame to a CSV file (all boroughs).
df.to_csv('new_york_state_all_shops_with_borough.csv', index=False)

# Keep only Manhattan shops. This has to happen before the counts are
# computed, otherwise the filter has no effect on the heat map.
df = df[df['borough'] == 'Manhattan']

# Group by geolocation and count the shops at each location.
location_counts = df.groupby(['lat', 'lon']).size().reset_index(name='count')

# Create a map centered around New York City.
m = folium.Map(location=[40.7128, -74.0060], zoom_start=10)

HeatMap(location_counts[['lat', 'lon', 'count']].values.tolist()).add_to(m)

# Save the map to an HTML file.
m.save('new_york_shops_heatmap.html')

# Display the map.
m


In [5]:
# Sector labels used to categorise shops, one per line.
sectors = [
    'Others',
    'Food, beverages',
    'Fitness and wellness',
    'Furniture and interior',
    'Electronics',
    'Art, music, hobbies',
    'Financial services',
    'Stationery, gifts, books, newspapers',
    'General store, department store, mall',
    'Do-it-yourself, household, building materials, gardening',
    'Health and beauty',
    'Clothing, shoes, accessories',
    'Outdoors and sport, vehicles',
    'Hospitality',
    'Recreation and parks',
]

In [7]:
# Reload the geocoded shops and keep only the Manhattan rows.
# .copy() makes the filtered frame independent of the full one, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = pd.read_csv('new_york_state_all_shops_with_borough.csv')
df = df[df['borough'] == 'Manhattan'].copy()


In [None]:
# Distinct values of the 'shop' column, in order of first appearance.
shops = pd.unique(df['shop'])


In [11]:
# Shop tag values grouped under the sector they are assigned to.
# Tag spellings (including capitalised ones and ';'-joined values) are kept
# exactly as written, because the lookup below is an exact string match.
_sector_members = {
    'Stationery, gifts, books, newspapers': (
        'gift', 'books', 'stationery', 'newsagent',
    ),
    'Health and beauty': (
        'hairdresser', 'tattoo', 'beauty', 'massage', 'chemist', 'cosmetics',
        'cannabis', 'optician', 'hearing_aids', 'medical_supply', 'nail_salon',
        'piercing', 'herbalist', 'perfumery', 'nutrition_supplements',
        'hairdresser_supply', 'Medical Center', 'wigs', 'beauty_supply',
    ),
    'Electronics': (
        'mobile_phone', 'electronics', 'telecommunication', 'appliance',
        'camera', 'hifi', 'internet_service', 'computer',
        'mobile_phone_repair',
    ),
    'Food, beverages': (
        'beverages', 'bakery', 'greengrocer', 'convenience', 'ice_cream',
        'alcohol', 'deli', 'butcher', 'dairy', 'seafood', 'wine', 'coffee',
        'pasta', 'pastry', 'cheese', 'food', 'tea', 'confectionery',
        'grocery', 'chocolate', 'health_food', 'spices', 'Bakery', 'nuts',
        'coffee;tea', 'noodles', 'farm', 'frozen_food', 'fast_food',
    ),
    'Outdoors and sport, vehicles': (
        'bicycle', 'car', 'athletic', 'outdoor', 'sports', 'e-bike',
        'motorcycle', 'canoe_hire', 'car_repair', 'skate', 'tyres',
        'motorcycle_repair',
    ),
    'General store, department store, mall': (
        'supermarket', 'mall', 'department_store', 'variety_store', 'kiosk',
        'general',
    ),
    'Clothing, shoes, accessories': (
        'clothes', 'jewelry', 'shoes', 'fashion_accessories', 'bag',
        'boutique', 'tailor', 'leather', 'handbags', 'watches', 'baby_goods',
        'bridal', 'sewing', 'fabric', 'sunglasses', 'wool',
    ),
    'Furniture and interior': (
        'furniture', 'home_goods', 'kitchen', 'bed', 'interior_decoration',
        'lighting', 'flooring', 'carpet', 'window_blind',
        'bathroom_furnishing', 'candles', 'household_linen',
    ),
    'Art, music, hobbies': (
        'video_games', 'art', 'toys', 'frame', 'video', 'games', 'craft',
        'musical_instrument', 'music',
    ),
    'Do-it-yourself, household, building materials, gardening': (
        'houseware', 'florist', 'doityourself', 'garden_centre', 'locksmith',
        'hardware', 'trade', 'paint',
    ),
    'Financial services': (
        'pawnbroker', 'money_lender',
    ),
    'Others': (
        'vacant', 'ticket', 'copyshop', 'dry_cleaning', 'yes', 'laundry',
        'pet', 'hookah', 'photo', 'shoe_repair', 'tobacco',
        'dry_cleaners;laundry', 'Video Production', 'charity', 'shipping',
        'wholesale', 'storage_rental', 'second_hand', 'pet_grooming',
        'psychic', 'e-cigarette', 'travel_agency', 'religion',
        'luggage_storage', 'funeral_directors', 'collector', 'antiques',
        'erotic', 'weapons', 'party', 'web_design', 'magic', 'estate_agent',
        'military_surplus', 'printing', 'postal', 'outpost', 'new_age',
        'repair', 'surveillance', 'security', 'print_shop', 'esoteric',
        'pet_groomer',
    ),
}

# Flatten the grouping into the tag -> sector lookup.
shop_to_sector = {
    tag: sector_label
    for sector_label, tags in _sector_members.items()
    for tag in tags
}


In [12]:
# Translate each shop tag into its sector; tags that are missing from the
# lookup fall back to 'Others'.
df['sector'] = df['shop'].map(shop_to_sector).fillna('Others')

In [15]:
# Remove the id, city and borough columns.
df = df.drop(columns=['id', 'city', 'borough'])


In [17]:
# Renumber the rows from 0; drop=True discards the old index labels instead
# of keeping them as a column.
df = df.reset_index(drop=True)

In [18]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,lat,lon,name,shop,sector
0,40.738716,-73.982473,SVA Campus Store,gift,"Stationery, gifts, books, newspapers"
1,40.736926,-73.989601,Barnes & Noble,books,"Stationery, gifts, books, newspapers"
2,40.726611,-73.990173,Random Accessories,gift,"Stationery, gifts, books, newspapers"
3,40.748699,-73.976186,Dazzle Beauty Salon,hairdresser,Health and beauty
4,40.727268,-73.990374,The Hidden Rose,tattoo,Health and beauty


In [19]:
# Show the frame's (rows, columns).
df.shape

(6360, 5)

In [24]:
# Helpers for assigning taxi zones to locations.
# Read a GeoJSON file with geopandas.
def load_geojson_gpd(filepath):
    """Read the file at `filepath` with geopandas and return the result."""
    geo_frame = gpd.read_file(filepath)
    return geo_frame

# Function to find zones using a spatial join in geopandas
def assign_zones(df, gdf):
    """Return the `location_id` of the zone polygon that contains each row.

    Args:
        df: DataFrame with `lon` and `lat` columns. The values are interpreted
            as WGS84 longitude/latitude (EPSG:4326).
        gdf: GeoDataFrame of zone polygons with a `location_id` column.

    Returns:
        A Series named `location_id`, indexed like `df`. Rows that fall inside
        no polygon get a missing value.
    """
    # Build points from the coordinates only. Leaving the other columns of df
    # out of the join means a name clash with gdf cannot rename `location_id`.
    points = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(df.lon, df.lat),
        index=df.index,
    )
    if gdf.crs is not None:
        # Declare what the coordinates are, then reproject to the polygons'
        # CRS. Merely labelling the points with gdf.crs would be wrong whenever
        # the polygons are not stored in lon/lat.
        points = points.set_crs("EPSG:4326").to_crs(gdf.crs)

    zones = gdf[['location_id', gdf.geometry.name]]

    # Spatial join points to polygons.
    joined = gpd.sjoin(points, zones, how="left", predicate='within')

    # A point matched by more than one polygon appears once per match; keep
    # the first so the result has exactly one entry per row of df.
    joined = joined[~joined.index.duplicated(keep='first')]
    return joined['location_id']

In [25]:
# Read the taxi zone polygons from the GeoJSON file.
geo_df = load_geojson_gpd('./NYC Taxi Zones.geojson')

# Look up a zone for every row and store it in the zone_id column.
zone_ids = assign_zones(df, geo_df)
df['zone_id'] = zone_ids

df.head()

Unnamed: 0,lat,lon,name,shop,sector,zone_id
0,40.738716,-73.982473,SVA Campus Store,gift,"Stationery, gifts, books, newspapers",107
1,40.736926,-73.989601,Barnes & Noble,books,"Stationery, gifts, books, newspapers",234
2,40.726611,-73.990173,Random Accessories,gift,"Stationery, gifts, books, newspapers",79
3,40.748699,-73.976186,Dazzle Beauty Salon,hairdresser,Health and beauty,170
4,40.727268,-73.990374,The Hidden Rose,tattoo,Health and beauty,79


In [30]:
# Zone ids to keep, stored as strings and listed in ascending numeric order.
our_zones = {
    '4', '12', '13', '24', '41', '42', '43', '45', '48', '50',
    '68', '74', '75', '79', '87', '88', '90',
    '100', '107', '113', '114', '116', '120', '125', '127', '128',
    '137', '140', '141', '142', '143', '144', '148',
    '151', '152', '153', '158', '161', '162', '163', '164', '166',
    '170', '186', '194',
    '202', '209', '211', '224', '229',
    '230', '231', '232', '233', '234', '236', '237', '238', '239',
    '243', '244', '246', '249', '261', '262', '263',
}

In [28]:
# Distinct zone ids present in the data, converted to strings.
all_zones = {str(zone) for zone in df['zone_id'].unique()}


In [31]:
# Zone ids that occur in the data but are not in our_zones.
zones_to_remove = all_zones.difference(our_zones)
zones_to_remove

{'103', '33'}

In [32]:
# Remove rows whose zone id is in the zones_to_remove set.
# zones_to_remove holds strings, so the column is compared as strings too;
# otherwise integer or missing zone ids would never match and would slip
# through the filter.
df = df[~df['zone_id'].astype(str).isin(zones_to_remove)]

In [34]:
# Show the frame's (rows, columns).
df.shape

(6358, 6)

In [35]:
# Export the frame to CSV; index=False leaves the row index out of the file.
df.to_csv('osm_manhattan_shops_zones_sectors.csv', index=False)