## Geocoding Parking Violations with Mapbox
This cell samples 60,000 NYC parking violations, builds clean address strings, and geocodes them to latitude/longitude using the Mapbox API.
To avoid geocoding the same address multiple times, it maintains a local geocode_cache.json that stores results.
Coordinates are filtered to NYC bounds and exported.

In [12]:
import pandas as pd
from mapbox import Geocoder
from tqdm import tqdm
import time
import os
import json

# ----------------------------------------
# CONFIGURATION
# ----------------------------------------
MAPBOX_TOKEN = "pk.eyJ1IjoiYXlhZmFoaW0iLCJhIjoiY21haDI0NzNtMDZnYjJrc2did2ozb2diMSJ9.ucfbzSq_1BEtyk7jqGJb1g"
CSV_PATH = "data/nyc_parking_violations_sample.csv"
CACHE_PATH = "geocode_cache.json"
BACKUP_PATH = "geocode_cache_backup.json"
SAVE_EVERY = 1000
SLEEP_TIME = 0.05
SAMPLE_SIZE = 60000 

# ----------------------------------------
# LOAD & CLEAN DATA
# ----------------------------------------
cols = ['house_number', 'street_name', 'violation_county']
df_raw = pd.read_csv(CSV_PATH, low_memory=False)
clean_df = df_raw.dropna(subset=cols).copy()

# Clean up address parts
clean_df['house_number'] = clean_df['house_number'].fillna('').astype(str).str.strip()
clean_df['street_name'] = clean_df['street_name'].fillna('').astype(str).str.strip()
clean_df['violation_county'] = clean_df['violation_county'].fillna('').astype(str).str.strip()

# Build full address
clean_df['full_address'] = (
    clean_df['house_number'] + " " +
    clean_df['street_name'] + ", " +
    clean_df['violation_county'] + ", NYC"
)

clean_df['full_address'] = clean_df['full_address'].str.replace('^\\s+', '', regex=True)
clean_df = clean_df[clean_df['full_address'].str.len() > 10]


# Shuffle to randomize address selection
clean_df = clean_df.sample(frac=1, random_state=42)

# Select 60k fines with unique addresses (1 per address)
seen_addresses = set()
selected_rows = []

for _, row in clean_df.iterrows():
    addr = row['full_address']
    if addr not in seen_addresses:
        selected_rows.append(row)
        seen_addresses.add(addr)
    if len(seen_addresses) >= SAMPLE_SIZE:
        break

df = pd.DataFrame(selected_rows)
print(f"📦 Selected {len(df)} fines with {len(df['full_address'].unique())} unique addresses")

# ----------------------------------------
# LOAD OR CREATE CACHE
# ----------------------------------------
if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH, "r") as f:
        cache = json.load(f)
else:
    cache = {}

# ----------------------------------------
# SETUP GEOCODER
# ----------------------------------------
geocoder = Geocoder(access_token=MAPBOX_TOKEN)

# Filter uncached addresses
unique_addresses = df['full_address'].unique()
uncached = [addr for addr in unique_addresses if addr not in cache]
print(f"📍 Geocoding {len(uncached):,} new addresses...")

# ----------------------------------------
# GEOCODE LOOP
# ----------------------------------------
for i, addr in enumerate(tqdm(uncached)):
    try:
        response = geocoder.forward(addr, limit=1)
        features = response.geojson().get('features', [])
        if features:
            coords = features[0]['geometry']['coordinates']
            cache[addr] = [coords[1], coords[0]]  # lat, lon
        else:
            cache[addr] = [None, None]
    except Exception:
        cache[addr] = [None, None]
    time.sleep(SLEEP_TIME)

    # Save checkpoint
    if (i + 1) % SAVE_EVERY == 0:
        with open(CACHE_PATH, "w") as f:
            json.dump(cache, f)
        with open(BACKUP_PATH, "w") as f:
            json.dump(cache, f)
        print(f"💾 Checkpoint saved at {i+1} geocoded")

# Final cache save
with open(CACHE_PATH, "w") as f:
    json.dump(cache, f)
with open(BACKUP_PATH, "w") as f:
    json.dump(cache, f)
print("✅ Final geocode cache saved.")

# ----------------------------------------
# APPLY GEOCOORDINATES TO SAMPLE
# ----------------------------------------
df[['lat', 'lon']] = df['full_address'].apply(lambda x: pd.Series(cache.get(x, [None, None])))
df = df.dropna(subset=['lat', 'lon'])
df = df[(df['lat'].between(40.49, 40.92)) & (df['lon'].between(-74.26, -73.68))]

# ----------------------------------------
# MERGE COORDS INTO FULL ORIGINAL DATA
# ----------------------------------------
df_raw['full_address'] = (
    df_raw['house_number'].astype(str) + " " +
    df_raw['street_name'] + ", " +
    df_raw['violation_county'] + ", NYC"
)

df_raw['full_address'] = (
    df_raw['house_number'].astype(str) + " " +
    df_raw['street_name'] + ", " +
    df_raw['violation_county'] + ", NYC"
)

df_raw[['lat', 'lon']] = df_raw['full_address'].apply(lambda x: pd.Series(cache.get(x, [None, None])))

# Filter only valid NYC coordinates
final_df = df_raw.dropna(subset=['lat', 'lon'])
final_df = final_df[(final_df['lat'].between(40.49, 40.92)) & (final_df['lon'].between(-74.26, -73.68))]

final_df.to_csv("full_geocoded_parking_fines.csv", index=False)
print(f"✅ Final dataset saved with {len(final_df):,} geocoded rows.")

📦 Selected 59784 fines with 59784 unique addresses
📍 Geocoding 0 new addresses...


0it [00:00, ?it/s]


✅ Final geocode cache saved.
✅ Final dataset saved with 128,798 geocoded rows.


# Prepare 2020 NTA Population Data

We take U.S. Census population data by tract and use a crosswalk file to map each tract to a 2020 NTA (Neighborhood Tabulation Area). Then we sum the populations within each NTA and save the total population per NTA for later use.



In [13]:
import pandas as pd

# Load Census tract population
pop_df = pd.read_csv("data/nyc_census_tract_pop_2020.csv")
pop_df.rename(columns={'P1_001N': 'Population', 'tract': 'CT2020'}, inplace=True)
pop_df['Population'] = pd.to_numeric(pop_df['Population'], errors='coerce')
pop_df['CT2020'] = pop_df['CT2020'].astype(str).str.zfill(6)  # Ensure leading zeros

# Load tract-to-NTA crosswalk
crosswalk = pd.read_csv("data/2020_Census_Tracts_to_2020_NTAs_and_CDTAs_Equivalency_20250511.csv")
crosswalk['CT2020'] = crosswalk['CT2020'].astype(str).str.zfill(6)

# Merge Census + crosswalk
merged = pop_df.merge(crosswalk, on='CT2020', how='left')

# Group by NTA
nta_pop = (
    merged.groupby(['NTACode', 'NTAName'])
    .agg({'Population': 'sum'})
    .reset_index()
)


# Save for later use
nta_pop.to_csv("data/nta_population_2020.csv", index=False)

# Preview
print(nta_pop.sort_values(by="Population", ascending=False).head())


    NTACode                    NTAName  Population
31   BK1001                  Bay Ridge      331906
9    BK0301  Bedford-Stuyvesant (West)      310028
10   BK0302  Bedford-Stuyvesant (East)      302127
39   BK1202               Borough Park      255691
170  QN0301            Jackson Heights      232256


# Match Parking Fines to Neighborhoods and Calculate Fines Per 1,000 Residents

We spatially join geocoded parking fines to the 2020 NTA boundaries, then combine that with the NTA population data to calculate the number of parking fines per 1,000 residents in each neighborhood. The results are saved for use in maps and visualizations.

In [14]:
import geopandas as gpd
import pandas as pd

# Load geocoded fine data
df = pd.read_csv("full_geocoded_parking_fines.csv", low_memory=False)

# Convert to GeoDataFrame
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['lon'], df['lat']),
    crs="EPSG:4326"
)

# Load updated 2020 NTA GeoJSON
nta = gpd.read_file("data/nynta2020.geojson").to_crs("EPSG:4326")

# Spatial join: assign each fine to a neighborhood
joined = gpd.sjoin(gdf, nta, how="left", predicate="intersects")

# ✅ Preview matched data
print(joined[['house_number', 'street_name', 'boroname', 'ntaname']].dropna().head())

# Load updated population data
pop_df = pd.read_csv("data/nta_population_2020.csv")
pop_df.rename(columns={'NTAName': 'ntaname', 'NTACode': 'nta2020'}, inplace=True)
pop_df['ntaname'] = pop_df['ntaname'].str.strip()
joined['ntaname'] = joined['ntaname'].str.strip()

# Merge fines with population
merged = joined.merge(pop_df, on='ntaname', how='left')

# Group by NTA and calculate fines per 1,000 residents
nta_stats = (
    merged.groupby(['ntaname', 'ntaabbrev', 'boroname', 'Population'])
    .size()
    .reset_index(name='num_fines')
)
nta_stats['fines_per_1000'] = (nta_stats['num_fines'] / nta_stats['Population']) * 1000

# Cleaned version for visualization
nta_cleaned = (
    nta_stats.groupby('ntaname')
    .agg({
        'num_fines': 'sum',
        'Population': 'mean',
        'fines_per_1000': 'mean'
    })
    .reset_index()
    .sort_values(by='fines_per_1000', ascending=False)
)

# Save cleaned stats
nta_cleaned.to_csv("data/nta_fine_stats_cleaned.csv", index=False)

# Save GeoJSON with stats for choropleth mapping
geo_merged = nta.merge(nta_cleaned, on='ntaname', how='left')
geo_merged.to_file("data/nta_with_fine_stats.geojson", driver="GeoJSON")


  house_number     street_name   boroname                       ntaname
0           51       E 44TH ST  Manhattan          Midtown-Times Square
1        39-41         60TH ST     Queens                      Woodside
2       126 05          36 AVE     Queens                 College Point
3           41      SEAVER WAY     Queens  Flushing Meadows-Corona Park
4        46-50  BURLING STREET     Queens                 East Flushing
