# Access4All — OpenStreetMap Accessibility Enrichment (v2)

This notebook collects wheelchair-related points of interest and step infrastructure from OpenStreetMap for the target cities, parses and standardizes the raw OSM snapshots, and spatially joins them with Airbnb listings.

The result is an enriched Airbnb dataset with OSM-based accessibility signals within a 200 m radius, persisted as the v2 OSM features table for downstream scoring.


### Prepare DBFS directory for storing OSM snapshot files


In [0]:
# Folder that holds the raw per-city OSM snapshot files for this (v2) run.
base_dbfs = "dbfs:/FileStore/access4all/osm_snapshot_v2"
dbutils.fs.mkdirs(base_dbfs)

# List the parent folder so the snapshot directory can be checked visually.
project_root_listing = dbutils.fs.ls("dbfs:/FileStore/access4all")
display(project_root_listing)


### Fetch wheelchair-related POIs and step infrastructure from OpenStreetMap via Overpass API and store raw city snapshots


In [0]:
import time, json, requests

# Overpass API endpoint that every query in this notebook is POSTed to.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

# Cities to snapshot. Each name is substituted into the query templates below
# and (with spaces replaced by underscores) used in the snapshot file names.
CITIES = [
    "Paris", "Rome", "Dubai", "San Francisco", "São Paulo",
    "Los Angeles", "Rio de Janeiro", "New York", "Las Vegas"
]

# Overpass QL template: nodes/ways/relations tagged wheelchair=yes|limited|no
# that also carry one of the listed feature keys, inside the administrative
# area whose "name" tag equals {CITY}. `out center tags` returns a center
# point for ways/relations together with the element tags.
# {CITY} is filled in with str.format().
QUERY_WHEELCHAIR = r"""
[out:json][timeout:180];
area["name"="{CITY}"]["boundary"="administrative"]->.a;

(
  nwr["wheelchair"~"^(yes|limited|no)$"]["amenity"](area.a);
  nwr["wheelchair"~"^(yes|limited|no)$"]["shop"](area.a);
  nwr["wheelchair"~"^(yes|limited|no)$"]["tourism"](area.a);
  nwr["wheelchair"~"^(yes|limited|no)$"]["leisure"](area.a);
  nwr["wheelchair"~"^(yes|limited|no)$"]["public_transport"](area.a);
  nwr["wheelchair"~"^(yes|limited|no)$"]["building"](area.a);
  nwr["wheelchair"~"^(yes|limited|no)$"]["office"](area.a);
);

out center tags;
"""

# Overpass QL template: ways and relations tagged highway=steps inside the
# same kind of administrative area. {CITY} is filled in with str.format().
QUERY_STEPS = r"""
[out:json][timeout:180];
area["name"="{CITY}"]["boundary"="administrative"]->.a;

(
  way["highway"="steps"](area.a);
  relation["highway"="steps"](area.a);
);

out center tags;
"""

def overpass_request_strict(query: str, retries: int = 6):
    """POST an Overpass QL query and return the decoded JSON document.

    Every failure (throttling status, other non-2xx status, non-JSON body,
    JSON without an "elements" key, or an exception from the HTTP call) is
    recorded and retried with a linearly growing pause of ``3 * attempt``
    seconds.

    Args:
        query: Complete Overpass QL query text.
        retries: Maximum number of attempts.

    Returns:
        The parsed response, a dict that contains an "elements" key.

    Raises:
        RuntimeError: If no attempt produced a valid response. The message
            includes the last recorded failure, which is also chained as the
            cause.
    """
    headers = {
        "Accept": "application/json",
        "User-Agent": "Access4All/1.0 (Databricks student project; contact: course staff)"
    }
    # Common throttle/overload cases
    throttle_statuses = (429, 502, 503, 504)

    last_err = None
    for attempt in range(1, retries + 1):
        try:
            r = requests.post(
                OVERPASS_URL,
                data={"data": query},
                headers=headers,
                timeout=300
            )

            # Raise instead of silently continuing so that a run exhausted by
            # throttling reports the status rather than "Last error: None".
            if r.status_code in throttle_statuses:
                raise RuntimeError(f"HTTP {r.status_code} (throttled or overloaded)")

            # If it's not a 2xx, show some context
            if not (200 <= r.status_code < 300):
                snippet = (r.text or "")[:300].replace("\n", " ")
                raise RuntimeError(f"HTTP {r.status_code}. Body starts: {snippet}")

            # Try JSON parse; if it fails, report a snippet of the body and retry
            try:
                data = r.json()
            except ValueError as exc:
                snippet = (r.text or "")[:300].replace("\n", " ")
                raise RuntimeError(f"Non-JSON response. Body starts: {snippet}") from exc

            # Basic validation
            if not isinstance(data, dict) or "elements" not in data:
                raise RuntimeError(f"Unexpected JSON structure keys={list(data.keys()) if isinstance(data, dict) else type(data)}")

            return data

        except Exception as e:
            # Retry boundary: remember the failure and back off before the
            # next attempt. No pause after the final attempt.
            last_err = e
            if attempt < retries:
                time.sleep(3 * attempt)

    raise RuntimeError(f"Overpass failed after {retries} attempts. Last error: {last_err}") from last_err

# Download every (city, kind) snapshot and write it to DBFS.
# Cities are the outer dimension, so both kinds of one city are fetched
# back-to-back before moving on to the next city.
snapshot_jobs = [
    (city_name, snapshot_kind, query_template)
    for city_name in CITIES
    for snapshot_kind, query_template in (
        ("wheelchair_poi", QUERY_WHEELCHAIR),
        ("steps", QUERY_STEPS),
    )
]

for city, kind, template in snapshot_jobs:
    print(f"Fetching {city} | {kind} ...")
    data = overpass_request_strict(template.format(CITY=city))

    # File names use underscores instead of spaces in the city name.
    safe_city = city.replace(" ", "_")
    out_path = f"/dbfs/FileStore/access4all/osm_snapshot_v2/{safe_city}_{kind}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f)

    element_count = len(data['elements'])
    print(f"Saved {city:15s} | {kind:14s} | elements={element_count}")

    # Short pause between consecutive Overpass requests.
    time.sleep(2)


### Re-fetch OSM data for cities with Overpass area-query issues using a bounding box fallback (Dubai)


In [0]:
import time, json

# Dubai is re-fetched by bounding box instead of by administrative-area name.
DUBAI_BBOX = (24.80, 54.85, 25.45, 55.65)  # (south, west, north, east)
S, W, N, E = DUBAI_BBOX

# Overpass bounding-box filter appended to every statement of both queries.
_dubai_bbox_filter = f"({S},{W},{N},{E})"

# One `nwr` statement per feature key, each restricted to wheelchair-tagged
# elements inside the bounding box.
_dubai_poi_statements = "\n".join(
    f'  nwr["wheelchair"~"^(yes|limited|no)$"]["{feature_key}"]{_dubai_bbox_filter};'
    for feature_key in (
        "amenity", "shop", "tourism", "leisure",
        "public_transport", "building", "office",
    )
)

QUERY_DUBAI_WHEELCHAIR_BBOX = (
    "\n[out:json][timeout:180];\n"
    "(\n"
    f"{_dubai_poi_statements}\n"
    ");\n"
    "out center tags;\n"
)

QUERY_DUBAI_STEPS_BBOX = (
    "\n[out:json][timeout:180];\n"
    "(\n"
    f'  way["highway"="steps"]{_dubai_bbox_filter};\n'
    f'  relation["highway"="steps"]{_dubai_bbox_filter};\n'
    ");\n"
    "out center tags;\n"
)


def _refetch_dubai_snapshot(kind, query):
    """Run one bbox query for Dubai, overwrite its snapshot file, return the payload."""
    print(f"Re-fetching Dubai (bbox) | {kind} ...")
    payload = overpass_request_strict(query)
    with open(f"/dbfs/FileStore/access4all/osm_snapshot_v2/Dubai_{kind}.json", "w", encoding="utf-8") as f:
        json.dump(payload, f)
    print(f"Saved Dubai | {kind} | elements=", len(payload["elements"]))
    return payload


data_w = _refetch_dubai_snapshot("wheelchair_poi", QUERY_DUBAI_WHEELCHAIR_BBOX)
time.sleep(2)  # short pause between the two Overpass requests
data_s = _refetch_dubai_snapshot("steps", QUERY_DUBAI_STEPS_BBOX)


### Re-fetch OSM data for Rome using a bounding box due to administrative area query limitations


In [0]:
import time, json

# Rome is re-fetched by bounding box instead of by administrative-area name.
ROME_BBOX = (41.70, 12.20, 42.10, 12.80)  # (south, west, north, east)
S, W, N, E = ROME_BBOX

# Overpass bounding-box filter appended to every statement of both queries.
_rome_bbox_filter = f"({S},{W},{N},{E})"

# One `nwr` statement per feature key, each restricted to wheelchair-tagged
# elements inside the bounding box.
_rome_poi_statements = "\n".join(
    f'  nwr["wheelchair"~"^(yes|limited|no)$"]["{feature_key}"]{_rome_bbox_filter};'
    for feature_key in (
        "amenity", "shop", "tourism", "leisure",
        "public_transport", "building", "office",
    )
)

QUERY_ROME_WHEELCHAIR_BBOX = (
    "\n[out:json][timeout:180];\n"
    "(\n"
    f"{_rome_poi_statements}\n"
    ");\n"
    "out center tags;\n"
)

QUERY_ROME_STEPS_BBOX = (
    "\n[out:json][timeout:180];\n"
    "(\n"
    f'  way["highway"="steps"]{_rome_bbox_filter};\n'
    f'  relation["highway"="steps"]{_rome_bbox_filter};\n'
    ");\n"
    "out center tags;\n"
)


def _refetch_rome_snapshot(kind, query):
    """Run one bbox query for Rome, overwrite its snapshot file, return the payload."""
    print(f"Re-fetching Rome (bbox) | {kind} ...")
    payload = overpass_request_strict(query)
    with open(f"/dbfs/FileStore/access4all/osm_snapshot_v2/Rome_{kind}.json", "w", encoding="utf-8") as f:
        json.dump(payload, f)
    print(f"Saved Rome | {kind} | elements=", len(payload["elements"]))
    return payload


data_w = _refetch_rome_snapshot("wheelchair_poi", QUERY_ROME_WHEELCHAIR_BBOX)
time.sleep(2)  # short pause between the two Overpass requests
data_s = _refetch_rome_snapshot("steps", QUERY_ROME_STEPS_BBOX)


### Parse saved OSM JSON snapshots into a clean, city-scoped Delta table (explicit schema + allowed signals only)


In [0]:
from pyspark.sql import functions as F, types as T

# 0) Start from a clean slate so no schema from an earlier run carries over
spark.sql("DROP TABLE IF EXISTS access4all_osm_snapshot_9cities")

# 1) Explicit schema, assembled bottom-up. `tags` is forced to be a MAP so
#    that arbitrary tag keys never become struct fields (prevents 'fixme'
#    struct-field collisions).
osm3s_type = (
    T.StructType()
    .add("timestamp_osm_base", T.StringType(), True)
    .add("copyright", T.StringType(), True)
)

center_type = (
    T.StructType()
    .add("lat", T.DoubleType(), True)
    .add("lon", T.DoubleType(), True)
)

element_type = (
    T.StructType()
    .add("type", T.StringType(), True)
    .add("id", T.LongType(), True)
    .add("lat", T.DoubleType(), True)
    .add("lon", T.DoubleType(), True)
    .add("center", center_type, True)
    .add("tags", T.MapType(T.StringType(), T.StringType()), True)
)

schema = (
    T.StructType()
    .add("version", T.FloatType(), True)
    .add("generator", T.StringType(), True)
    .add("osm3s", osm3s_type, True)
    .add("elements", T.ArrayType(element_type), True)
)

base = "dbfs:/FileStore/access4all/osm_snapshot_v2"

# 2) Load every snapshot file using the explicit schema
raw = spark.read.json(f"{base}/*.json", schema=schema)

# 3) One row per OSM element, remembering which file it came from
exploded = raw.select(
    F.input_file_name().alias("source_file"),
    F.explode("elements").alias("e"),
)

# Prefer the element's own coordinates; fall back to the `center` point.
lat_expr = F.coalesce(F.col("e.lat"), F.col("e.center.lat"))
lon_expr = F.coalesce(F.col("e.lon"), F.col("e.center.lon"))

elements = (
    exploded
    .select(
        "source_file",
        F.col("e.type").alias("osm_type"),
        F.col("e.id").alias("osm_id"),
        F.col("e.tags").alias("tags"),
        lat_expr.alias("lat"),
        lon_expr.alias("lon"),
    )
    # Elements without any usable coordinate cannot be joined spatially.
    .where(F.col("lat").isNotNull() & F.col("lon").isNotNull())
)

# 4) Derive city/kind from the file name and keep only the needed tag keys.
#    File names look like "<City_With_Underscores>_<kind>.json".
#    NOTE(review): underscores are turned back into spaces, so a city whose
#    real name contained "_" would come out altered.
file_base_expr = F.regexp_extract("source_file", r"([^/]+)\.json$", 1)
kind_expr = F.regexp_extract(file_base_expr, r"(wheelchair_poi|steps)$", 1)
city_expr = F.regexp_replace(
    F.regexp_replace(file_base_expr, r"_(wheelchair_poi|steps)$", ""),
    "_",
    " ",
)

osm = elements.select(
    "source_file",
    "osm_type",
    "osm_id",
    "lat",
    "lon",
    kind_expr.alias("kind"),
    city_expr.alias("city"),
    # Only these three tag values are carried forward; the tags map itself
    # is not selected, which keeps the schema tight.
    F.element_at("tags", F.lit("wheelchair")).alias("wheelchair"),
    F.element_at("tags", F.lit("highway")).alias("highway"),
    F.element_at("tags", F.lit("ramp")).alias("ramp"),
)

# 5) Keep ONLY allowed signals (locked)
is_tagged_poi = (
    (F.col("kind") == "wheelchair_poi")
    & F.col("wheelchair").isin("yes", "limited", "no")
)
is_steps_element = (F.col("kind") == "steps") & (F.col("highway") == "steps")

osm_allowed = osm.filter(is_tagged_poi | is_steps_element)

# 6) Persist the filtered snapshot as a Delta table (replacing any previous one)
osm_allowed.write.saveAsTable(
    "access4all_osm_snapshot_9cities",
    format="delta",
    mode="overwrite",
)

# 7) Sanity check: element counts per city and kind, read back from the table
snapshot_counts = (
    spark.table("access4all_osm_snapshot_9cities")
    .groupBy("city", "kind")
    .count()
    .orderBy("city", "kind")
)
display(snapshot_counts)


### Install and initialize H3 for spatial indexing


<small>
H3 is a hexagonal spatial indexing system that converts latitude/longitude coordinates into fixed-resolution hexagon cells, enabling efficient spatial joins and spatial aggregation.
</small>


In [0]:
%pip install h3
# Restart the Python process so the newly installed h3 package can be imported by the following cells.
dbutils.library.restartPython()


### Spatially join Airbnb listings with nearby OSM accessibility signals using H3 (candidate neighbors) and a 200m Haversine distance filter


In [0]:

import h3
from pyspark.sql import functions as F, types as T

# --- H3 UDFs ---
@F.udf(T.StringType())
def h3_res10(lat, lon):
    """Return the H3 resolution-10 cell id for a coordinate.

    Returns None when either coordinate is null, so a row with a missing
    position yields a null cell instead of raising inside the UDF (which
    would fail the whole Spark job).
    """
    if lat is None or lon is None:
        return None
    return h3.latlng_to_cell(float(lat), float(lon), 10)

@F.udf(T.ArrayType(T.StringType()))
def h3_kring2(cell):
    """Return the H3 cells within grid distance 2 of `cell` as a list.

    Returns None for a null cell so that null cells produced by `h3_res10`
    pass through without raising.
    """
    if cell is None:
        return None
    # k=2 is typically enough to cover 200m at H3 resolution 10
    # NOTE(review): coverage looks borderline -- cell size varies by location,
    # so POIs close to 200 m away may be missed; confirm against the H3
    # cell-size tables or widen the ring to k=3.
    return list(h3.grid_disk(cell, 2))

# --- Haversine distance (meters) ---
def haversine_m(lat1, lon1, lat2, lon2):
    """Build a Column expression for the haversine distance in meters.

    All four arguments are Column expressions holding degrees; the result
    uses a spherical Earth of radius 6371000 m.
    """
    earth_radius_m = 6371000

    half_dlat = (F.radians(lat2) - F.radians(lat1)) / 2
    half_dlon = (F.radians(lon2) - F.radians(lon1)) / 2

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    hav = (
        F.pow(F.sin(half_dlat), 2) +
        F.cos(F.radians(lat1)) * F.cos(F.radians(lat2)) *
        F.pow(F.sin(half_dlon), 2)
    )
    return 2 * earth_radius_m * F.asin(F.sqrt(hav))

# --- Load Airbnb + OSM and add H3 res=10 ---
# Listings: keep the id, city and position; `long` is exposed as `lon` so both
# sides of the join use the same coordinate column names.
airbnb_positions = spark.table("access4all_airbnb_v1_9cities").select(
    "property_id",
    "city",
    "lat",
    F.col("long").alias("lon"),
)
airbnb = airbnb_positions.withColumn("h3_10", h3_res10(F.col("lat"), F.col("lon")))

# OSM elements: the processed snapshot plus the same res-10 cell index.
osm_signal_cols = [
    "city", "kind", "osm_type", "osm_id",
    "lat", "lon", "wheelchair", "highway", "ramp",
]
osm = (
    spark.table("access4all_osm_snapshot_9cities")
    .select(*osm_signal_cols)
    .withColumn("h3_10", h3_res10(F.col("lat"), F.col("lon")))
)

# --- Spatial candidate join (H3 neighbors) + exact 200m distance filter ---
# One row per (listing, candidate cell): the listing's own cell neighborhood
# is expanded so the join below can be a plain equality on cell id.
airbnb_k = airbnb.withColumn(
    "h3_candidate",
    F.explode(h3_kring2(F.col("h3_10"))),
)

same_cell = F.col("a.h3_candidate") == F.col("o.h3_10")
listing_to_osm_m = haversine_m(
    F.col("a.lat"), F.col("a.lon"),
    F.col("o.lat"), F.col("o.lon"),
)

candidates = airbnb_k.alias("a").join(osm.alias("o"), on=same_cell, how="left")

joined = (
    candidates
    # Drop candidate cells that matched no OSM element.
    .where(F.col("o.osm_id").isNotNull())
    .withColumn("dist_m", listing_to_osm_m)
    # The H3 neighborhood is only a coarse pre-filter; enforce the real radius.
    .where(F.col("dist_m") <= F.lit(200.0))
)




### Aggregate nearby wheelchair-tagged OSM POIs per listing (counts and nearest distance within 200m)


In [0]:
# Only wheelchair-tagged POIs contribute to these aggregates.
wheel = joined.filter(F.col("o.kind") == "wheelchair_poi")

wheelchair_values = ("yes", "limited", "no")

# Per tag value: how many matching POIs lie within the radius ...
wheel_count_aggs = [
    F.sum(F.when(F.col("o.wheelchair") == tag_value, 1).otherwise(0))
    .alias(f"osm_poi_wheelchair_{tag_value}_count_200m")
    for tag_value in wheelchair_values
]

# ... and the distance to the closest one (null when there is none).
wheel_nearest_aggs = [
    F.min(F.when(F.col("o.wheelchair") == tag_value, F.col("dist_m")))
    .alias(f"osm_poi_wheelchair_{tag_value}_nearest_distance_m")
    for tag_value in wheelchair_values
]

wheel_agg = (
    wheel
    .groupBy(F.col("a.property_id").alias("property_id"))
    .agg(*wheel_count_aggs, *wheel_nearest_aggs)
)


### Aggregate nearby step infrastructure per listing (counts, ramps, and nearest distance within 200m)


In [0]:
# Only step infrastructure contributes to these aggregates.
steps = joined.filter(F.col("o.kind") == "steps")

# Number of step elements within the radius.
steps_total = F.count("*").alias("osm_steps_count_200m")

# Number of those step elements tagged ramp=yes.
steps_with_ramp = (
    F.sum(F.when(F.col("o.ramp") == "yes", 1).otherwise(0))
    .alias("osm_steps_ramp_yes_count_200m")
)

# Distance to the closest step element.
steps_nearest = F.min(F.col("dist_m")).alias("osm_steps_nearest_distance_m")

steps_by_listing = steps.groupBy(F.col("a.property_id").alias("property_id"))
steps_agg = steps_by_listing.agg(steps_total, steps_with_ramp, steps_nearest)


### Combine OSM-derived aggregates with Airbnb listings and normalize missing or out-of-range values


In [0]:
# Start from every listing so listings without any nearby OSM signal are kept.
listing_keys = airbnb.select("property_id", "city")
features = (
    listing_keys
    .join(wheel_agg, on="property_id", how="left")
    .join(steps_agg, on="property_id", how="left")
)

# counts: null -> 0
count_cols = [
    f"osm_poi_wheelchair_{tag_value}_count_200m"
    for tag_value in ("yes", "limited", "no")
] + [
    "osm_steps_count_200m",
    "osm_steps_ramp_yes_count_200m",
]
features = features.na.fill(0, subset=count_cols)

# distances: null -> 200, and cap at 200
dist_cols = [
    f"osm_poi_wheelchair_{tag_value}_nearest_distance_m"
    for tag_value in ("yes", "limited", "no")
] + [
    "osm_steps_nearest_distance_m",
]
max_distance_m = F.lit(200.0)
for c in dist_cols:
    # Missing distance becomes the cap first, then everything is clamped to it.
    capped = F.least(F.coalesce(F.col(c), max_distance_m), max_distance_m)
    features = features.withColumn(c, capped)


### Derive a categorical accessibility context per listing based on nearby OSM signals within 200m


In [0]:
# Shorthand for the three counts the category rules are based on.
Y = F.col("osm_poi_wheelchair_yes_count_200m")
N = F.col("osm_poi_wheelchair_no_count_200m")
S = F.col("osm_steps_count_200m")

# Rules are evaluated in order: "supportive" wins over "barrier_dense",
# and anything matching neither rule is "mixed".
is_supportive = (Y >= 3) & (N == 0) & (S <= 1)
is_barrier_dense = (N >= 2) | (S >= 5)

access_context = (
    F.when(is_supportive, F.lit("supportive"))
    .when(is_barrier_dense, F.lit("barrier_dense"))
    .otherwise(F.lit("mixed"))
)

features = features.withColumn("osm_access_context_200m", access_context)


### Persist the enriched Airbnb–OSM features table for downstream scoring


In [0]:
# Remove any previous version of the table, then store the enriched features as Delta.
spark.sql("DROP TABLE IF EXISTS access4all_airbnb_v2_osm_9cities")
features.write.saveAsTable(
    "access4all_airbnb_v2_osm_9cities",
    format="delta",
    mode="overwrite",
)


