# START OF COMMON CODE

In [1]:
# Polygon source: a GeoJSON of Seoul neighborhoods, referenced by URL
# (southkorea/seoul-maps repository on GitHub).
POLYGON_PATH = 'https://raw.githubusercontent.com/southkorea/seoul-maps/master/juso/2015/json/seoul_neighborhoods_geo.json'
# Point source: a CSV located relative to this notebook's directory.
POINTS_PATH  = '../../data/SK_pollution.csv'

In [2]:
from collections import defaultdict
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, Polygon
from typing import Callable, List, Union, Any, Dict

In [3]:
def load_polygons(polygons_geojson: str) -> gpd.GeoDataFrame:
    """Read polygon features from a GeoJSON source into a GeoDataFrame.

    Args:
        polygons_geojson: Location handed to ``gpd.read_file``.

    Returns:
        The loaded, non-empty GeoDataFrame.

    Raises:
        IOError: If reading fails for any reason or the source yields no
            rows. The underlying exception is attached as ``__cause__``.
    """
    try:
        polygons = gpd.read_file(polygons_geojson)
        if polygons.empty:
            raise ValueError("GeoJSON file contains no data.")
        return polygons
    except Exception as e:
        # Chain explicitly so the traceback reports the root failure as the
        # direct cause of the IOError callers see.
        raise IOError(f"Failed to load polygons from GeoJSON: {e}") from e

In [4]:
def polygons_find_covering_geocodes(
    polygons: gpd.GeoDataFrame,
    polygon_to_geocodes: Callable[[Polygon, int], List[str]],
    geocode_resolution: int
):
    """Attach a ``covering_geocodes`` column to ``polygons`` in place.

    Args:
        polygons: Frame with a ``geometry`` column; it is modified in place.
        polygon_to_geocodes: Called as ``f(geometry, geocode_resolution)``
            for each geometry; its return value is stored per row.
        geocode_resolution: Resolution forwarded to ``polygon_to_geocodes``.

    Returns:
        None. The result is written to ``polygons['covering_geocodes']``.
    """
    def _cover(geometry):
        # Bind the resolution so the helper fits Series.apply's one-argument form.
        return polygon_to_geocodes(geometry, geocode_resolution)

    covering = polygons['geometry'].apply(_cover)
    polygons['covering_geocodes'] = covering

In [5]:
def load_polygons_with_covering_geocodes(
    polygons_geojson: str,
    polygon_to_geocodes: Callable[[Polygon, int], List[str]],
    geocode_resolution: int
) -> gpd.GeoDataFrame:
    """Load polygons and annotate each with its covering geocodes.

    Convenience wrapper: calls ``load_polygons`` and then
    ``polygons_find_covering_geocodes`` on the loaded frame.

    Args:
        polygons_geojson: Source passed to ``load_polygons``.
        polygon_to_geocodes: Converter forwarded to
            ``polygons_find_covering_geocodes``.
        geocode_resolution: Resolution forwarded to the converter.

    Returns:
        The loaded frame, including the ``covering_geocodes`` column.
    """
    loaded = load_polygons(polygons_geojson)
    polygons_find_covering_geocodes(
        polygons=loaded,
        polygon_to_geocodes=polygon_to_geocodes,
        geocode_resolution=geocode_resolution,
    )
    return loaded

In [6]:
def create_geocode_lookup(polygons: gpd.GeoDataFrame) -> Dict[str, List[int]]:
    """Invert the per-polygon geocode lists into a geocode -> polygons map.

    Args:
        polygons: Frame with a ``covering_geocodes`` column whose cells are
            iterables of geocodes.

    Returns:
        A ``defaultdict(list)`` mapping each geocode to the index labels of
        the rows that list it, in row order.
    """
    lookup = defaultdict(list)
    covering = polygons['covering_geocodes']
    for polygon_idx, cell_codes in zip(covering.index, covering):
        for code in cell_codes:
            lookup[code].append(polygon_idx)
    return lookup

In [7]:
def load_points(
    points_csv: str,
    *,
    crs: Any = "EPSG:4326",
    latitude_col_name: str = 'latitude',
    longitude_col_name: str = 'longitude'
) -> gpd.GeoDataFrame:
    """Read a CSV of coordinates into a GeoDataFrame of points.

    The coordinate columns are renamed to ``latitude`` / ``longitude`` so the
    rest of the pipeline can rely on fixed column names.

    Args:
        points_csv: Source handed to ``pd.read_csv``.
        crs: Coordinate reference system assigned to the result.
        latitude_col_name: Name of the latitude column in the CSV.
        longitude_col_name: Name of the longitude column in the CSV.

    Returns:
        A GeoDataFrame holding the CSV columns plus a point ``geometry``
        built as (longitude, latitude).

    Raises:
        IOError: If reading fails, the CSV has no rows, or a coordinate
            column is missing. The underlying exception is attached as
            ``__cause__``.
    """
    try:
        df = pd.read_csv(points_csv)
        if df.empty:
            raise ValueError("CSV file contains no data.")
        if not {latitude_col_name, longitude_col_name}.issubset(df.columns):
            raise ValueError(f"CSV must contain '{latitude_col_name}' and '{longitude_col_name}' columns.")
        df = df.rename(columns={latitude_col_name: 'latitude', longitude_col_name: 'longitude'})
        # Vectorised construction instead of one shapely Point per Python loop step.
        return gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
            crs=crs
        )
    except Exception as e:
        # Chain explicitly so the root failure is reported as the direct cause.
        raise IOError(f"Failed to load points from CSV: {e}") from e

In [8]:
def points_find_geocode(
    points: gpd.GeoDataFrame,
    point_to_geocode: Callable[[float, float, int], str],
    geocode_resolution: int
) -> None:
    """Attach a ``geocode`` column to ``points`` in place.

    Args:
        points: Frame with ``latitude`` and ``longitude`` columns; it is
            modified in place.
        point_to_geocode: Called as ``f(latitude, longitude, resolution)``
            once per row.
        geocode_resolution: Resolution forwarded to ``point_to_geocode``.

    Returns:
        None. The result is written to ``points['geocode']``.
    """
    # Walk the two coordinate columns directly: a row-wise DataFrame.apply
    # builds a Series per row and does not produce a Series on an empty frame.
    points['geocode'] = [
        point_to_geocode(latitude, longitude, geocode_resolution)
        for latitude, longitude in zip(points['latitude'], points['longitude'])
    ]

In [9]:
def load_points_with_geocode(
    points_csv: str,
    point_to_geocode: Callable[[float, float, int], str],
    geocode_resolution: int,
    *,
    crs: Any = "EPSG:4326",
    latitude_col_name='latitude',
    longitude_col_name='longitude'
) -> gpd.GeoDataFrame:
    """Load points from CSV and annotate each with its geocode.

    Convenience wrapper: calls ``load_points`` and then
    ``points_find_geocode`` on the loaded frame.

    Args:
        points_csv: Source passed to ``load_points``.
        point_to_geocode: Converter forwarded to ``points_find_geocode``.
        geocode_resolution: Resolution forwarded to the converter.
        crs: Forwarded to ``load_points``.
        latitude_col_name: Forwarded to ``load_points``.
        longitude_col_name: Forwarded to ``load_points``.

    Returns:
        The loaded frame, including the ``geocode`` column.
    """
    loaded = load_points(
        points_csv,
        crs=crs,
        latitude_col_name=latitude_col_name,
        longitude_col_name=longitude_col_name,
    )
    points_find_geocode(
        points=loaded,
        point_to_geocode=point_to_geocode,
        geocode_resolution=geocode_resolution,
    )
    return loaded

In [10]:
def spatial_join_with_geocodes(
    polygons: gpd.GeoDataFrame,
    geocode_lookup: Dict[str, List[int]],
    points: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """Assign each point to a polygon using a geocode filter-and-refine join.

    Filter: each point's ``geocode`` selects candidate polygon indices from
    ``geocode_lookup``. Refine: candidates are tested in order with
    ``point.within(polygon)`` and the first hit wins.

    Args:
        polygons: Frame with a ``geometry`` column, addressed by the index
            labels stored in ``geocode_lookup``.
        geocode_lookup: Mapping geocode -> list of polygon index labels.
            It is only read, never modified.
        points: Frame with ``geometry`` and ``geocode`` columns. It is
            copied, not modified.

    Returns:
        A copy of ``points`` with two added columns: ``potential_matches``
        (the candidate list, ``[]`` when the geocode is unknown) and
        ``polygon_index`` (the matching polygon's index label, or -1).
    """
    # Work on a copy so the caller's frame is left untouched.
    points = points.copy()

    # Filter step: find potential matches.
    # Use .get rather than Series.map(dict): mapping through a defaultdict goes
    # via __missing__ and would insert an empty entry into the caller's lookup
    # for every unknown geocode.
    points['potential_matches'] = points['geocode'].map(
        lambda code: geocode_lookup.get(code, [])
    )

    polygon_geometries = polygons['geometry']

    # Refinement step: verify actual containment.
    def verify_containment(point_geom: Any, candidate_polygons: List[int]) -> int:
        for index in candidate_polygons:
            if point_geom.within(polygon_geometries.loc[index]):
                return index
        return -1

    # Apply refinement over the two needed columns only; this avoids building a
    # row Series per point and also works when the frame has no rows.
    points['polygon_index'] = [
        verify_containment(point_geom, candidates)
        for point_geom, candidates in zip(points['geometry'], points['potential_matches'])
    ]

    return points

# START OF H3-SPECIFIC CODE

## geojson2h3 Python module

In [11]:
import h3

In [12]:
# GeoJSON "type" strings used by the conversion helpers in this cell,
# both for checking inputs and for building output objects.
FEATURE = 'Feature'
FEATURE_COLLECTION = 'FeatureCollection'
POLYGON = 'Polygon'
MULTI_POLYGON = 'MultiPolygon'


def flatten(arrays):
    """
    Merge an iterable of collections into a single de-duplicated list.

    The inputs are left untouched, and an empty `arrays` yields an empty
    list. Elements must be hashable; the order of the result is unspecified.
    """
    unique = set()
    for arr in arrays:
        unique.update(arr)
    return list(unique)


def centroid(polygon):
    """
    Return the mean (lng, lat) of the vertices in the polygon's first loop.

    Only `polygon[0]` is read; any further loops are ignored. Every listed
    vertex counts once, so a repeated closing vertex is counted twice.
    """
    outer_loop = polygon[0]
    total_lng = total_lat = 0
    n_vertices = 0
    for vertex in outer_loop:
        total_lng, total_lat = total_lng + vertex[0], total_lat + vertex[1]
        n_vertices += 1
    return total_lng / n_vertices, total_lat / n_vertices


def feature_collection_to_h3_set(feature_collection, resolution, ensure_output=False):
    """
    Convert a GeoJSON feature collection to a set of hexagons.

    `ensure_output` is forwarded to `feature_to_h3_set` for every feature.
    Raises ValueError when the collection has no features.
    """
    features = feature_collection.get('features')
    if not features:
        raise ValueError('No features found')
    return flatten(
        [feature_to_h3_set(feature, resolution, ensure_output) for feature in features]
    )


def feature_to_h3_set(feature, resolution, ensure_output=False):
    """
    Convert a GeoJSON feature to a set of hexagons.

    Accepts a `Feature` with `Polygon` or `MultiPolygon` geometry, or a
    `FeatureCollection` (delegated to `feature_collection_to_h3_set`).
    With `ensure_output`, a polygon whose polyfill is empty contributes the
    cell at its vertex-mean centroid instead of nothing.
    Raises ValueError for any other feature or geometry type.
    """
    # Named feature_type rather than `type` to avoid shadowing the builtin.
    feature_type = feature.get('type')
    geometry = feature.get('geometry')
    geometry_type = None if geometry is None else geometry.get('type')

    if feature_type == FEATURE_COLLECTION:
        # Pass ensure_output through so nested features honour it too.
        return feature_collection_to_h3_set(feature, resolution, ensure_output)

    if feature_type != FEATURE:
        raise ValueError(f"Unhandled type: {feature_type}")

    if geometry_type not in (POLYGON, MULTI_POLYGON):
        raise ValueError(f"Unhandled geometry type: {geometry_type}")

    # Normalise to a list of polygons, each being a list of loops.
    if geometry_type == POLYGON:
        polygons = [geometry['coordinates']]
    else:  # MULTI_POLYGON
        polygons = geometry['coordinates']

    def process(polygon):
        result = h3.polyfill({'type': 'Polygon', 'coordinates': polygon}, resolution, True)
        if result or not ensure_output:
            return result

        # If we got no results, index the centroid
        lng, lat = centroid(polygon)
        return [h3.geo_to_h3(lat, lng, resolution)]

    # Polyfill each polygon and flatten the results
    return flatten([process(polygon) for polygon in polygons])


def h3_to_feature(h3_index, properties=None):
    """
    Build a GeoJSON `Polygon` feature from one H3 index.

    The index becomes the feature `id`; `properties` defaults to `{}`.
    """
    # Second argument is presumably h3's GeoJSON-ordering flag; confirm against h3 docs.
    boundary = h3.h3_to_geo_boundary(h3_index, True)
    geometry = {
        "type": POLYGON,
        "coordinates": [boundary]  # a single loop
    }
    return {
        "type": FEATURE,
        "id": h3_index,
        "properties": properties or {},
        "geometry": geometry
    }


def h3_set_to_feature(hexagons, properties=None):
    """
    Build one GeoJSON `Feature` from the outline(s) of a set of hexagons.

    More than one outline gives a `MultiPolygon`; exactly one gives a
    `Polygon`; none gives a `Polygon` with empty coordinates.
    """
    outlines = h3.h3_set_to_multi_polygon(hexagons, True)
    if len(outlines) > 1:
        geometry_type, coordinates = MULTI_POLYGON, outlines
    elif outlines:
        geometry_type, coordinates = POLYGON, outlines[0]
    else:
        geometry_type, coordinates = POLYGON, []
    geometry = {
        "type": geometry_type,
        "coordinates": coordinates
    }
    return {
        "type": FEATURE,
        "properties": properties or {},
        "geometry": geometry
    }


def h3_set_to_multi_polygon_feature(hexagons, properties=None):
    """
    Convert a set of hexagons to a GeoJSON `MultiPolygon` feature with the
    outlines of each individual hexagon.

    `properties` defaults to `{}`.
    """
    # Use the snake_case h3 function, as h3_to_feature does; the camelCase
    # `h3ToGeoBoundary` is the JavaScript spelling and is not what the rest
    # of this module calls.
    coordinates = [
        [h3.h3_to_geo_boundary(h3_index, True)]  # Wrap in a list for a single-loop polygon
        for h3_index in hexagons
    ]
    return {
        "type": FEATURE,
        "properties": properties if properties else {},
        "geometry": {
            "type": MULTI_POLYGON,
            "coordinates": coordinates
        }
    }


def h3_set_to_feature_collection(hexagons, get_properties=None):
    """
    Build a GeoJSON `FeatureCollection` holding one `Polygon` feature per
    hexagon.

    When `get_properties` is given it is called with each H3 index and its
    result becomes that feature's properties; otherwise `{}` is used.
    """
    def _properties_for(h3_index):
        return get_properties(h3_index) if get_properties else {}

    return {
        "type": FEATURE_COLLECTION,
        "features": [
            h3_to_feature(h3_index, _properties_for(h3_index))
            for h3_index in hexagons
        ]
    }

## Remainder of geocovering code

In [13]:
def polygon_to_h3(polygon: Polygon, resolution: int) -> List[str]:
    """Return the H3 cells produced for a shapely polygon.

    The polygon is wrapped as a GeoJSON feature and passed to
    ``feature_to_h3_set`` with ``ensure_output=True``.

    NOTE(review): the cells come from ``h3.polyfill``; if that only returns
    cells whose centre lies inside the polygon, the result is not a full
    covering. Confirm against the h3 documentation.
    """
    as_feature = dict(
        type="Feature",
        properties={},
        geometry=polygon.__geo_interface__,
    )
    return feature_to_h3_set(as_feature, resolution, True)

In [14]:
def point_to_h3(latitude: float, longitude: float, resolution: int) -> str:
    """Return the H3 index for a latitude/longitude pair at ``resolution``.

    Thin adapter over ``h3.geo_to_h3`` matching the ``point_to_geocode``
    callable shape used by ``points_find_geocode``.
    """
    cell = h3.geo_to_h3(latitude, longitude, resolution)
    return cell

## Testing

In [15]:
# H3 resolution shared by the polygon covering and the point indexing below.
h3_resolution = 9

In [16]:
# Load data
# Polygons get a 'covering_geocodes' column of H3 cells; points get a
# 'geocode' column with their H3 cell, both at h3_resolution.
polygons = load_polygons_with_covering_geocodes(
    POLYGON_PATH,
    polygon_to_h3,
    h3_resolution
)
# The coordinate columns are passed by their capitalised names here;
# load_points renames them to 'latitude' / 'longitude'.
points = load_points_with_geocode(
    POINTS_PATH,
    point_to_h3,
    h3_resolution,
    latitude_col_name='Latitude',
    longitude_col_name='Longitude'
)

In [17]:
# Preview the first polygons, including the new 'covering_geocodes' column.
polygons.head()

Unnamed: 0,EMD_CD,EMD_KOR_NM,EMD_ENG_NM,ESRI_PK,SHAPE_AREA,SHAPE_LEN,geometry,covering_geocodes
0,11590108,대방동,Daebang-dong,18,0.000157,0.063998,"POLYGON ((126.93240 37.51558, 126.93243 37.515...","[8930e1d9063ffff, 8930e1d9037ffff, 8930e1d9067..."
1,11110118,내수동,Naesu-dong,59,7e-06,0.011823,"POLYGON ((126.97318 37.57399, 126.97328 37.573...",[8930e1d8c8bffff]
2,11590105,흑석동,Heukseok-dong,15,0.000171,0.05742,"POLYGON ((126.96639 37.51014, 126.96687 37.509...","[8930e1d98dbffff, 8930e1d9e3bffff, 8930e1d98cf..."
3,11320107,창동,Chang-dong,1,0.000436,0.100323,"POLYGON ((127.05147 37.66043, 127.05150 37.659...","[8930e1c202fffff, 8930e1c2077ffff, 8930e1c2287..."
4,11230107,청량리동,Cheongnyangni-dong,8,0.000122,0.064774,"POLYGON ((127.04305 37.59696, 127.04361 37.596...","[8930e1c336fffff, 8930e1c159bffff, 8930e1c15d7..."


In [18]:
# Count distinct values in the EMD_ENG_NM column.
polygons['EMD_ENG_NM'].nunique()

465

In [19]:
# Preview the first points, including the new 'geocode' column.
points.head()

Unnamed: 0,Measurement date,Station code,Address,latitude,longitude,SO2,NO2,O3,CO,PM10,PM2.5,geometry,geocode
0,2017-01-01 00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0,POINT (127.00501 37.57202),8930e1d8d67ffff
1,2017-01-01 01:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.058,0.002,1.2,71.0,59.0,POINT (127.00501 37.57202),8930e1d8d67ffff
2,2017-01-01 02:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,59.0,POINT (127.00501 37.57202),8930e1d8d67ffff
3,2017-01-01 03:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,58.0,POINT (127.00501 37.57202),8930e1d8d67ffff
4,2017-01-01 04:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.003,0.051,0.002,1.2,69.0,61.0,POINT (127.00501 37.57202),8930e1d8d67ffff


In [20]:
# Count the distinct H3 cells that the points fall into.
points['geocode'].nunique()

25

In [21]:
# Create lookup dictionary: H3 cell -> index labels of the polygons whose
# 'covering_geocodes' contain that cell.
geocode_lookup = create_geocode_lookup(polygons)

In [22]:
# Using the spatial join function with H3.
# The result is a copy of `points` with 'potential_matches' and
# 'polygon_index' columns added; -1 means no containing polygon was found.
h3_results = spatial_join_with_geocodes(
    polygons,
    geocode_lookup,
    points
)

In [23]:
# Display the joined frame to inspect the added columns.
h3_results

Unnamed: 0,Measurement date,Station code,Address,latitude,longitude,SO2,NO2,O3,CO,PM10,PM2.5,geometry,geocode,potential_matches,polygon_index
0,2017-01-01 00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0,POINT (127.00501 37.57202),8930e1d8d67ffff,[98],98
1,2017-01-01 01:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.058,0.002,1.2,71.0,59.0,POINT (127.00501 37.57202),8930e1d8d67ffff,[98],98
2,2017-01-01 02:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,59.0,POINT (127.00501 37.57202),8930e1d8d67ffff,[98],98
3,2017-01-01 03:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,58.0,POINT (127.00501 37.57202),8930e1d8d67ffff,[98],98
4,2017-01-01 04:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.003,0.051,0.002,1.2,69.0,61.0,POINT (127.00501 37.57202),8930e1d8d67ffff,[98],98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647506,2019-12-31 19:00,125,"59, Gucheonmyeon-ro 42-gil, Gangdong-gu, Seoul...",37.544962,127.136792,0.003,0.028,0.013,0.5,23.0,17.0,POINT (127.13679 37.54496),8930e1cf583ffff,[225],225
647507,2019-12-31 20:00,125,"59, Gucheonmyeon-ro 42-gil, Gangdong-gu, Seoul...",37.544962,127.136792,0.003,0.025,0.015,0.4,25.0,19.0,POINT (127.13679 37.54496),8930e1cf583ffff,[225],225
647508,2019-12-31 21:00,125,"59, Gucheonmyeon-ro 42-gil, Gangdong-gu, Seoul...",37.544962,127.136792,0.003,0.023,0.015,0.4,24.0,17.0,POINT (127.13679 37.54496),8930e1cf583ffff,[225],225
647509,2019-12-31 22:00,125,"59, Gucheonmyeon-ro 42-gil, Gangdong-gu, Seoul...",37.544962,127.136792,0.003,0.040,0.004,0.5,25.0,18.0,POINT (127.13679 37.54496),8930e1cf583ffff,[225],225


# Comparison with traditional spatial join

In [24]:
def spatial_join_traditional(
    polygons: gpd.GeoDataFrame,
    points: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """Brute-force baseline: test every point against every polygon.

    Polygons are tried in row order and the first one for which
    ``point.within(polygon)`` holds wins.

    Args:
        polygons: Frame with a ``geometry`` column.
        points: Frame with a ``geometry`` column. It is copied, not modified.

    Returns:
        A copy of ``points`` with a ``polygon_index`` column holding the
        matching polygon's index label, or -1 when none contains the point.
    """
    # Work on a copy so the caller's frame is left untouched.
    points = points.copy()

    # Extract (index label, geometry) pairs once; iterrows() inside the
    # per-point loop would rebuild a full row Series for every polygon on
    # every point.
    indexed_polygons = list(polygons['geometry'].items())

    def find_containment(point_geom: Any) -> int:
        for index, polygon in indexed_polygons:
            if point_geom.within(polygon):
                return index
        return -1

    points['polygon_index'] = [
        find_containment(point_geom) for point_geom in points['geometry']
    ]

    return points

In [25]:
# Reload both inputs without any geocode columns for the baseline join.
points_baseline = load_points(POINTS_PATH, latitude_col_name='Latitude', longitude_col_name='Longitude')
polygons_baseline = load_polygons(POLYGON_PATH)

In [26]:
# Alternative baseline via geopandas' built-in spatial join, kept for reference (disabled):
# baseline_results = gpd.sjoin(points_baseline, polygons_baseline, how="inner", predicate='within')
# baseline_results['polygon_index'] = baseline_results['index_right']

# Active baseline: brute-force point-in-polygon test over all polygons.
baseline_results = spatial_join_traditional(polygons_baseline, points_baseline)

In [27]:
def evaluate_results(test_results: gpd.GeoDataFrame, baseline_results: gpd.GeoDataFrame, method_name):
    """Print how often a method's polygon assignment agrees with the baseline.

    Both frames are reduced to their ``polygon_index`` column and inner-joined
    on the index; the match rate is the share of joined rows whose two
    indices are equal.

    Args:
        test_results: Frame with a ``polygon_index`` column to evaluate.
        baseline_results: Frame with the reference ``polygon_index`` column.
        method_name: Label used in the printed message.

    Returns:
        None. The rate is printed as a percentage.
    """
    # Keep only the compared column from each side.
    observed = test_results[['polygon_index']].copy()
    expected = baseline_results[['polygon_index']].copy()

    # Pair rows by point index; the baseline column gets the '_baseline' suffix.
    paired = observed.join(expected, how='inner', rsuffix='_baseline')

    agreement = paired['polygon_index'] == paired['polygon_index_baseline']
    match_rate = agreement.mean()

    print(f"Match Rate for {method_name}: {match_rate * 100}%")

In [28]:
# Compare the H3-based assignment against the brute-force baseline.
evaluate_results(h3_results, baseline_results, f"H3 with resolution {h3_resolution}")

Match Rate for H3 with resolution 9: 96.00068570263672%


For reference, the match rate for H3 with resolution 8 was 87.99943166988669% for this specific dataset.