# START OF COMMON CODE

In [17]:
# Input locations for the demo. Both URLs embed a fixed revision hash in their
# path, so re-running the notebook should read the same files each time.
POLYGON_PATH = (
    'https://raw.githubusercontent.com/IsamAljawarneh/datasets/'
    '1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/nyc_polygon.geojson'
)
POINTS_PATH = (
    'https://raw.githubusercontent.com/IsamAljawarneh/datasets/'
    '1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/NYC_Pilot2_PM_Part1.csv'
)

In [1]:
import math
from collections import defaultdict
from typing import Callable, List, Union, Any, Dict

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, Polygon

In [2]:
def load_polygons(polygons_geojson: str) -> gpd.GeoDataFrame:
    """Read polygon features from a GeoJSON source.

    Args:
        polygons_geojson: Path or URL handed to ``gpd.read_file``.

    Returns:
        The GeoDataFrame that was read; it is guaranteed to be non-empty.

    Raises:
        IOError: If reading fails for any reason, or if the file holds no
            rows. The underlying exception is attached as ``__cause__``.
    """
    try:
        polygons = gpd.read_file(polygons_geojson)
        if polygons.empty:
            raise ValueError("GeoJSON file contains no data.")
        return polygons
    except Exception as e:
        # Chain explicitly so the traceback reports the root cause as the
        # direct cause instead of "another exception occurred during handling".
        raise IOError(f"Failed to load polygons from GeoJSON: {e}") from e

In [3]:
def polygons_find_covering_geocodes(
    polygons: gpd.GeoDataFrame,
    polygon_to_geocodes: Callable[[Polygon, int], List[str]],
    geocode_resolution: int
):
    """Add a ``covering_geocodes`` column to ``polygons`` in place.

    Each value is whatever ``polygon_to_geocodes`` returns for that row's
    geometry at ``geocode_resolution``. The function returns nothing; the
    frame passed in is modified.
    """
    def cover(geometry):
        return polygon_to_geocodes(geometry, geocode_resolution)

    geometries = polygons['geometry']
    polygons['covering_geocodes'] = geometries.apply(cover)

In [4]:
def load_polygons_with_covering_geocodes(
    polygons_geojson: str,
    polygon_to_geocodes: Callable[[Polygon, int], List[str]],
    geocode_resolution: int
) -> gpd.GeoDataFrame:
    """Load polygons and annotate each one with its covering geocodes.

    Combines ``load_polygons`` and ``polygons_find_covering_geocodes``: the
    returned frame carries an extra ``covering_geocodes`` column.
    """
    loaded = load_polygons(polygons_geojson)
    polygons_find_covering_geocodes(
        polygons=loaded,
        polygon_to_geocodes=polygon_to_geocodes,
        geocode_resolution=geocode_resolution,
    )
    return loaded

In [5]:
def create_geocode_lookup(polygons: gpd.GeoDataFrame) -> Dict[str, List[int]]:
    """Invert the ``covering_geocodes`` column into a geocode -> polygons map.

    Args:
        polygons: Frame with a ``covering_geocodes`` column holding an
            iterable of geocodes per row.

    Returns:
        A ``defaultdict(list)`` mapping each geocode to the index labels of
        the rows that list it, in row order. Because it is a defaultdict,
        indexing it with an unknown geocode yields (and stores) an empty list.
    """
    lookup = defaultdict(list)
    code_label_pairs = (
        (code, label)
        for label, codes in polygons['covering_geocodes'].items()
        for code in codes
    )
    for code, label in code_label_pairs:
        lookup[code].append(label)

    return lookup

In [6]:
def load_points(points_csv: str, crs: Any = "EPSG:4326") -> gpd.GeoDataFrame:
    """Read a CSV of points and turn it into a GeoDataFrame.

    Args:
        points_csv: Path or URL handed to ``pd.read_csv``. The file must
            provide ``latitude`` and ``longitude`` columns.
        crs: Coordinate reference system passed to the GeoDataFrame.

    Returns:
        The CSV columns plus a ``geometry`` column of
        ``Point(longitude, latitude)`` objects.

    Raises:
        IOError: If reading fails, the file is empty, or a required column is
            missing. The underlying exception is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(points_csv)
        if df.empty:
            raise ValueError("CSV file contains no data.")
        if not {'latitude', 'longitude'}.issubset(df.columns):
            raise ValueError("CSV must contain 'latitude' and 'longitude' columns.")
        gdf = gpd.GeoDataFrame(
            df,
            # shapely points are (x, y), i.e. (longitude, latitude)
            geometry=[Point(xy) for xy in zip(df.longitude, df.latitude)],
            crs=crs
        )
        return gdf
    except Exception as e:
        # Chain explicitly so the traceback reports the root cause as the
        # direct cause instead of "another exception occurred during handling".
        raise IOError(f"Failed to load points from CSV: {e}") from e

In [7]:
def points_find_geocode(
    points: gpd.GeoDataFrame,
    point_to_geocode: Callable[[float, float, int], str],
    geocode_resolution: int
):
    """Add a ``geocode`` column to ``points`` in place.

    Args:
        points: Frame with ``latitude`` and ``longitude`` columns; it is
            modified, nothing is returned.
        point_to_geocode: Called as ``point_to_geocode(latitude, longitude,
            geocode_resolution)`` once per row.
        geocode_resolution: Resolution forwarded to ``point_to_geocode``.
    """
    # Walk the two coordinate columns directly. DataFrame.apply(axis=1) would
    # assemble a full row Series (all columns) for every point just to read
    # two values, and it misbehaves on an empty frame.
    points['geocode'] = [
        point_to_geocode(latitude, longitude, geocode_resolution)
        for latitude, longitude in zip(points['latitude'], points['longitude'])
    ]

In [9]:
def load_points_with_geocode(
    points_csv: str,
    point_to_geocode: Callable[[float, float, int], str],
    geocode_resolution: int,
    crs: Any = "EPSG:4326"
) -> gpd.GeoDataFrame:
    """Load points from CSV and tag each one with its geocode.

    Combines ``load_points`` and ``points_find_geocode``: the returned frame
    carries an extra ``geocode`` column.
    """
    loaded = load_points(points_csv, crs)
    points_find_geocode(
        points=loaded,
        point_to_geocode=point_to_geocode,
        geocode_resolution=geocode_resolution,
    )
    return loaded

In [10]:
def spatial_join_with_geocodes(
    polygons: gpd.GeoDataFrame,
    geocode_lookup: Dict[str, List[int]],
    points: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """Assign each point to a containing polygon using a filter-and-refine join.

    Args:
        polygons: Frame whose ``geometry`` column holds the polygons.
        geocode_lookup: Maps a geocode to the index labels of the polygons
            that may contain points carrying that geocode.
        points: Frame with ``geocode`` and ``geometry`` columns. It is not
            modified; a copy is returned.

    Returns:
        A copy of ``points`` with two extra columns: ``potential_matches``
        (the lookup result for the point's geocode) and ``polygon_index``
        (label of the first candidate polygon the point lies within, or -1).
    """
    # Work on a copy so the caller's frame is left untouched
    points = points.copy()

    # Filter step: find potential matches.
    # NOTE: per the pandas Series.map documentation, a dict subclass defining
    # __missing__ (e.g. defaultdict) supplies its default for unknown keys;
    # a plain dict yields NaN instead. Both cases are handled below.
    points['potential_matches'] = points['geocode'].map(geocode_lookup)

    # Fetch the geometry column once instead of a two-axis .loc per candidate
    polygon_geometries = polygons['geometry']

    # Refinement step: verify actual containment
    def verify_containment(point_geom: Any, candidate_polygons: Union[List[int], float]) -> int:
        if isinstance(candidate_polygons, float):
            # NaN produced by Series.map for a geocode missing from a plain dict
            return -1
        if not candidate_polygons:
            return -1
        for index in candidate_polygons:
            if point_geom.within(polygon_geometries.loc[index]):
                return index
        return -1

    # Apply refinement over just the two columns it needs; a row-wise
    # DataFrame.apply would build a full Series for every point.
    points['polygon_index'] = [
        verify_containment(point_geom, candidates)
        for point_geom, candidates in zip(points['geometry'], points['potential_matches'])
    ]

    return points

# START OF GEOHASH-SPECIFIC CODE

## geohash Python module

In [12]:
import queue
import sys

In [13]:
"""
Copyright (C) 2009 Hiroaki Kawai <kawai@iij.ad.jp>
Version 0.8.5
"""
_base32 = '0123456789bcdefghjkmnpqrstuvwxyz'
_base32_map = {_base32[i]: i for i in range(len(_base32))}

def _float_hex_to_int(f):
	if f<-1.0 or f>=1.0:
		return None
	
	if f==0.0:
		return 1,1
	
	h = f.hex()
	x = h.find("0x1.")
	assert(x>=0)
	p = h.find("p")
	assert(p>0)
	
	half_len = len(h[x+4:p])*4-int(h[p+1:])
	if x==0:
		r = (1<<half_len) + ((1<<(len(h[x+4:p])*4)) + int(h[x+4:p],16))
	else:
		r = (1<<half_len) - ((1<<(len(h[x+4:p])*4)) + int(h[x+4:p],16))
	
	return r, half_len+1

def _int_to_float_hex(i, l):
	if l==0:
		return -1.0
	
	half = 1<<(l-1)
	s = int((l+3)/4)
	if i >= half:
		i = i-half
		return float.fromhex(("0x0.%0"+str(s)+"xp1") % (i<<(s*4-l),))
	else:
		i = half-i
		return float.fromhex(("-0x0.%0"+str(s)+"xp1") % (i<<(s*4-l),))

def _encode_i2c(lat,lon,lat_length,lon_length):
	"""Interleave integer lat/lon cell indices into a geohash string.

	``lat_length`` and ``lon_length`` are the bit counts of ``lat`` and
	``lon``; the result has ``(lat_length + lon_length) // 5`` characters.
	Characters are produced from the last one backwards, alternately taking
	3 bits from one coordinate and 2 bits from the other.
	"""
	char_count = (lat_length+lon_length)//5
	# "wide" supplies 3 bits to the current character, "narrow" supplies 2
	if lat_length < lon_length:
		wide, narrow = lon, lat
	else:
		wide, narrow = lat, lon
	
	# spread[n] places the bits of n on even bit positions
	spread = (0,1,4,5,16,17,20,21)
	digits = []
	for _ in range(char_count):
		digits.append(_base32[(spread[wide&7]+(spread[narrow&3]<<1))&0x1F])
		# the roles swap for the next (more significant) character
		wide, narrow = narrow>>2, wide>>3
	
	digits.reverse()
	return ''.join(digits)

def geohash_encode(latitude, longitude, precision=12):
	"""Encode a coordinate as a geohash string.

	Args:
		latitude: Degrees in [-90.0, 90.0).
		longitude: Degrees; any finite value is accepted and wrapped into
			[-180.0, 180.0).
		precision: Number of characters in the returned geohash.

	Returns:
		The geohash string, ``precision`` characters long.

	Raises:
		ValueError: If either coordinate is NaN or infinite, or if the
			latitude is outside [-90.0, 90.0).
	"""
	# Reject NaN/inf up front: they used to trip an internal assert (NaN) or
	# make the longitude normalisation loop forever (infinity).
	if not (math.isfinite(latitude) and math.isfinite(longitude)):
		raise ValueError("latitude and longitude must be finite numbers.")
	if latitude >= 90.0 or latitude < -90.0:
		raise ValueError("invalid latitude.")
	if not -180.0 <= longitude < 180.0:
		# fmod is exact and terminates for any magnitude, unlike repeatedly
		# adding/subtracting 360.0 (which stalls once 360 is below one ulp).
		longitude = math.fmod(longitude, 360.0)
		if longitude >= 180.0:
			longitude -= 360.0
		elif longitude < -180.0:
			longitude += 360.0
	
	# Encode one extra character and cut it off at the end.
	xprecision=precision+1
	lat_length = lon_length = int(xprecision*5/2)
	if xprecision%2==1:
		lon_length+=1
	
	# float.hex()/float.fromhex() exist on every Python 3, so only the exact
	# hex-based path is kept.
	a = _float_hex_to_int(latitude/90.0)
	o = _float_hex_to_int(longitude/180.0)
	# Rescale each fixed-point value to exactly lat_length / lon_length bits.
	if a[1] > lat_length:
		ai = a[0]>>(a[1]-lat_length)
	else:
		ai = a[0]<<(lat_length-a[1])
	
	if o[1] > lon_length:
		oi = o[0]>>(o[1]-lon_length)
	else:
		oi = o[0]<<(lon_length-o[1])
	
	return _encode_i2c(ai, oi, lat_length, lon_length)[:precision]

def _decode_c2i(hashcode):
	"""De-interleave a geohash string into integer cell indices.

	Returns ``(lat, lon, lat_length, lon_length)`` where the lengths are the
	number of bits accumulated for each coordinate. Characters at even
	positions contribute 3 longitude bits and 2 latitude bits; characters at
	odd positions contribute 3 latitude bits and 2 longitude bits.
	Raises ``KeyError`` for a character outside the geohash alphabet.
	"""
	lat = lon = 0
	lat_length = lon_length = 0
	for position, char in enumerate(hashcode):
		value = _base32_map[char]
		# bits 4, 2, 0 of the digit form the 3-bit part; bits 3, 1 the 2-bit part
		three_bits = ((value>>2)&4) | ((value>>1)&2) | (value&1)
		two_bits = ((value>>2)&2) | ((value>>1)&1)
		if position%2==0:
			lon = (lon<<3) | three_bits
			lat = (lat<<2) | two_bits
			lon_length += 3
			lat_length += 2
		else:
			lat = (lat<<3) | three_bits
			lon = (lon<<2) | two_bits
			lat_length += 3
			lon_length += 2
	
	return (lat,lon,lat_length,lon_length)

def geohash_decode(hashcode, delta=False):
	'''
	Decode a hashcode into the center coordinate of its cell.

	Returns (latitude, longitude). When delta is true, returns
	(latitude, longitude, latitude_delta, longitude_delta) where the deltas
	are the distances between the center and the outer border of the cell.
	'''
	(lat,lon,lat_length,lon_length) = _decode_c2i(hashcode)
	
	# float.fromhex exists on every Python 3, so the integer-only fallback
	# that used to follow this path could never run and has been removed.
	latitude_delta  = 90.0/(1<<lat_length)
	longitude_delta = 180.0/(1<<lon_length)
	# _int_to_float_hex yields the south/west border in [-1, 1); scale it to
	# degrees and move half a cell inwards to reach the center.
	latitude = _int_to_float_hex(lat, lat_length) * 90.0 + latitude_delta
	longitude = _int_to_float_hex(lon, lon_length) * 180.0 + longitude_delta
	if delta:
		return latitude,longitude,latitude_delta,longitude_delta
	return latitude,longitude

def geohash_decode_exactly(hashcode):
	"""Decode a hashcode to (latitude, longitude, latitude_delta, longitude_delta)."""
	return geohash_decode(hashcode, delta=True)

## hashcode operations below

def geohash_bbox(hashcode):
	'''
	Decode a hashcode and get north, south, east and west border.

	Returns a dict with keys "s", "w", "n" and "e" (degrees).
	'''
	(lat,lon,lat_length,lon_length) = _decode_c2i(hashcode)
	# float.fromhex exists on every Python 3, so the integer-only fallback
	# that used to follow this path could never run and has been removed.
	latitude_delta  = 180.0/(1<<lat_length)    # full cell height
	longitude_delta = 360.0/(1<<lon_length)    # full cell width
	latitude = _int_to_float_hex(lat, lat_length) * 90.0      # south border
	longitude = _int_to_float_hex(lon, lon_length) * 180.0    # west border
	return {"s":latitude, "w":longitude, "n":latitude+latitude_delta, "e":longitude+longitude_delta}

def geohash_neighbors(hashcode):
	"""List the geohashes of the cells surrounding ``hashcode``.

	Order of the result: west and east of the cell, then the three cells of
	the row above (higher latitude index), then the three cells of the row
	below. A row is left out when it would fall outside the latitude range.
	"""
	lat, lon, lat_length, lon_length = _decode_c2i(hashcode)
	
	def cell(row, column):
		return _encode_i2c(row, column, lat_length, lon_length)
	
	# same row; empty codes (produced for an empty hashcode) are dropped
	found = [code for code in (cell(lat, lon-1), cell(lat, lon+1)) if code]
	
	upper = lat+1
	if (upper >> lat_length) == 0:
		found.extend(cell(upper, column) for column in (lon-1, lon, lon+1))
	
	lower = lat-1
	if lower >= 0:
		found.extend(cell(lower, column) for column in (lon-1, lon, lon+1))
	
	return found

def geohash_expand(hashcode):
	"""Return the neighbors of ``hashcode`` followed by ``hashcode`` itself."""
	return geohash_neighbors(hashcode) + [hashcode]

def _uint64_interleave(lat32, lon32):
	intr = 0
	boost = (0,1,4,5,16,17,20,21,64,65,68,69,80,81,84,85)
	for i in range(8):
		intr = (intr<<8) + (boost[(lon32>>(28-i*4))%16]<<1) + boost[(lat32>>(28-i*4))%16]
	
	return intr

def _uint64_deinterleave(ui64):
	lat = lon = 0
	boost = ((0,0),(0,1),(1,0),(1,1),(0,2),(0,3),(1,2),(1,3),
			(2,0),(2,1),(3,0),(3,1),(2,2),(2,3),(3,2),(3,3))
	for i in range(16):
		p = boost[(ui64>>(60-i*4))%16]
		lon = (lon<<2) + p[0]
		lat = (lat<<2) + p[1]
	
	return (lat, lon)

def encode_uint64(latitude, longitude):
	"""Encode a coordinate as a 64-bit interleaved integer geohash.

	Args:
		latitude: Degrees in [-90.0, 90.0).
		longitude: Degrees; any finite value is accepted and wrapped into
			[-180.0, 180.0).

	Returns:
		The integer produced by interleaving the 32-bit latitude and
		longitude cell indices.

	Raises:
		ValueError: If either coordinate is NaN or infinite, or if the
			latitude is outside [-90.0, 90.0).
	"""
	# Reject NaN/inf up front: an infinite longitude used to make the
	# normalisation loop below spin forever.
	if not (math.isfinite(latitude) and math.isfinite(longitude)):
		raise ValueError("latitude and longitude must be finite numbers")
	if latitude >= 90.0 or latitude < -90.0:
		raise ValueError("Latitude must be in the range of (-90.0, 90.0)")
	if not -180.0 <= longitude < 180.0:
		# fmod is exact and terminates for any magnitude, unlike repeatedly
		# adding/subtracting 360.0 (which stalls once 360 is below one ulp).
		longitude = math.fmod(longitude, 360.0)
		if longitude >= 180.0:
			longitude -= 360.0
		elif longitude < -180.0:
			longitude += 360.0
	
	# Scale each coordinate to a 32-bit cell index
	lat = int(((latitude + 90.0)/180.0)*(1<<32))
	lon = int(((longitude+180.0)/360.0)*(1<<32))
	return _uint64_interleave(lat, lon)

def decode_uint64(ui64):
	"""Decode a 64-bit interleaved geohash into ``(latitude, longitude)`` degrees.

	The returned coordinate is the one obtained by scaling the 32-bit cell
	indices back to degrees (no half-cell offset is added).
	"""
	lat_index, lon_index = _uint64_deinterleave(ui64)
	full_scale = 1<<32
	latitude = 180.0*lat_index/full_scale - 90.0
	longitude = 360.0*lon_index/full_scale - 180.0
	return (latitude, longitude)

def expand_uint64(ui64, precision=50):
	"""Compute integer ranges covering a cell and the cells around it.

	``ui64`` is first truncated to its top ``precision`` bits, which
	identify the cell. Depending on which half of the enclosing block the
	cell sits in (the ``lat_grid`` / ``lon_grid`` bit of each coordinate)
	and on whether ``precision`` is even or odd, a set of ranges is
	collected for the enclosing block and for the adjacent cells.

	Returns a sorted list of ``(start, end)`` tuples. Ranges whose end
	equals the next range's start are merged. ``start`` is replaced by
	``None`` when it is 0 and ``end`` by ``None`` when it is 2**64, i.e.
	when that bound imposes no restriction. ``end`` is computed as
	start + block size, so it looks exclusive (TODO confirm with callers).
	Returns ``[]`` when ``precision <= 2``.
	"""
	# keep only the top `precision` bits
	ui64 = ui64 & (0xFFFFFFFFFFFFFFFF << (64-precision))
	lat,lon = _uint64_deinterleave(ui64)
	# size of one cell step in each 32-bit coordinate
	lat_grid = 1<<(32-int(precision/2))
	lon_grid = lat_grid>>(precision%2)
	
	if precision<=2: # the expansion would cover the whole range
		return []
	
	ranges = []
	if lat & lat_grid:
		if lon & lon_grid:
			ui64 = _uint64_interleave(lat-lat_grid, lon-lon_grid)
			ranges.append((ui64, ui64 + (1<<(64-precision+2))))
			if precision%2==0:
				# lat,lon = (1, 1) and even precision
				ui64 = _uint64_interleave(lat-lat_grid, lon+lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision+1))))
				
				if lat + lat_grid < 0xFFFFFFFF:
					ui64 = _uint64_interleave(lat+lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat+lat_grid, lon)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat+lat_grid, lon+lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
			else:
				# lat,lon = (1, 1) and odd precision
				if lat + lat_grid < 0xFFFFFFFF:
					ui64 = _uint64_interleave(lat+lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision+1))))
					
					ui64 = _uint64_interleave(lat+lat_grid, lon+lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
				
				ui64 = _uint64_interleave(lat, lon+lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
				ui64 = _uint64_interleave(lat-lat_grid, lon+lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
		else:
			ui64 = _uint64_interleave(lat-lat_grid, lon)
			ranges.append((ui64, ui64 + (1<<(64-precision+2))))
			if precision%2==0:
				# lat,lon = (1, 0) and even precision
				ui64 = _uint64_interleave(lat-lat_grid, lon-lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision+1))))
				
				if lat + lat_grid < 0xFFFFFFFF:
					ui64 = _uint64_interleave(lat+lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat+lat_grid, lon)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat+lat_grid, lon+lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
			else:
				# lat,lon = (1, 0) and odd precision
				if lat + lat_grid < 0xFFFFFFFF:
					ui64 = _uint64_interleave(lat+lat_grid, lon)
					ranges.append((ui64, ui64 + (1<<(64-precision+1))))
					
					ui64 = _uint64_interleave(lat+lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
				ui64 = _uint64_interleave(lat, lon-lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
				ui64 = _uint64_interleave(lat-lat_grid, lon-lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
	else:
		if lon & lon_grid:
			ui64 = _uint64_interleave(lat, lon-lon_grid)
			ranges.append((ui64, ui64 + (1<<(64-precision+2))))
			if precision%2==0:
				# lat,lon = (0, 1) and even precision
				ui64 = _uint64_interleave(lat, lon+lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision+1))))
				
				if lat > 0:
					ui64 = _uint64_interleave(lat-lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat-lat_grid, lon)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat-lat_grid, lon+lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
			else:
				# lat,lon = (0, 1) and odd precision
				if lat > 0:
					ui64 = _uint64_interleave(lat-lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision+1))))
					
					ui64 = _uint64_interleave(lat-lat_grid, lon+lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
				ui64 = _uint64_interleave(lat, lon+lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
				ui64 = _uint64_interleave(lat+lat_grid, lon+lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
		else:
			ui64 = _uint64_interleave(lat, lon)
			ranges.append((ui64, ui64 + (1<<(64-precision+2))))
			if precision%2==0:
				# lat,lon = (0, 0) and even precision
				ui64 = _uint64_interleave(lat, lon-lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision+1))))
				
				if lat > 0:
					ui64 = _uint64_interleave(lat-lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat-lat_grid, lon)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
					ui64 = _uint64_interleave(lat-lat_grid, lon+lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
			else:
				# lat,lon = (0, 0) and odd precision
				if lat > 0:
					ui64 = _uint64_interleave(lat-lat_grid, lon)
					ranges.append((ui64, ui64 + (1<<(64-precision+1))))
					
					ui64 = _uint64_interleave(lat-lat_grid, lon-lon_grid)
					ranges.append((ui64, ui64 + (1<<(64-precision))))
				ui64 = _uint64_interleave(lat, lon-lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
				ui64 = _uint64_interleave(lat+lat_grid, lon-lon_grid)
				ranges.append((ui64, ui64 + (1<<(64-precision))))
	
	ranges.sort()
	
	# merge ranges that touch (previous end == next start)
	shrink = []
	prev = None
	for i in ranges:
		if prev:
			if prev[1] != i[0]:
				shrink.append(prev)
				prev = i
			else:
				prev = (prev[0], i[1])
		else:
			prev = i
	
	shrink.append(prev)
	
	ranges = []
	for i in shrink:
		a,b=i
		if a == 0:
			a = None # we can remove the condition because it is the lowest value
		if b == 0x10000000000000000:
			b = None # we can remove the condition because it is the highest value
		
		ranges.append((a,b))
	
	return ranges

## polygon geohasher Python module

In [14]:
def geohash_to_polygon(geo):
    """
    Build the rectangle covered by a geohash cell.

    :param geo: String that represents the geohash.
    :return: Shapely Polygon whose vertices are (longitude, latitude) pairs,
        starting at the south-west corner and closing back on it.
    """
    lat_centroid, lng_centroid, lat_offset, lng_offset = geohash_decode_exactly(geo)

    south = lat_centroid - lat_offset
    north = lat_centroid + lat_offset
    west = lng_centroid - lng_offset
    east = lng_centroid + lng_offset

    ring = [
        (west, south),
        (east, south),
        (east, north),
        (west, north),
        (west, south),
    ]
    return Polygon(ring)


In [15]:
def polygon_to_geohashes(polygon: Polygon, precision: int) -> List[str]:
    """List the geohash cells of a given precision that intersect a polygon.

    Starting from the cell holding the polygon's centroid, the search spreads
    to neighboring cells for as long as they intersect the polygon's envelope
    (bounding box). Every visited cell that intersects the polygon itself is
    part of the result.

    Args:
        polygon: Shapely geometry to cover. ``None`` or an empty geometry
            yields an empty list.
        precision: Number of characters of the geohashes to produce.

    Returns:
        The intersecting geohashes as a sorted list (matching the declared
        ``List[str]`` return type and giving a deterministic order).
    """
    # An empty geometry has no centroid coordinates to start from
    if polygon is None or polygon.is_empty:
        return []

    envelope = polygon.envelope
    centroid = polygon.centroid

    start = geohash_encode(centroid.y, centroid.x, precision)
    covering = set()
    # Everything ever scheduled, so that no cell is decoded or tested twice,
    # including cells that turn out to lie outside the envelope.
    scheduled = {start}
    pending = [start]

    while pending:
        current_geohash = pending.pop()
        current_polygon = geohash_to_polygon(current_geohash)

        # Cells outside the bounding box end the search in that direction
        if not envelope.intersects(current_polygon):
            continue

        if polygon.intersects(current_polygon):
            covering.add(current_geohash)

        for neighbor in geohash_neighbors(current_geohash):
            if neighbor not in scheduled:
                scheduled.add(neighbor)
                pending.append(neighbor)

    return sorted(covering)

## Remainder of geocovering code

In [16]:
def point_to_geohash(latitude: float, longitude: float, resolution: int) -> str:
    """Geohash of a point, with ``resolution`` as the number of characters."""
    geohash = geohash_encode(latitude, longitude, resolution)
    return geohash

## Testing

In [19]:
# Load both inputs: polygons get their covering geohashes, points get their
# own geohash. Both use the same precision so the codes are comparable.
polygons = load_polygons_with_covering_geocodes(
    polygons_geojson=POLYGON_PATH,
    polygon_to_geocodes=polygon_to_geohashes,
    geocode_resolution=6,  # Geohash precision
)
points = load_points_with_geocode(
    points_csv=POINTS_PATH,
    point_to_geocode=point_to_geohash,
    geocode_resolution=6,  # Geohash precision
)

In [20]:
# Build the geohash -> polygon-label index used by the filter step of the join
geocode_lookup = create_geocode_lookup(polygons=polygons)

In [21]:
# Run the filter-and-refine spatial join driven by the geohash lookup
geohash_results = spatial_join_with_geocodes(
    polygons=polygons,
    geocode_lookup=geocode_lookup,
    points=points,
)

In [22]:
# Display the joined frame: the input columns plus geocode, potential_matches
# and polygon_index.
geohash_results

Unnamed: 0,SensorID,time,latitude,longitude,bin0,bin1,bin2,bin3,bin4,bin5,...,bin21,bin22,bin23,temperature,humidity,pm25,geometry,geocode,potential_matches,polygon_index
0,NYCP2_CS01A,1631277304,40.847672,-73.869316,11,1,1,0,0,0,...,0,0,0,23.7,57.3,4.508813,POINT (-73.86932 40.84767),dr72rh,"[38, 39, 193, 292]",38
1,NYCP2_CS01A,1631277308,40.847668,-73.869316,22,4,1,0,0,2,...,0,0,0,23.7,57.8,5.462420,POINT (-73.86932 40.84767),dr72rh,"[38, 39, 193, 292]",38
2,NYCP2_CS01A,1631277313,40.847649,-73.869362,40,1,1,0,0,1,...,0,0,0,23.7,57.8,5.154881,POINT (-73.86936 40.84765),dr72rh,"[38, 39, 193, 292]",38
3,NYCP2_CS01A,1631277318,40.847649,-73.869362,26,1,0,0,0,0,...,0,0,0,23.6,57.6,4.508813,POINT (-73.86936 40.84765),dr72rh,"[38, 39, 193, 292]",38
4,NYCP2_CS01A,1631277323,40.847649,-73.869362,44,4,0,1,0,0,...,0,0,0,23.6,57.5,5.539503,POINT (-73.86936 40.84765),dr72rh,"[38, 39, 193, 292]",38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169994,NYCP2_CS03A,1631457109,40.823353,-73.890488,115,11,2,0,1,0,...,0,0,0,24.6,54.8,5.460360,POINT (-73.89049 40.82335),dr72nx,"[58, 140, 176, 266]",176
169995,NYCP2_CS03A,1631457114,40.823349,-73.890480,132,8,2,0,0,0,...,0,0,0,24.6,54.8,5.298209,POINT (-73.89048 40.82335),dr72nx,"[58, 140, 176, 266]",176
169996,NYCP2_CS03A,1631457119,40.823349,-73.890480,147,14,0,0,0,0,...,0,0,0,24.6,54.8,6.470661,POINT (-73.89048 40.82335),dr72nx,"[58, 140, 176, 266]",176
169997,NYCP2_CS03A,1631457124,40.823345,-73.890488,121,8,2,0,1,1,...,0,0,0,24.6,54.6,6.424142,POINT (-73.89049 40.82335),dr72nx,"[58, 140, 176, 266]",176


# Comparison with traditional spatial join

In [23]:
def spatial_join_traditional(
    polygons: gpd.GeoDataFrame,
    points: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """Brute-force point-in-polygon join used as the reference result.

    Every point is tested against the polygons in row order until one
    contains it.

    Args:
        polygons: Frame whose ``geometry`` column holds the polygons.
        points: Frame whose ``geometry`` column holds the points. It is not
            modified; a copy is returned.

    Returns:
        A copy of ``points`` with a ``polygon_index`` column holding the index
        label of the first polygon the point lies within, or -1 if none does.
    """
    # Work on a copy so the caller's frame is left untouched
    points = points.copy()

    # Pair labels with geometries once. Calling polygons.iterrows() inside the
    # per-point search would rebuild a row Series for every polygon, for
    # every point.
    labelled_polygons = list(zip(polygons.index, polygons['geometry']))

    def find_containment(point_geom: Any) -> int:
        for index, polygon in labelled_polygons:
            if point_geom.within(polygon):
                return index
        return -1

    # Only the geometry column is needed, so iterate it directly instead of
    # a row-wise DataFrame.apply.
    points['polygon_index'] = [
        find_containment(point_geom) for point_geom in points['geometry']
    ]

    return points

In [24]:
# Separate, un-annotated copies of both inputs for the brute-force baseline
points_baseline = load_points(points_csv=POINTS_PATH)
polygons_baseline = load_polygons(polygons_geojson=POLYGON_PATH)

In [25]:
# Reference result from the brute-force join defined above.
# Alternative kept for reference (not executed):
#   gpd.sjoin(points_baseline, polygons_baseline, how="inner", predicate='within')
#   with its 'index_right' column copied into 'polygon_index'.

baseline_results = spatial_join_traditional(polygons_baseline, points_baseline)

In [26]:
def evaluate_results(test_results: gpd.GeoDataFrame, baseline_results: gpd.GeoDataFrame, method_name):
    """Print how often a join result agrees with the baseline.

    Both frames are reduced to their ``polygon_index`` column and
    inner-joined on the index, so only points present in both are compared.
    The percentage of rows with equal polygon indices is printed, labelled
    with ``method_name``. Nothing is returned.
    """
    tested = test_results[['polygon_index']].copy()
    reference = baseline_results[['polygon_index']].copy()

    # Line the two assignments up side by side on the point index
    side_by_side = tested.join(reference, rsuffix='_baseline', how='inner')

    # Fraction of points assigned to the same polygon by both methods
    agreement = side_by_side['polygon_index'] == side_by_side['polygon_index_baseline']
    match_rate = agreement.mean()

    print(f"Match Rate for {method_name}: {match_rate * 100}%")

In [27]:
# Compare the geohash-driven join against the brute-force baseline
evaluate_results(
    test_results=geohash_results,
    baseline_results=baseline_results,
    method_name="Geohash with precision 6",
)

Match Rate for Geohash with precision 6: 99.99882349480868%
