In [1]:
from numba import jit
import rasterio
from pathlib import Path
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [4]:
def h3_index(gdf):
    import h3pandas
    h3=gdf.h3.geo_to_h3_aggregate(12, operation='mean')
    return h3

def process_file(file):
    with rasterio.open(file) as src:
        array = src.read(1)  # Read the first band
        transform = src.transform
        crs = src.crs  # Capture the CRS directly from the source

    # Generate full grid of coordinates
    cols, rows = array.shape
    col_indices, row_indices = np.meshgrid(np.arange(cols), np.arange(rows), indexing='xy')
    x_coords, y_coords = rasterio.transform.xy(transform, row_indices.flatten(), col_indices.flatten(), offset='center')
    
    # Flatten the array data to match the flat coordinate arrays
    values_flat = array.flatten()
    points = np.column_stack((x_coords, y_coords, values_flat))

    # Create GeoDataFrame
    geometry = [Point(x, y) for x, y, _ in points]
    df = gpd.GeoDataFrame(points, columns=['X', 'Y', 'Value'], geometry=geometry)
    df=df.drop(columns=['X','Y']).set_crs(2193)
    df=df.to_crs(4326)

    # H3 
    h3 = h3_index(df)
    h3.to_parquet(f'{output_dir}/{input_dir}_{file.stem}.parquet')

In [5]:
%%time
input_dir = Path('continuous')
output_dir = Path('combined_h3')
r_files = list(sorted(input_dir.glob('*.asc')))

# Using parallel processing Continuous
Parallel(n_jobs=-1)(delayed(process_file)(file) for file in r_files)
print('Done continuous!')

Done continuous!
CPU times: total: 2.28 s
Wall time: 1min 10s


In [6]:
%%time
input_dir = Path('discrete')
r_files = list(sorted(input_dir.glob('*.asc')))

# Using parallel processing Discrete
Parallel(n_jobs=-1)(delayed(process_file)(file) for file in r_files)
print('Done discrete!')

Done discrete!
CPU times: total: 2.3 s
Wall time: 1min 9s
