In [1]:
from pathlib import Path
import numpy as np

import geopandas as gpd
import h3pandas
from joblib import Parallel, delayed
import pandas as pd
import rasterio
from shapely.geometry import Point


In [2]:
# Directory the per-raster parquet files are written to. Create the directory
# itself (not just its parent) so the writes cannot fail on a fresh checkout.
output_dir = Path('../data/output')
output_dir.mkdir(parents=True, exist_ok=True)

# Input directories holding the source rasters, one per kind of data.
discrete_dir = Path('../data/discrete')
continuous_dir = Path('../data/continuous')

# Upper bound on how many rasters each run processes (applied as a slice limit).
N = None # None to do everything

In [3]:
def h3index_raster(file, output_dir, h3_res=12, stem=None, operation='max', src_crs='EPSG:2193'):
    """Index band 1 of a raster by H3 cell and write the aggregate to parquet.

    Every pixel becomes a point at its pixel centre, the points are
    reprojected from ``src_crs`` to EPSG:4326, and the pixel values are
    aggregated per H3 cell with ``operation``. The result is written to
    ``<output_dir>/<stem>_<file stem>.parquet`` with the H3 index preserved.

    Parameters
    ----------
    file : str or pathlib.Path
        Raster to read (opened with ``rasterio.open``).
    output_dir : str or pathlib.Path
        Directory the parquet file is written to.
    h3_res : int, default 12
        H3 resolution passed to ``geo_to_h3_aggregate``.
    stem : str, optional
        Prefix for the output file name; an empty prefix is used when falsy.
    operation : str, dict or callable, default 'max'
        Aggregation forwarded to ``geo_to_h3_aggregate``.
    src_crs : default 'EPSG:2193'
        CRS assigned to the pixel coordinates before reprojection.

    Returns
    -------
    None
    """
    # Imported here as well as at module level so the `.h3` accessor is
    # registered inside joblib workers.
    import h3pandas # https://github.com/DahnJ/H3-Pandas/issues/27

    file = Path(file)  # accept plain strings as well as Path objects
    with rasterio.open(file) as src:
        array = src.read(1)
        transform = src.transform
    # NaN pixels are treated as 0.
    # NOTE(review): the raster's declared nodata value is not masked here -- confirm
    # whether it should be excluded before aggregating.
    array[np.isnan(array)] = 0

    # Row/column index of every pixel, flattened in the same (C) order as the values.
    rows, cols = np.indices(array.shape)
    xs, ys = rasterio.transform.xy(transform, rows.ravel(), cols.ravel(), offset='center')

    df = gpd.GeoDataFrame(
        # Python scalars, so pandas infers the column dtype the same way it
        # would from per-pixel values handed over one at a time.
        {'value': array.ravel().tolist()},
        # Vectorised point construction instead of one shapely Point per pixel.
        geometry=gpd.points_from_xy(xs, ys),
        crs=src_crs,
    )
    df = df.to_crs(4326)
    df = df.h3.geo_to_h3_aggregate(h3_res, operation=operation, return_geometry=False)

    # Double-quoted f-string: reusing the outer quote character inside a
    # replacement field is a SyntaxError before Python 3.12.
    out_path = Path(output_dir) / f"{stem or ''}_{file.stem}.parquet"
    # `index` is a boolean flag; True keeps the H3 index in the written file.
    df.to_parquet(out_path, index=True)

In [4]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: user 30.9 s, sys: 2.02 s, total: 32.9 s
Wall time: 16min 40s


In [5]:
%%time
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: user 21.8 s, sys: 1.28 s, total: 23 s
Wall time: 7min 43s
