In [1]:
from pathlib import Path
import numpy as np
import geopandas as gpd
import h3pandas
from joblib import Parallel, delayed
import pandas as pd
import rasterio
from shapely.geometry import Point


In [9]:
output_dir = Path('../data/output')
output_dir.parents[0].mkdir(parents=True, exist_ok=True)
discrete_dir = Path('../data/discrete')
continuous_dir = Path('../data/continuous')

N = 500 # None to do everything

In [14]:
def h3index_raster(file, output_dir, h3_res=12, stem=None, operation='max'):
    import h3pandas # https://github.com/DahnJ/H3-Pandas/issues/27
    with rasterio.open(file) as src:
        array = src.read(1)
        transform = src.transform
    array[np.isnan(array)] = 0
    height, width = array.shape
    cols, rows = np.meshgrid(np.arange(width), np.arange(height), indexing='xy')
    xs, ys = rasterio.transform.xy(transform, rows.flatten(), cols.flatten(), offset='center')
    dstack = np.dstack((xs, ys))[0]
    pts = [Point(*c) for c in dstack]
    df = gpd.GeoDataFrame(zip(array.flatten(), pts), columns=['value','geometry'], crs='EPSG:2193')
    df = df.to_crs(4326)
    df = df.h3.geo_to_h3_aggregate(h3_res, operation=operation, return_geometry=False)
    df.to_parquet(f'{output_dir}/{stem or ""}_{file.stem}.parquet', index=f'h3_{h3_res}')

mode = lambda x: pd.Series.mode(x)[0]

In [15]:
%%time

N = 10
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: user 168 ms, sys: 12.7 ms, total: 180 ms
Wall time: 1.62 s


In [16]:
%%time
N = 100
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: user 352 ms, sys: 8.31 ms, total: 360 ms
Wall time: 12.2 s


In [18]:
%%time
N = 1000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: user 2.67 s, sys: 242 ms, total: 2.91 s
Wall time: 1min 59s


In [7]:
%%time
N = 10000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: total: 24.2 s
Wall time: 7min 29s


In [8]:
%%time
N=10
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: total: 15.6 ms
Wall time: 401 ms


In [17]:
%%time
N=100
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: user 313 ms, sys: 20.5 ms, total: 334 ms
Wall time: 4.82 s


In [19]:
%%time
N=1000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: user 2.04 s, sys: 76.8 ms, total: 2.12 s
Wall time: 50.7 s


In [11]:
%%time
N=10000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: total: 7.91 s
Wall time: 3min 27s
