In [1]:
from numba import jit
import rasterio
from pathlib import Path
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed

In [2]:
# Directory layout, relative to the notebook's working directory.
output_dir = Path('../data/output')
# Create the output directory itself (and any missing ancestors). Calling
# mkdir on `output_dir.parents[0]` would only create '../data', leaving
# '../data/output' missing when results are written into it.
output_dir.mkdir(parents=True, exist_ok=True)
discrete_dir = Path('../data/discrete')
continuous_dir = Path('../data/continuous')

# Upper bound on the number of input files to process; slicing with `[:None]`
# keeps every file.
N = None # None to do everything

In [3]:
def h3index_raster(file, output_dir, h3_res=12, stem=None, operation='max', crs='EPSG:2193'):
    """Aggregate band 1 of a raster onto an H3 grid and write the result as Parquet.

    Every pixel is turned into a point at its pixel centre, reprojected from
    ``crs`` to EPSG:4326, assigned to an H3 cell at resolution ``h3_res`` and
    aggregated per cell with ``operation``.

    Parameters
    ----------
    file : str or pathlib.Path
        Raster to read with ``rasterio.open``.
    output_dir : str or pathlib.Path
        Directory the Parquet file is written to. It is not created here.
    h3_res : int, default 12
        H3 resolution passed to ``geo_to_h3_aggregate``.
    stem : str, optional
        Prefix for the output file name. The file is written as
        ``{output_dir}/{stem}_{file.stem}.parquet``; when ``stem`` is None the
        prefix (and its underscore) is omitted.
    operation : str or callable, default 'max'
        Aggregation forwarded to ``geo_to_h3_aggregate``.
    crs : default 'EPSG:2193'
        CRS assigned to the pixel-centre coordinates before reprojection.
        NOTE(review): the CRS stored in the raster (``src.crs``) is not
        consulted; confirm the default matches the input data.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # Imported inside the function on purpose, see
    # https://github.com/DahnJ/H3-Pandas/issues/27
    # The import provides the `.h3` accessor used below.
    import h3pandas  # noqa: F401

    file = Path(file)
    with rasterio.open(file) as src:
        array = src.read(1)
        transform = src.transform

    # NaN cells are aggregated as 0.
    # NOTE(review): the dataset's declared nodata value is not masked here,
    # so such cells take part in the aggregation with their raw value - confirm
    # this is intended.
    array[np.isnan(array)] = 0

    # Row/column index of every pixel, flattened in the same row-major order
    # as `array.ravel()` below so values and coordinates stay aligned.
    height, width = array.shape
    cols, rows = np.meshgrid(np.arange(width), np.arange(height), indexing='xy')
    xs, ys = rasterio.transform.xy(transform, rows.flatten(), cols.flatten(), offset='center')

    # Build all point geometries in one vectorised call instead of creating a
    # shapely Point per pixel in a Python loop. The 'value' column keeps the
    # dtype of the raster band.
    df = gpd.GeoDataFrame(
        {'value': array.ravel()},
        geometry=gpd.points_from_xy(xs, ys),
        crs=crs,
    )
    df = df.to_crs(4326)
    df = df.h3.geo_to_h3_aggregate(h3_res, operation=operation, return_geometry=False)

    prefix = f'{stem}_' if stem is not None else ''
    # `index` is a boolean switch, not a column name: True stores the frame's
    # index (the H3 cell ids produced by the aggregation) in the file.
    df.to_parquet(f'{output_dir}/{prefix}{file.stem}.parquet', index=True)

In [4]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 10
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: total: 15.6 ms
Wall time: 2.15 s


In [5]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 100
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: total: 219 ms
Wall time: 6.08 s


In [6]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 1000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: total: 2.78 s
Wall time: 53.2 s


In [7]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 10000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='discrete', operation=mode) for file in list(sorted(discrete_dir.glob('*.asc')))[:N])
pass

CPU times: total: 25.2 s
Wall time: 7min 56s


In [8]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 10
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: total: 0 ns
Wall time: 414 ms


In [9]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 100
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: total: 188 ms
Wall time: 2.45 s


In [10]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 1000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: total: 1.53 s
Wall time: 21.5 s


In [11]:
%%time
mode = lambda x: pd.Series.mode(x)[0]
N= 10000
Parallel(n_jobs=-1)(delayed(h3index_raster)(file, output_dir, stem='continuous', operation='mean') for file in list(sorted(continuous_dir.glob('*.asc')))[:N])
pass

CPU times: total: 19 s
Wall time: 3min 49s
