In [1]:
from itertools import product
import numpy as np
from pathlib import Path

from dask import delayed
import dask.dataframe as dd
import geopandas as gpd
import h3pandas
import pandas as pd

In [2]:
input_dir = Path('../data/output')
output_dir = Path('../data/output-joining')

d_output = output_dir / 'raster_combined-discrete.gpkg'
c_output = output_dir / 'raster_combined-continuous.gpkg'

d_files = list(input_dir.glob('discrete*.parquet'))
c_files = list(input_dir.glob('continuous*.parquet'))

In [3]:
@np.vectorize
def is_prime(n):
    if np.isnan(n):
        return False
    if n % 2 == 0 and n > 2:
        return False
    return all(n % i for i in range(3, int(np.sqrt(n)) + 1, 2))

@np.vectorize
def is_polygonal(s, x):
    if np.isnan(x):
        return 0
    assert s > 2 and s % 1 == 0 and x % 1 == 0
    n = (np.sqrt(8 * (s - 2) * x + (s - 4) ** 2) + (s - 4)) / (2 * (s - 2))
    return n % 1 == 0

@np.vectorize
def is_fibonacci(n):
    a, b = 0,1
    while a < n:
        a, b = b, a + b
    return a == n

@np.vectorize
def is_perfect(n):
    sum = 1
    i = 2
    while i * i <= n:
        if n % i == 0:
            sum = sum + i + n/i
        i += 1
    return sum == n and n != 1

def is_triangular(n):
    return is_polygonal(3, n)

def is_rectangular(n):
    return is_polygonal(4, n)

def is_pentagonal(n):
    return is_polygonal(5, n)

def is_hexagonal(n):
    return is_polygonal(6, n)

funcs = [
    is_prime,
    is_triangular, is_rectangular, is_pentagonal, is_hexagonal,
    is_fibonacci,
    is_perfect
]


In [8]:
N = 10 # None to do all
# TODO join performance with larger N is poor, I think there's a better way to do this

In [16]:
%%time
def read_and_label(filename):
    df = pd.read_parquet(filename)
    df = df.rename(columns={"value": f"value-{filename.stem.split('-')[-1]}"})
    return df

def join_and_classify(files, scale=1, meta_type='int32'):
    dfs = [delayed(read_and_label)(fname) for fname in files[:N]]
    ddf = dd.from_delayed(dfs[0].join(dfs[1:]))
    # ddf = dd.concat(dd.from_delayed(dfs).compute(), axis='index')
    
    out_df = pd.DataFrame(index=ddf.index)
    func_col_combos = list(product(funcs, ddf.columns))
    for func, col in func_col_combos:
        out_df[f'{col}.{func.__name__}'] = (ddf[col]*scale).apply(np.floor, meta=(col, meta_type)).apply(func, meta=(col, 'bool')).astype(int).compute()
    
    sum_cols = [f'{col}.{func.__name__}' for func, col in func_col_combos]
    out_df['satisfaction'] = out_df[sum_cols].sum(axis=1)
    return out_df.drop(columns=sum_cols)

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 13.1 µs


In [17]:
%%time
out_df = join_and_classify(d_files)
h3_df = out_df.h3.h3_to_geo_boundary()
h3_df.to_file(d_output, driver='GPKG', mode='w')

CPU times: user 9.75 s, sys: 1.01 s, total: 10.8 s
Wall time: 9.44 s


In [19]:
%%time
out_df = join_and_classify(c_files)
h3_df = out_df.h3.h3_to_geo_boundary()
h3_df.to_file(c_output, driver='GPKG', mode='w', scale=100, meta_type='float32')

CPU times: user 9.89 s, sys: 1.01 s, total: 10.9 s
Wall time: 9.52 s
