In [1]:
from itertools import product
import numpy as np
from pathlib import Path

from dask import delayed
import dask.dataframe as dd
import geopandas as gpd
import h3pandas
import pandas as pd

In [2]:
# Where the per-layer parquet files are read from and where joined layers are written.
input_dir = Path('../data/output')
output_dir = Path('../data/output-joining')

# Create the output directory up front: the GeoPackage writes happen at the very
# end of the long-running cells, so a missing folder would otherwise only be
# discovered after the whole computation has finished.
output_dir.mkdir(parents=True, exist_ok=True)

d_output = output_dir / 'raster_combined-discrete.gpkg'
c_output = output_dir / 'raster_combined-continuous.gpkg'

# Path.glob yields entries in arbitrary, filesystem-dependent order. Sorting
# makes the `[:N]` slices used further down select the same files on every run.
d_files = sorted(input_dir.glob('discrete*.parquet'))
c_files = sorted(input_dir.glob('continuous*.parquet'))

# Maximum number of files of each kind to process; lower it for a quick trial run.
N = 10000

In [3]:
@np.vectorize
def is_prime(n):
    """Return True when ``n`` is a prime number.

    Values below 2 (including 0, 1 and negatives) and non-integral values are
    reported as not prime. The check uses trial division by odd numbers up to
    ``sqrt(n)``.

    Wrapped in ``np.vectorize``, so ``n`` may be a scalar or an array-like and
    the result is a NumPy boolean array of matching shape.
    """
    # Primes start at 2. This also keeps negatives away from np.sqrt, and the
    # ``% 1`` test rejects fractional values and NaN.
    if n < 2 or n % 1 != 0:
        return False
    if n % 2 == 0:
        # 2 is the only even prime.
        return n == 2
    return all(n % i for i in range(3, int(np.sqrt(n)) + 1, 2))

@np.vectorize
def is_polygonal(s, x):
    """Return whether ``x`` is an ``s``-gonal (polygonal) number.

    Solves the s-gonal number formula for its index and reports whether that
    index is a whole number. ``s`` must be an integer greater than 2 and ``x``
    must be integral; otherwise the assertion fails.

    Wrapped in ``np.vectorize``, so both arguments broadcast over arrays.
    """
    assert s > 2 and s % 1 == 0 and x % 1 == 0
    offset = s - 4
    # Discriminant of the quadratic obtained by inverting the s-gonal formula.
    discriminant = 8 * (s - 2) * x + offset ** 2
    index = (np.sqrt(discriminant) + offset) / (2 * (s - 2))
    return index % 1 == 0

@np.vectorize
def is_fibonacci(n):
    """Return whether ``n`` appears in the Fibonacci sequence 0, 1, 1, 2, 3, 5, ...

    Walks the sequence from 0 until it reaches or passes ``n`` and compares the
    term it stopped on with ``n``.

    Wrapped in ``np.vectorize``, so ``n`` may be a scalar or an array-like.
    """
    current, upcoming = 0, 1
    while True:
        if not (current < n):
            # Stopped at the first term that is not below n.
            return current == n
        current, upcoming = upcoming, current + upcoming

@np.vectorize
def is_perfect(n):
    """Return whether ``n`` is a perfect number.

    A perfect number equals the sum of its proper divisors (6 = 1 + 2 + 3).
    Divisors are collected in pairs ``(i, n // i)`` for ``i`` up to ``sqrt(n)``.
    Values below 2 are never perfect.

    Wrapped in ``np.vectorize``, so ``n`` may be a scalar or an array-like.
    """
    if n < 2:
        return False
    divisor_sum = 1  # 1 divides every n; n itself is excluded
    i = 2
    while i * i <= n:
        if n % i == 0:
            divisor_sum += i
            partner = n // i
            # For perfect squares i and n // i coincide; count that divisor once.
            if partner != i:
                divisor_sum += partner
        i += 1
    return divisor_sum == n

def is_triangular(n):
    """Return whether ``n`` is a triangular number, via ``is_polygonal`` with 3 sides."""
    sides = 3
    return is_polygonal(sides, n)

def is_rectangular(n):
    """Return whether ``n`` is a 4-gonal number, via ``is_polygonal`` with 4 sides.

    With ``s=4`` the index computed by ``is_polygonal`` reduces to ``sqrt(n)``,
    so despite the name this tests for perfect squares.
    """
    sides = 4
    return is_polygonal(sides, n)

def is_pentagonal(n):
    """Return whether ``n`` is a pentagonal number, via ``is_polygonal`` with 5 sides."""
    sides = 5
    return is_polygonal(sides, n)

def is_hexagonal(n):
    """Return whether ``n`` is a hexagonal number, via ``is_polygonal`` with 6 sides."""
    sides = 6
    return is_polygonal(sides, n)

# Number-property predicates evaluated for every value by read_and_label;
# each predicate that holds contributes 1 to that row's score.
funcs = [
    is_prime,
    is_triangular,
    is_rectangular,
    is_pentagonal,
    is_hexagonal,
    is_fibonacci,
    is_perfect,
]


In [4]:
@delayed
def read_and_label(filename, label, meta_type=np.int32, scale=1):
    """Load one parquet file and replace its values with a per-row predicate count.

    The file's ``value`` column is renamed to ``label`` and cast to
    ``meta_type``. The values are then multiplied by ``scale``, floored, and
    missing entries set to 0. Finally the column is overwritten with the number
    of predicates in ``funcs`` that hold for each floored value.

    Parameters
    ----------
    filename : path-like
        Parquet file containing a ``value`` column.
    label : hashable
        New name for the ``value`` column in the returned frame.
    meta_type : dtype, default ``np.int32``
        dtype the raw values are cast to before scaling.
    scale : number, default 1
        Multiplier applied before flooring (for example 100 to keep two
        decimals of a continuous value).

    Returns
    -------
    pandas.DataFrame
        The loaded frame with column ``label`` holding the integer score.
        Because of ``@delayed``, calling this returns a lazy dask object
        that yields the frame when computed.
    """
    df = pd.read_parquet(filename)
    df = df.rename(columns={"value": label})
    df[label] = df[label].astype(meta_type)

    # Whole-number inputs for the predicates; missing values count as 0.
    scaled = np.floor(df[label] * scale).fillna(value=0).to_numpy()

    score = np.zeros(len(scaled), dtype=int)
    # np.vectorize cannot infer an output type from size-0 input, so skip the
    # predicates entirely when the file has no rows.
    if len(scaled):
        for func in funcs:
            # Call each vectorized predicate once on the whole array instead of
            # once per element: same results, far less per-call overhead.
            score += np.asarray(func(scaled)).astype(int)
    df[label] = score
    return df

@delayed
def join(dfs):
    """Outer-merge the given frames on ``h3_12`` and total each row.

    Starting from the first frame, every following frame is merged in with an
    outer join on ``h3_12``. The merged frame is then summed across its columns
    and returned as a single-column frame named ``satisfaction``.

    Because of ``@delayed``, calling this returns a lazy dask object; the
    frame is produced when it is computed.
    """
    combined = dfs[0]
    for other in dfs[1:]:
        combined = combined.merge(other, how='outer', on='h3_12')
    row_totals = combined.sum(axis=1)
    return row_totals.to_frame(name='satisfaction')


In [5]:
%%time
# Read, join and classify discrete parquet data
d_tasks = [
    read_and_label(path, idx, meta_type=np.int32)
    for idx, path in enumerate(d_files[:N])
]
df = join(d_tasks).compute()

# Convert the H3-indexed result with h3pandas and write it out as a GeoPackage
df.h3.h3_to_geo_boundary().to_file(d_output, driver='GPKG', mode='w')


CPU times: user 2h 5min 51s, sys: 11min 52s, total: 2h 17min 43s
Wall time: 2h 15min 3s


In [None]:
%%time
# Read, join and classify continuous parquet data (values are scaled by 100 before flooring)
c_tasks = [
    read_and_label(path, idx, meta_type=np.float32, scale=100)
    for idx, path in enumerate(c_files[:N])
]
df = join(c_tasks).compute()

# Convert the H3-indexed result with h3pandas and write it out as a GeoPackage
df.h3.h3_to_geo_boundary().to_file(c_output, driver='GPKG', mode='w')