@@ -195,8 +195,7 @@ def calculate_weighted_block_to_block_distances(self,
lon_col_name=point_support.lon_col_name,
lat_col_name=point_support.lat_col_name,
val_col_name=point_support.value_column_name,
block_id_col_name=point_support.point_support_blocks_index_name,
verbose=self.verbose
block_id_col_name=point_support.point_support_blocks_index_name
)

# block_distances : pandas DataFrame
src/pyinterpolate/distance/block.py (314 changes: 177 additions & 137 deletions)
@@ -10,92 +10,141 @@
import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.spatial.distance import cdist


def _calc_b2b_dist_from_dataframe(
ps_blocks: Union[pd.DataFrame, gpd.GeoDataFrame],
lon_col_name: Union[str, Hashable],
lat_col_name: Union[str, Hashable],
val_col_name: Union[str, Hashable],
block_id_col_name: Union[str, Hashable],
verbose=False
block_id_col_name: Union[str, Hashable]
) -> pd.DataFrame:
r"""
Function calculates distances between the blocks' point supports.

Parameters
----------
ps_blocks : Union[pd.DataFrame, gpd.GeoDataFrame]
DataFrame with point supports and block indexes.

lon_col_name : Union[str, Hashable]
Longitude or x coordinate.

lat_col_name : Union[str, Hashable]
Latitude or y coordinate.

val_col_name : Union[str, Hashable]
The point support values column.

block_id_col_name : Union[str, Hashable]
Column with block names / indexes.

verbose : bool, default = False
Show progress bar.

Returns
-------
block_distances : DataFrame
Indexes and columns are block indexes, cells are distances.

"""
calculated_pairs = set()
unique_blocks = list(ps_blocks[block_id_col_name].unique())

col_set = [lon_col_name, lat_col_name, val_col_name]

results = []

for block_i in tqdm(unique_blocks, disable=not verbose):
for block_j in unique_blocks:
# Check if it was estimated
if not (block_i, block_j) in calculated_pairs:
if block_i == block_j:
results.append([block_i, block_j, 0])
else:
i_value = ps_blocks[
ps_blocks[block_id_col_name] == block_i
]
j_value = ps_blocks[
ps_blocks[block_id_col_name] == block_j
]
value = _calculate_block_to_block_distance(
i_value[col_set].to_numpy(),
j_value[col_set].to_numpy()
)
results.append([block_i, block_j, value])
results.append([block_j, block_i, value])
calculated_pairs.add((block_i, block_j))
calculated_pairs.add((block_j, block_i))

# Create output dataframe
df = pd.DataFrame(data=results, columns=['block_i', 'block_j', 'z'])
df = df.pivot_table(
values='z',
index='block_i',
columns='block_j'
"""Fully vectorized version - fastest approach."""

unique_blocks = ps_blocks[block_id_col_name].unique()
n_blocks = len(unique_blocks)

# Pre-compute block data once
block_data = {}
for block_id in unique_blocks:
mask = ps_blocks[block_id_col_name] == block_id
coords = ps_blocks.loc[mask, [lon_col_name, lat_col_name]].values
weights = ps_blocks.loc[mask, val_col_name].values
block_data[block_id] = (coords, weights)

# Initialize distance matrix
distance_matrix = np.zeros((n_blocks, n_blocks))

# Compute distances for upper triangle only
for i in range(n_blocks):
for j in range(i, n_blocks):
if i == j:
distance_matrix[i, j] = 0.0
else:
coords_i, weights_i = block_data[unique_blocks[i]]
coords_j, weights_j = block_data[unique_blocks[j]]

# Use scipy's cdist for pairwise distances
dist_matrix = cdist(coords_i, coords_j)
weight_matrix = np.outer(weights_i, weights_j)

distance = np.sum(dist_matrix * weight_matrix) / np.sum(
weight_matrix)
distance_matrix[i, j] = distance
distance_matrix[j, i] = distance # Symmetric

# Create DataFrame
return pd.DataFrame(
distance_matrix,
index=unique_blocks,
columns=unique_blocks
)

# sort
df = df.reindex(columns=unique_blocks)
df = df.reindex(index=unique_blocks)

return df
# def _calc_b2b_dist_from_dataframe(
# ps_blocks: Union[pd.DataFrame, gpd.GeoDataFrame],
# lon_col_name: Union[str, Hashable],
# lat_col_name: Union[str, Hashable],
# val_col_name: Union[str, Hashable],
# block_id_col_name: Union[str, Hashable],
# verbose=False
# ) -> pd.DataFrame:
# r"""
# Function calculates distances between the blocks' point supports.
#
# Parameters
# ----------
# ps_blocks : Union[pd.DataFrame, gpd.GeoDataFrame]
# DataFrame with point supports and block indexes.
#
# lon_col_name : Union[str, Hashable]
# Longitude or x coordinate.
#
# lat_col_name : Union[str, Hashable]
# Latitude or y coordinate.
#
# val_col_name : Union[str, Hashable]
# The point support values column.
#
# block_id_col_name : Union[str, Hashable]
# Column with block names / indexes.
#
# verbose : bool, default = False
# Show progress bar.
#
# Returns
# -------
# block_distances : DataFrame
# Indexes and columns are block indexes, cells are distances.
#
# """
# calculated_pairs = set()
# unique_blocks = list(ps_blocks[block_id_col_name].unique())
#
# col_set = [lon_col_name, lat_col_name, val_col_name]
#
# results = []
#
# for block_i in tqdm(unique_blocks, disable=not verbose):
# for block_j in unique_blocks:
# # Check if it was estimated
# if not (block_i, block_j) in calculated_pairs:
# if block_i == block_j:
# results.append([block_i, block_j, 0])
# else:
# i_value = ps_blocks[
# ps_blocks[block_id_col_name] == block_i
# ]
# j_value = ps_blocks[
# ps_blocks[block_id_col_name] == block_j
# ]
# value = _calculate_block_to_block_distance(
# i_value[col_set].to_numpy(),
# j_value[col_set].to_numpy()
# )
# results.append([block_i, block_j, value])
# results.append([block_j, block_i, value])
# calculated_pairs.add((block_i, block_j))
# calculated_pairs.add((block_j, block_i))
#
# # Create output dataframe
# df = pd.DataFrame(data=results, columns=['block_i', 'block_j', 'z'])
# df = df.pivot_table(
# values='z',
# index='block_i',
# columns='block_j'
# )
#
# # sort
# df = df.reindex(columns=unique_blocks)
# df = df.reindex(index=unique_blocks)
#
# return df
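For reference, the per-pair value computed by the vectorized routine above is the point-support-weighted mean of all pairwise point distances, d(A, B) = sum_i sum_j w_i * w_j * ||a_i - b_j|| / sum_i sum_j w_i * w_j. A minimal sketch on two made-up point supports (the coordinates and weights below are hypothetical, not taken from the package):

import numpy as np
from scipy.spatial.distance import cdist

# Block A: two support points; Block B: three support points (toy data).
coords_a = np.array([[0.0, 0.0], [1.0, 0.0]])
weights_a = np.array([2.0, 1.0])
coords_b = np.array([[0.0, 1.0], [1.0, 1.0], [2.0, 1.0]])
weights_b = np.array([1.0, 1.0, 3.0])

pairwise = cdist(coords_a, coords_b)        # (2, 3) point-to-point distances
w = np.outer(weights_a, weights_b)          # (2, 3) products of support values
d_ab = np.sum(pairwise * w) / np.sum(w)     # weighted mean distance between A and B
print(d_ab)

The double loop in the new `_calc_b2b_dist_from_dataframe` fills a symmetric matrix with this value for every block pair, computing each pair only once.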


# noinspection PyUnresolvedReferences
def _calc_b2b_dist_from_ps(ps_blocks: 'PointSupport', verbose=False) -> Dict:
def _calc_b2b_dist_from_ps(ps_blocks: 'PointSupport') -> pd.DataFrame:
r"""
Function calculates distances between the blocks' point supports.

@@ -105,9 +154,6 @@ def _calc_b2b_dist_from_ps(ps_blocks: 'PointSupport', verbose=False) -> Dict:
PointSupport object with parsed blocks and their respective point
supports.

verbose : bool, default = False
Show progress bar.

Returns
-------
block_distances : DataFrame
@@ -118,8 +164,7 @@ def _calc_b2b_dist_from_ps(ps_blocks: 'PointSupport', verbose=False) -> Dict:
lon_col_name=ps_blocks.lon_col_name,
lat_col_name=ps_blocks.lat_col_name,
val_col_name=ps_blocks.value_column_name,
block_id_col_name=ps_blocks.point_support_blocks_index_name,
verbose=verbose
block_id_col_name=ps_blocks.point_support_blocks_index_name
)

return block_distances
@@ -131,8 +176,7 @@ def calc_block_to_block_distance(
lon_col_name=None,
lat_col_name=None,
val_col_name=None,
block_id_col_name=None,
verbose=False
block_id_col_name=None
) -> pd.DataFrame:
r"""
Function calculates distances between the blocks' point supports.
@@ -158,9 +202,6 @@ def calc_block_to_block_distance(
block_id_col_name : optional
Column with block names / indexes.

verbose : bool, default = False
Show progress bar.

Returns
-------
block_distances : DataFrame
@@ -189,67 +230,66 @@ def calc_block_to_block_distance(
lon_col_name=lon_col_name,
lat_col_name=lat_col_name,
val_col_name=val_col_name,
block_id_col_name=block_id_col_name,
verbose=verbose
block_id_col_name=block_id_col_name
)
else:
block_distances = _calc_b2b_dist_from_ps(ps_blocks, verbose=verbose)
block_distances = _calc_b2b_dist_from_ps(ps_blocks)

return block_distances
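As a usage illustration only (the toy DataFrame, its column names, and the import path inferred from the file layout in this diff are assumptions, not part of the PR), the public entry point can be called with a plain DataFrame of point-support rows:

import pandas as pd
from pyinterpolate.distance.block import calc_block_to_block_distance

toy = pd.DataFrame({
    'x': [0.0, 1.0, 0.0, 1.0],
    'y': [0.0, 0.0, 1.0, 1.0],
    'value': [2.0, 1.0, 1.0, 3.0],
    'block': ['A', 'A', 'B', 'B'],
})

distances = calc_block_to_block_distance(
    toy,
    lon_col_name='x',
    lat_col_name='y',
    val_col_name='value',
    block_id_col_name='block'
)
# `distances` is a square DataFrame indexed by block id: symmetric, zero diagonal.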


def _calculate_block_to_block_distance(ps_block_1: np.ndarray,
ps_block_2: np.ndarray) -> float:
r"""
Function calculates distance between two blocks' point supports.

Parameters
----------
ps_block_1 : numpy array
Point support of the first block.

ps_block_2 : numpy array
Point support of the second block.

Returns
-------
weighted_distances : float
Weighted point-support distance between blocks.

Notes
-----
The weighted distance between blocks is derived from the equation given
in publication [1] from References. This distance is weighted by the products of the blocks' point-support values.

References
----------
.. [1] Goovaerts, P. Kriging and Semivariogram Deconvolution in the
Presence of Irregular Geographical Units.
Math Geosci 40, 101–128 (2008).
https://doi.org/10.1007/s11004-007-9129-1

TODO
----
* Add reference equation to the special part of the documentation.
"""

a_shape = ps_block_1.shape[0]
b_shape = ps_block_2.shape[0]
ax = ps_block_1[:, 0].reshape(1, a_shape)
bx = ps_block_2[:, 0].reshape(b_shape, 1)
dx = ax - bx
ay = ps_block_1[:, 1].reshape(1, a_shape)
by = ps_block_2[:, 1].reshape(b_shape, 1)
dy = ay - by
aval = ps_block_1[:, -1].reshape(1, a_shape)
bval = ps_block_2[:, -1].reshape(b_shape, 1)
w = aval * bval

dist = np.sqrt(dx ** 2 + dy ** 2, dtype=float, casting='unsafe')

wdist = dist * w
distances_sum = np.sum(wdist) / np.sum(w)
return distances_sum
# def _calculate_block_to_block_distance(ps_block_1: np.ndarray,
# ps_block_2: np.ndarray) -> float:
# r"""
# Function calculates distance between two blocks' point supports.
#
# Parameters
# ----------
# ps_block_1 : numpy array
# Point support of the first block.
#
# ps_block_2 : numpy array
# Point support of the second block.
#
# Returns
# -------
# weighted_distances : float
# Weighted point-support distance between blocks.
#
# Notes
# -----
# The weighted distance between blocks is derived from the equation given
# in publication [1] from References. This distance is weighted by the products of the blocks' point-support values.
#
# References
# ----------
# .. [1] Goovaerts, P. Kriging and Semivariogram Deconvolution in the
# Presence of Irregular Geographical Units.
# Math Geosci 40, 101–128 (2008).
# https://doi.org/10.1007/s11004-007-9129-1
#
# TODO
# ----
# * Add reference equation to the special part of the documentation.
# """
#
# a_shape = ps_block_1.shape[0]
# b_shape = ps_block_2.shape[0]
# ax = ps_block_1[:, 0].reshape(1, a_shape)
# bx = ps_block_2[:, 0].reshape(b_shape, 1)
# dx = ax - bx
# ay = ps_block_1[:, 1].reshape(1, a_shape)
# by = ps_block_2[:, 1].reshape(b_shape, 1)
# dy = ay - by
# aval = ps_block_1[:, -1].reshape(1, a_shape)
# bval = ps_block_2[:, -1].reshape(b_shape, 1)
# w = aval * bval
#
# dist = np.sqrt(dx ** 2 + dy ** 2, dtype=float, casting='unsafe')
#
# wdist = dist * w
# distances_sum = np.sum(wdist) / np.sum(w)
# return distances_sum
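A quick equivalence check (random toy arrays with an assumed column layout of x, y, value) showing that the retired broadcasting formula kept above for reference and the cdist-based computation now used in `_calc_b2b_dist_from_dataframe` agree up to floating-point rounding:

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
block_a = rng.random((5, 3))   # columns: x, y, point-support value
block_b = rng.random((7, 3))

# Legacy broadcasting form (mirrors the commented-out function above)
dx = block_a[:, 0].reshape(1, -1) - block_b[:, 0].reshape(-1, 1)
dy = block_a[:, 1].reshape(1, -1) - block_b[:, 1].reshape(-1, 1)
w = block_a[:, -1].reshape(1, -1) * block_b[:, -1].reshape(-1, 1)
legacy = np.sum(np.sqrt(dx ** 2 + dy ** 2) * w) / np.sum(w)

# cdist form (mirrors the new implementation)
dist = cdist(block_a[:, :2], block_b[:, :2])
weights = np.outer(block_a[:, -1], block_b[:, -1])
vectorized = np.sum(dist * weights) / np.sum(weights)

assert np.isclose(legacy, vectorized)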


def select_neighbors_in_range(data: pd.DataFrame,
tests/test_distance/test_block_distance.py (2 changes: 1 addition & 1 deletion)
@@ -28,4 +28,4 @@ def test_block_to_block_distance():
for k in distances.index:
assert k in PS.unique_blocks
for k, v in distances_from_gdf.items():
assert np.sum(v) == np.sum(distances[k])
assert np.allclose(np.sum(v), np.sum(distances[k]), rtol=1e-5)
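One note on the relaxed assertion: summing the weighted distances in a different order (as the cdist-based implementation does) can change the result in the last few bits, which is presumably why exact equality gave way to a tolerance check. A tiny illustration of the effect with arbitrary values:

import numpy as np

a = 0.1 + 0.2 + 0.3                   # 0.6000000000000001
b = 0.3 + 0.2 + 0.1                   # 0.6
print(a == b)                         # False: same terms, different rounding order
print(np.allclose(a, b, rtol=1e-5))   # True within the tolerance used in the test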