From bb74373e86a3a5d8ec2c086d9147d7d73cc788fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlson=20Moses=20B=C3=BCth?= Date: Tue, 23 May 2023 11:50:06 +0200 Subject: [PATCH 1/4] Add high betweenness clustering coefficient --- superblockify/config.py | 9 +++- superblockify/metrics/measures.py | 89 +++++++++++++++++++++++++++++++ superblockify/metrics/metric.py | 55 ++++++++++++++++--- tests/metrics/test_measures.py | 24 +++++++++ tests/metrics/test_metric.py | 10 ++++ 5 files changed, 179 insertions(+), 8 deletions(-) diff --git a/superblockify/config.py b/superblockify/config.py index c988491..1f9394f 100644 --- a/superblockify/config.py +++ b/superblockify/config.py @@ -24,6 +24,10 @@ The filter used to filter the OSM data for the graph. This is a string that is passed to the :func:`osmnx.graph_from_place` function. +CLUSTERING_PERCENTILE + The percentile used to determine the betweenness centrality threshold for the + spatial clustering and anisotropy nodes. + logger The logger for this module. This is used to log information, warnings and errors throughout the package. 
@@ -72,6 +76,9 @@ '["service"!~"alley|driveway|emergency_access|parking|parking_aisle|private"]' ) +# Metrics +CLUSTERING_PERCENTILE = 0.9 + # Logging configuration using the setup.cfg file logging.config.fileConfig(join(dirname(__file__), "..", "setup.cfg")) # Get the logger for this module @@ -79,7 +86,7 @@ # Tests TEST_DATA_PATH = "./tests/test_data/" -HIDE_PLOTS = False +HIDE_PLOTS = True PLACES_GENERAL = [ ("Barcelona", "Barcelona, Catalonia, Spain"), diff --git a/superblockify/metrics/measures.py b/superblockify/metrics/measures.py index 4d4e82b..b373227 100644 --- a/superblockify/metrics/measures.py +++ b/superblockify/metrics/measures.py @@ -589,3 +589,92 @@ def _sum_bc(loop_indices, pred, dist, edges_uv, edge_padding): # pragma: no cov edge_padding, ) return betweennesses + + +def calculate_high_bc_clustering(node_x, node_y, node_betweenness, percentile): + """ + Calculate the high betweenness clustering coefficient and anisotropy for a + given percentile of nodes with the highest betweenness. [1]_ + + Parameters + ---------- + node_x : list + List of x coordinates of the nodes. + node_y : list + List of y coordinates of the nodes, ordered by node index. + node_betweenness : list + List of betweenness values for each node, ordered by node index. + percentile : float + Percentile of nodes with the highest betweenness to calculate the + clustering coefficient for. Between 0 and 1. + + Returns + ------- + high_bc_clustering : float + Clustering coefficient for the nodes with the highest betweenness. + high_bc_anisotropy : float + Anisotropy for the nodes with the highest betweenness. + + Notes + ----- + The high betweenness clustering coefficient is calculated as the average + clustering coefficient of the nodes with the highest betweenness. The + high betweenness anisotropy is calculated as the average anisotropy of the + nodes with the highest betweenness. + + References + ---------- + .. [1] Kirkley, A., Barbosa, H., Barthelemy, M. & Ghoshal, G. 
From the betweenness + centrality in street networks to structural invariants in random planar + graphs. Nat Commun 9, 2501 (2018). + https://www.nature.com/articles/s41467-018-04978-z + """ + coord_bc = np.array([node_x, node_y, node_betweenness]).T + # Sort by betweenness + coord_bc = coord_bc[coord_bc[:, 2].argsort()] + # Threshold betweenness + threshold_idx = int(len(coord_bc) * percentile) + return __calculate_high_bc_clustering(coord_bc, threshold_idx), None + + +def __calculate_high_bc_clustering(coord_bc, threshold_idx): + r"""High betweenness nodes clustering coefficient. + + .. math:: + C_{\theta} = + \frac{1}{N_{\theta}\left\langle X \right\rangle} + \sum_{i = 1}^{N_{\theta}} \| x_i - x_{\mathrm{cm}, \theta} \| + + .. math:: + \langle X \rangle = \frac{1}{N} + \sum_{i = 1}^{N} \| x_i - x_{\mathrm{cm}, \theta} \| + + .. math:: + x_{\mathrm{cm}, \theta} = \frac{1}{N_{\theta}} + \sum_{i = 1}^{N_{\theta}} x_i + + The distance calculation :math:`\| x_i - x_{\mathrm{cm}, \theta} \|` includes the + x and y coordinates of the node, and is the Euclidean distance. In this case it + is the Frobenius norm of the difference between the node coordinates and the + center of mass of the high betweenness nodes. + + Parameters + ---------- + coord_bc : np.ndarray + Array of node coordinates and betweenness values, sorted by betweenness. + threshold_idx : int + Index of the first node to consider as high betweenness. + + Returns + ------- + high_bc_clustering : float + Clustering coefficient for the nodes with the highest betweenness. 
+ """ + # Center of mass of high betweenness nodes + high_bc_cm = np.mean(coord_bc[threshold_idx:, :2], axis=0) + # Average distance to center of mass + avg_dist = np.mean( + np.linalg.norm(coord_bc[threshold_idx:, :2] - high_bc_cm, axis=1) + ) + # Norm by average distance of all nodes + return avg_dist / np.mean(np.linalg.norm(coord_bc[:, :2] - high_bc_cm, axis=1)) diff --git a/superblockify/metrics/metric.py b/superblockify/metrics/metric.py index cc3df5d..0888c8f 100644 --- a/superblockify/metrics/metric.py +++ b/superblockify/metrics/metric.py @@ -13,6 +13,7 @@ write_relative_increase_to_edges, calculate_coverage, betweenness_centrality, + calculate_high_bc_clustering, ) from .plot import ( plot_distance_matrices, @@ -21,7 +22,7 @@ plot_relative_difference, plot_relative_increase_on_graph, ) -from ..config import logger, RESULTS_DIR +from ..config import logger, RESULTS_DIR, CLUSTERING_PERCENTILE from ..plot import save_plot from ..utils import compare_dicts @@ -102,6 +103,8 @@ def __init__(self, unit="time"): self.avg_path_length = {"S": None, "N": None} self.directness = {"SN": None} self.global_efficiency = {"NS": None} + self.high_bc_clustering = None + self.high_bc_anisotropy = None self.distance_matrix = {} self.predecessor_matrix = {} @@ -176,6 +179,8 @@ def calculate_before(self, partitioner, make_plots=False): # No `attr_suffix` for the full graph ) + self.calculate_high_bc_clustering(partitioner.graph, CLUSTERING_PERCENTILE) + def calculate_all( self, partitioner, @@ -241,12 +246,6 @@ def calculate_all( plot_distributions=make_plots, ) - self.calculate_all_measure_sums() - - write_relative_increase_to_edges( - partitioner.graph, self.distance_matrix, self.node_list, "N", "S" - ) - betweenness_centrality( partitioner.graph, self.node_list, @@ -256,6 +255,12 @@ def calculate_all( attr_suffix="_restricted", ) + self.calculate_all_measure_sums() + + write_relative_increase_to_edges( + partitioner.graph, self.distance_matrix, self.node_list, "N", "S" + ) + 
if make_plots: # sort distance matrix dictionaries to follow start with E, S, N, ... d_m = self.distance_matrix @@ -347,6 +352,42 @@ def calculate_all_measure_sums(self): ) logger.debug("Global efficiency %s: %s", key, self.global_efficiency[key]) + def calculate_high_bc_clustering(self, graph, percentile): + """Calculate the high betweenness node clustering and anisotropy. + + High betweenness nodes are the nodes above the given percentile of the + betweenness centrality distribution. + + Parameters + ---------- + graph : networkx.Graph + The graph to calculate the high betweenness node clustering for, needs to + have x, y, and node_betweenness_normal attribute for each node. + percentile : float + The percentile of the betweenness centrality to use as a threshold for high + betweenness nodes. 0.0 < percentile < 100.0. + + Raises + ------ + ValueError + If percentile is not a float between 0.0 and 100.0. + """ + if not isinstance(percentile, float): + raise ValueError(f"percentile needs to be a float, not {type(percentile)}") + if not 0.0 < percentile < 100.0: + raise ValueError( + f"percentile needs to be between 0.0 and 100.0, not {percentile}" + ) + + self.high_bc_clustering, self.high_bc_anisotropy = calculate_high_bc_clustering( + node_x=[graph.nodes[node]["x"] for node in self.node_list], + node_y=[graph.nodes[node]["y"] for node in self.node_list], + node_betweenness=[ + graph.nodes[node]["node_betweenness_normal"] for node in self.node_list + ], + percentile=percentile / 100, + ) + def __str__(self): """Return a string representation of the metric object. 
diff --git a/tests/metrics/test_measures.py b/tests/metrics/test_measures.py index 1547214..b871782 100644 --- a/tests/metrics/test_measures.py +++ b/tests/metrics/test_measures.py @@ -16,6 +16,7 @@ wheel_graph, ) from numpy import full, array, inf, array_equal, int32, int64, allclose +from numpy.random import default_rng from scipy.sparse.csgraph import dijkstra from superblockify.metrics.measures import ( @@ -25,6 +26,7 @@ calculate_coverage, betweenness_centrality, _calculate_betweenness, + __calculate_high_bc_clustering, ) from superblockify.utils import __edges_to_1d @@ -648,3 +650,25 @@ def test_betweenness_centrality_weight_missing(graph): del graph.edges[0, 1, 0]["weight"] with pytest.raises(ValueError): betweenness_centrality(graph, None, None, None, weight="weight") + + +@pytest.mark.parametrize("length", [10, 100, 1000, 60000]) +def test___calculate_high_bc_clustering(length): + """Test calculation of betweenness centrality clustering.""" + rng = default_rng(29384) + coord_bc = array( + [ + ( # x-coord + rng.uniform(low=-10, high=10, size=length) + + rng.uniform(low=-180, high=180) + ), + ( # y-coord + rng.uniform(low=-10, high=10, size=length) + + rng.uniform(low=-90, high=90) + ), # betweenness centrality + rng.uniform(low=0, high=1, size=length), + ] + ).T + coord_bc = coord_bc[coord_bc[:, 2].argsort()] + threshold_idx = rng.integers(low=0, high=length) + assert 0.0 < __calculate_high_bc_clustering(coord_bc, threshold_idx) < 1.0 diff --git a/tests/metrics/test_metric.py b/tests/metrics/test_metric.py index 30584b4..5ff0eec 100644 --- a/tests/metrics/test_metric.py +++ b/tests/metrics/test_metric.py @@ -125,6 +125,16 @@ def test_calculate_metrics_before(self, test_one_city_precalculated_copy, unit): == part.graph.number_of_edges() ) + @pytest.mark.parametrize("percentile", [0.0, 0, 100.0, 100, -1.0, 101.0, None, "p"]) + def test_calculate_high_bc_clustering_faulty_percentile( + self, test_one_city_preloaded_copy, percentile + ): + """Test 
calculate_high_bc_clustering with faulty percentile. + Not 0.0 < percentile < 100.0.""" + part = test_one_city_preloaded_copy + with pytest.raises(ValueError): + part.metric.calculate_high_bc_clustering(part.graph, percentile=percentile) + @mark_xfail_flaky_download def test_saving_and_loading( self, From 1754fd77c353a5ed420b6a2a5bfe3bfce5c08576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlson=20Moses=20B=C3=BCth?= Date: Tue, 23 May 2023 14:17:41 +0200 Subject: [PATCH 2/4] Add anisotropy of high betweenness nodes Also added exceptions to clustering calculation, and tests for all. --- superblockify/metrics/measures.py | 60 ++++++++++++++++++++++++- tests/metrics/test_measures.py | 75 ++++++++++++++++++++++++++----- 2 files changed, 124 insertions(+), 11 deletions(-) diff --git a/superblockify/metrics/measures.py b/superblockify/metrics/measures.py index b373227..4410150 100644 --- a/superblockify/metrics/measures.py +++ b/superblockify/metrics/measures.py @@ -634,7 +634,9 @@ def calculate_high_bc_clustering(node_x, node_y, node_betweenness, percentile): coord_bc = coord_bc[coord_bc[:, 2].argsort()] # Threshold betweenness threshold_idx = int(len(coord_bc) * percentile) - return __calculate_high_bc_clustering(coord_bc, threshold_idx), None + return __calculate_high_bc_clustering( + coord_bc, threshold_idx + ), __calculate_high_bc_anisotropy(coord_bc[threshold_idx:, :2]) def __calculate_high_bc_clustering(coord_bc, threshold_idx): @@ -669,7 +671,18 @@ def __calculate_high_bc_clustering(coord_bc, threshold_idx): ------- high_bc_clustering : float Clustering coefficient for the nodes with the highest betweenness. + + Raises + ------ + ValueError + If the coordinate array has fewer than two nodes. + ValueError + If the threshold index is greater than or equal to the number of nodes. 
""" + if len(coord_bc) < 2: + raise ValueError("Coordinate array must have at least two nodes.") + if threshold_idx >= len(coord_bc): + raise ValueError("Threshold index must be less than the number of nodes.") # Center of mass of high betweenness nodes high_bc_cm = np.mean(coord_bc[threshold_idx:, :2], axis=0) # Average distance to center of mass @@ -678,3 +691,48 @@ def __calculate_high_bc_clustering(coord_bc, threshold_idx): ) # Norm by average distance of all nodes return avg_dist / np.mean(np.linalg.norm(coord_bc[:, :2] - high_bc_cm, axis=1)) + + +def __calculate_high_bc_anisotropy(coord_high_bc): + r"""High betweenness nodes anisotropy. + + The high betweenness anisotropy is the ratio + :math:`A_{\theta}=\lambda_1/\lambda_2`, where :math:`\lambda_i` are the positive + eigenvalues of the covariance matrix of the high betweenness nodes, and + :math:`\lambda_1 \geq \lambda_2`. [1]_ + + Parameters + ---------- + coord_high_bc : np.ndarray + Array of node coordinates of the high betweenness nodes. + + Returns + ------- + high_bc_anisotropy : float + Anisotropy for the nodes with the highest betweenness. + + Raises + ------ + ValueError + If the number of high betweenness nodes is less than 2. + + References + ---------- + .. [1] Kirkley, A., Barbosa, H., Barthelemy, M. & Ghoshal, G. From the betweenness + centrality in street networks to structural invariants in random planar + graphs. Nat Commun 9, 2501 (2018). + https://www.nature.com/articles/s41467-018-04978-z + """ + if len(coord_high_bc) < 2: + raise ValueError( + "High betweenness nodes must be at least 2, for less the anisotropy is " + "not defined." 
+ ) + # Covariance matrix + cov = np.cov(coord_high_bc.T) + # Eigenvalues + eigvals = np.linalg.eigvals(cov) + # Sort eigenvalues + eigvals = np.sort(eigvals)[::-1] + # Anisotropy + return eigvals[0] / eigvals[1] diff --git a/tests/metrics/test_measures.py b/tests/metrics/test_measures.py index b871782..e94dee3 100644 --- a/tests/metrics/test_measures.py +++ b/tests/metrics/test_measures.py @@ -27,6 +27,7 @@ betweenness_centrality, _calculate_betweenness, __calculate_high_bc_clustering, + __calculate_high_bc_anisotropy, ) from superblockify.utils import __edges_to_1d @@ -652,23 +653,77 @@ def test_betweenness_centrality_weight_missing(graph): betweenness_centrality(graph, None, None, None, weight="weight") -@pytest.mark.parametrize("length", [10, 100, 1000, 60000]) -def test___calculate_high_bc_clustering(length): - """Test calculation of betweenness centrality clustering.""" +@pytest.fixture(scope="module", params=[10, 100, 1000, 60000]) +def clustering_data(request): + """Generate random data for clustering tests.""" rng = default_rng(29384) - coord_bc = array( + coord = array( [ ( # x-coord - rng.uniform(low=-10, high=10, size=length) + rng.uniform(low=-10, high=10, size=request.param) + rng.uniform(low=-180, high=180) ), ( # y-coord - rng.uniform(low=-10, high=10, size=length) + rng.uniform(low=-10, high=10, size=request.param) + rng.uniform(low=-90, high=90) ), # betweenness centrality - rng.uniform(low=0, high=1, size=length), + rng.uniform(low=0, high=1, size=request.param), ] ).T - coord_bc = coord_bc[coord_bc[:, 2].argsort()] - threshold_idx = rng.integers(low=0, high=length) - assert 0.0 < __calculate_high_bc_clustering(coord_bc, threshold_idx) < 1.0 + return coord[coord[:, 2].argsort()], rng.integers(low=0, high=request.param) + + +def test___calculate_high_bc_clustering( + clustering_data, +): # pylint: disable=redefined-outer-name + """Test calculation of betweenness centrality clustering.""" + assert 0.0 < 
__calculate_high_bc_clustering(*clustering_data) < 1.0 + + +@pytest.mark.parametrize( + "coord_bc,threshold_idx", + [ + (array([]), 0), # length 0 + (array([[0, 0, 0]]), 1), # length 1 + (array([[0, 0, 0], [1, 1, 1]]), 2), # index out of bounds + ], +) +def test___calculate_high_bc_clustering_faulty(coord_bc, threshold_idx): + """Test error catching for betweenness centrality clustering.""" + with pytest.raises(ValueError): + __calculate_high_bc_clustering(coord_bc, threshold_idx) + + +def test___calculate_high_bc_anisotropy(clustering_data): # pylint: disable=redefined-outer-name + """Test calculation of betweenness centrality anisotropy.""" + coord_high_bc = clustering_data[0][clustering_data[1] :, :2] + anisotropy = __calculate_high_bc_anisotropy(coord_high_bc) + assert 1.0 <= anisotropy + # check invariance to x and y coordinate swap + assert __calculate_high_bc_anisotropy(coord_high_bc[:, ::-1]) == anisotropy + + +@pytest.mark.parametrize( + "coords,expected", + [ + ([[0, 0], [1, 0], [0, 1], [1, 1]], 1.0), # square, round distribution + ([[-20, 10], [-10, 10], [-20, 20], [-10, 20]], 1.0), # square, round distr. + ([[1, 0], [0, 1], [1, 2], [2, 1]], 1.0), # diamond, round distribution + ([[0, 0], [1, 0], [0, 1], [1, 1], [0.5, 0.5]], 1.0), # square + center + ([[0, 0], [1, 0], [0, 2], [1, 2]], 4.0), # 2:1 rectangle, long distr. + ([[0, 0], [2, 0], [0, 1], [2, 1]], 4.0), # 1:2 rectangle, long distr. + ([[0, 0], [1, 0], [0, 2], [1, 2], [0.5, 1]], 4.0), # 2:1 rect. 
+ center + ([[0, 0], [0, 1]], inf), # vertical line, infinite anisotropy + ([[0, 0], [1, 0]], inf), # horizontal line, infinite anisotropy + ], +) +def test___calculate_high_bc_anisotropy_special_cases(coords, expected): + """Test calculation of betweenness centrality anisotropy.""" + assert __calculate_high_bc_anisotropy(array(coords)) == expected + + +@pytest.mark.parametrize("coords", [[], [[0, 0]], [[1, 1]]]) +def test___test___calculate_high_bc_anisotropy_faulty(coords): + """Test error catching of betweenness centrality anisotropy.""" + with pytest.raises(ValueError): + __calculate_high_bc_anisotropy(array(coords)) From c6e11e657ea7c482a31b92323e683bf986f69cbd Mon Sep 17 00:00:00 2001 From: cbueth Date: Tue, 23 May 2023 12:18:21 +0000 Subject: [PATCH 3/4] Fix code style issues with Black --- tests/metrics/test_measures.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/metrics/test_measures.py b/tests/metrics/test_measures.py index e94dee3..dcdb74a 100644 --- a/tests/metrics/test_measures.py +++ b/tests/metrics/test_measures.py @@ -694,7 +694,9 @@ def test___calculate_high_bc_clustering_faulty(coord_bc, threshold_idx): __calculate_high_bc_clustering(coord_bc, threshold_idx) -def test___calculate_high_bc_anisotropy(clustering_data): # pylint: disable=redefined-outer-name +def test___calculate_high_bc_anisotropy( + clustering_data, +): # pylint: disable=redefined-outer-name """Test calculation of betweenness centrality anisotropy.""" coord_high_bc = clustering_data[0][clustering_data[1] :, :2] anisotropy = __calculate_high_bc_anisotropy(coord_high_bc) From 02db0de4f0c276c26636da6873a654a5ec8ea72a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlson=20Moses=20B=C3=BCth?= Date: Tue, 23 May 2023 14:49:00 +0200 Subject: [PATCH 4/4] Add plausibility checks for clustering and anisotropy --- superblockify/config.py | 2 +- superblockify/metrics/metric.py | 8 +++++--- tests/metrics/test_metric.py | 4 ++++ 3 files changed, 10 insertions(+), 4 
deletions(-) diff --git a/superblockify/config.py b/superblockify/config.py index 1f9394f..a38e929 100644 --- a/superblockify/config.py +++ b/superblockify/config.py @@ -77,7 +77,7 @@ ) # Metrics -CLUSTERING_PERCENTILE = 0.9 +CLUSTERING_PERCENTILE = 90 # Logging configuration using the setup.cfg file logging.config.fileConfig(join(dirname(__file__), "..", "setup.cfg")) diff --git a/superblockify/metrics/metric.py b/superblockify/metrics/metric.py index 0888c8f..b2c674b 100644 --- a/superblockify/metrics/metric.py +++ b/superblockify/metrics/metric.py @@ -363,7 +363,7 @@ def calculate_high_bc_clustering(self, graph, percentile): graph : networkx.Graph The graph to calculate the high betweenness node clustering for, needs to have x, y, and node_betweenness_normal attribute for each node. - percentile : float + percentile : float or int The percentile of the betweenness centrality to use as a threshold for high betweenness nodes. 0.0 < percentile < 100.0. @@ -372,8 +372,10 @@ def calculate_high_bc_clustering(self, graph, percentile): ValueError If percentile is not a float between 0.0 and 100.0. 
""" - if not isinstance(percentile, float): - raise ValueError(f"percentile needs to be a float, not {type(percentile)}") + if not isinstance(percentile, (float, int)): + raise ValueError( + f"percentile needs to be a float or int, not {type(percentile)}" + ) if not 0.0 < percentile < 100.0: raise ValueError( f"percentile needs to be between 0.0 and 100.0, not {percentile}" diff --git a/tests/metrics/test_metric.py b/tests/metrics/test_metric.py index 5ff0eec..fa57653 100644 --- a/tests/metrics/test_metric.py +++ b/tests/metrics/test_metric.py @@ -22,6 +22,8 @@ def test_init(self, unit): assert not metric.predecessor_matrix assert metric.unit == unit assert metric.node_list is None + assert metric.high_bc_clustering is None + assert metric.high_bc_anisotropy is None @pytest.mark.parametrize( "unit,expected_symbol", @@ -93,6 +95,8 @@ def test_calculate_metrics( plt.close("all") for dist_matrix in part.metric.distance_matrix.values(): assert dist_matrix.shape == (part.graph.number_of_nodes(),) * 2 + assert 0.0 < part.metric.high_bc_clustering < 1.0 + assert 1.0 <= part.metric.high_bc_anisotropy @pytest.mark.parametrize("unit", ["time", "distance"]) def test_calculate_metrics_before(self, test_one_city_precalculated_copy, unit):