In [None]:
import googlemaps
import pandas as pd
import random
import json
from dotenv import load_dotenv
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
import numpy as np
from scipy.spatial import ConvexHull
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist, squareform
from kneed import KneeLocator
import time
import os

In [None]:
def calculate_eps(coordinates, min_samples=2):
    """
    Automatically calculates the eps parameter for DBSCAN based on the nearest neighbors.

    The distance from every point to its `min_samples`-th nearest neighbor is
    computed, the distances are sorted in ascending order, and the "knee" of
    that curve is used as eps. If no usable knee is found, the mean of the
    sorted distances is returned instead.

    Args:
    coordinates: List of (latitude, longitude) tuples for the cities.
    min_samples: The minimum samples in a neighborhood for a point to be considered as a core point.

    Returns:
    The calculated eps value.
    """
    # Use NearestNeighbors to find the distance to the nearest min_samples points.
    # NOTE(review): the query set equals the fitted set, so per the scikit-learn
    # docs each point presumably counts as its own first neighbor - confirm.
    nn = NearestNeighbors(n_neighbors=min_samples)
    nn.fit(coordinates)
    distances, _ = nn.kneighbors(coordinates)

    # Take the distance to the farthest of the min_samples points, sorted ascending
    k_distances = np.sort(distances[:, min_samples - 1], axis=0)

    # Find the "knee" in the distances graph which is a good estimate for eps
    knee_locator = KneeLocator(
        range(len(k_distances)), k_distances, curve="convex", direction="increasing"
    )
    knee = knee_locator.knee

    # Compare against None explicitly: a knee at index 0 is a valid result and
    # must not be mistaken for "no knee found" by a truthiness test.
    # A non-positive distance is rejected as well because DBSCAN requires eps > 0.
    if knee is not None and k_distances[knee] > 0:
        return k_distances[knee]

    return np.mean(k_distances)

In [None]:
def calculate_perimeter_and_area(points):
    """
    Calculate the perimeter and the area of the convex hull of a set of points.

    Args:
    points: Array-like of 2-D points in the format [(x1, y1), (x2, y2), ...].
        Both NumPy arrays and plain lists of tuples are accepted.

    Returns:
    A (perimeter, area) tuple for the convex hull of the given points.
    (np.nan, np.nan) is returned when there are fewer than three points, when
    all points share the same x or the same y value, or when the hull
    computation fails.
    """
    # Normalise to an array so the column slicing below also works for lists of tuples
    pts = np.asarray(points, dtype=float)

    if len(pts) < 3:
        # Not enough points to form a convex hull; NaN marks the value as unavailable
        return np.nan, np.nan

    # Zero spread along either axis means the points are identical or lie on an
    # axis-aligned line; other degenerate layouts are left to the hull computation.
    if np.std(pts[:, 0]) == 0 or np.std(pts[:, 1]) == 0:
        return np.nan, np.nan

    try:
        hull = ConvexHull(pts)
    except Exception as e:
        # Best effort: report the failure and signal "not available" to the caller
        print("Failed to compute ConvexHull:", e)
        return np.nan, np.nan

    # For 2-D input SciPy exposes the hull perimeter as `area` and the enclosed
    # area as `volume`.
    return hull.area, hull.volume

In [None]:
def dbscan_and_metrics(coordinates):
    """
    Performs DBSCAN clustering on the provided coordinates and calculates various metrics for each cluster.

    This function automatically calculates the 'eps' parameter for DBSCAN using the nearest neighbors approach,
    performs the clustering, and then calculates metrics such as silhouette score, cluster sizes, inter-cluster distances,
    and various geometric properties of the clusters like perimeter and area.

    Args:
        coordinates (dict): A dictionary with city names as keys and (latitude, longitude) tuples as values.

    Returns:
        dict: An empty dictionary when no coordinates are provided. Otherwise a dictionary containing
        (the same keys are returned whether or not any cluster was found):
            - 'DBSCAN_min_sample': the min_sample value chosen for DBSCAN,
            - 'DBSCAN_eps': the epsilon value calculated for DBSCAN,
            - 'cluster_IDs': An array of cluster labels for each point (an empty list when no cluster was found).
            - 'cluster_sizes': A list with the size (number of points) of each cluster.
            - 'n_clusters': The number of clusters found, excluding noise.
            - 'n_noise_points': The number of points classified as noise.
            - 'avg_inter_cluster_distance_km': The average distance between cluster centers in kilometers.
            - 'min_inter_cluster_distance_km': The minimum distance between any two cluster centers in kilometers.
            - 'max_inter_cluster_distance_km': The maximum distance between any two cluster centers in kilometers.
            - 'avg_silhouette': The silhouette score of the clustering (NaN with fewer than two clusters).
            - 'std_dev_cluster_sizes': The standard deviation of the sizes of the clusters.
            - 'avg_cluster_density': The average density of clusters, defined as size/area.
            - 'avg_cluster_perimeter': The average perimeter of the clusters.
            - 'avg_cluster_area': The average area of the clusters.
            - 'avg_cluster_complexity': An average measure of cluster complexity, defined as perimeter/sqrt(area).
            - 'eps_exec_time': The execution time for calculating the epsilon value
    """
    if not coordinates:
        print("No coordinates provided for clustering.")
        return {}

    # Convert city coordinates to a NumPy array for DBSCAN
    X = np.array(list(coordinates.values()))
    if X.size == 0:
        print("Empty coordinate array.")
        return {}

    min_sample = 4  # Based on https://www.theaidream.com/post/dbscan-clustering-algorithm-in-machine-learning 2.dim

    # Calculate eps automatically and time only that step
    start_time_eps = time.perf_counter()
    eps = calculate_eps(X, min_samples=min_sample)
    execution_time_eps = time.perf_counter() - start_time_eps

    # Perform DBSCAN clustering
    db = DBSCAN(eps=eps, min_samples=min_sample, metric="euclidean").fit(X)
    labels = db.labels_

    # Number of clusters, excluding noise (label -1) if present
    unique_labels = set(labels)
    n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)
    n_noise_ = int(np.count_nonzero(labels == -1))

    if n_clusters_ == 0:
        print("No clusters found.")
        # Same key set as the regular result below so callers can treat both
        # outcomes uniformly; per-cluster metrics are not available here.
        return {
            "DBSCAN_min_sample": min_sample,
            "DBSCAN_eps": eps,
            "cluster_IDs": [],
            "cluster_sizes": [],
            "n_clusters": 0,
            "n_noise_points": n_noise_,
            "avg_inter_cluster_distance_km": np.nan,
            "min_inter_cluster_distance_km": np.nan,
            "max_inter_cluster_distance_km": np.nan,
            "avg_silhouette": np.nan,
            "std_dev_cluster_sizes": 0,  # no clusters, hence no spread in sizes
            "avg_cluster_density": np.nan,
            "avg_cluster_perimeter": np.nan,
            "avg_cluster_area": np.nan,
            "avg_cluster_complexity": np.nan,
            "eps_exec_time": execution_time_eps,
        }

    # Silhouette score is not meaningful with a single cluster.
    # The labels are passed unfiltered, i.e. including any noise label (-1).
    silhouette_avg = silhouette_score(X, labels) if n_clusters_ > 1 else np.nan

    # Prepare cluster information
    clusters = [X[labels == i] for i in range(n_clusters_)]
    cluster_sizes = [len(cluster) for cluster in clusters]
    std_dev_cluster_sizes = np.std(cluster_sizes)

    # Calculate inter-cluster distances between the cluster centers
    cluster_centers = [np.mean(cluster, axis=0) for cluster in clusters]
    if len(cluster_centers) > 1:
        inter_cluster_distances = (
            pdist(cluster_centers, "euclidean") * 111
        )  # Approx. conversion from degrees to km
        avg_inter_cluster_distance = np.mean(inter_cluster_distances)
        min_inter_cluster_distance = np.min(inter_cluster_distances)
        max_inter_cluster_distance = np.max(inter_cluster_distances)
    else:
        avg_inter_cluster_distance = np.nan
        min_inter_cluster_distance = np.nan
        max_inter_cluster_distance = np.nan

    # Geometric properties of each cluster's convex hull
    cluster_perimeters, cluster_areas = zip(
        *[calculate_perimeter_and_area(cluster) for cluster in clusters]
    )

    # A zero area contributes 0; a NaN area (hull not available) is truthy and
    # therefore propagates NaN into the corresponding average.
    cluster_densities = [
        size / area if area else 0
        for size, area in zip(cluster_sizes, cluster_areas)
    ]
    cluster_complexities = [
        perimeter / np.sqrt(area) if area else 0
        for perimeter, area in zip(cluster_perimeters, cluster_areas)
    ]

    return {
        "DBSCAN_min_sample": min_sample,
        "DBSCAN_eps": eps,
        "cluster_IDs": labels,
        "cluster_sizes": cluster_sizes,
        "n_clusters": n_clusters_,
        "n_noise_points": n_noise_,
        "avg_inter_cluster_distance_km": avg_inter_cluster_distance,
        "min_inter_cluster_distance_km": min_inter_cluster_distance,
        "max_inter_cluster_distance_km": max_inter_cluster_distance,
        "avg_silhouette": silhouette_avg,
        "std_dev_cluster_sizes": std_dev_cluster_sizes,
        "avg_cluster_density": np.mean(cluster_densities),
        "avg_cluster_perimeter": np.mean(cluster_perimeters),
        "avg_cluster_area": np.mean(cluster_areas),
        "avg_cluster_complexity": np.mean(cluster_complexities),
        "eps_exec_time": execution_time_eps,
    }