# In [None]:
import geopandas as gpd
import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score


def _save_cluster_map(gdf, column, title, out_path):
    """Plot *gdf* coloured by the labels in *column*, save it and show it.

    Args:
        gdf: Frame whose ``plot`` method draws the points.
        column: Name of the label column used to colour the points.
        title: Axes title.
        out_path: Destination of the saved figure.

    Returns:
        The ``(fig, ax)`` pair, so the caller can keep working with the figure.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    gdf.plot(ax=ax, column=column, categorical=True, markersize=6, legend=False)
    ax.set_title(title)
    fig.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.show()
    return fig, ax


# Load data
gdf = gpd.read_file('../data/chipotle_clean.csv')
# NOTE(review): CSV readers may return every field as text; coerce explicitly
# so all estimators and metrics receive a float matrix. No-op if already numeric.
coords = gdf[['x', 'y']].astype(float).to_numpy()


# KMeans
print('Running KMeans...')
km = KMeans(n_clusters=10, random_state=42).fit(coords)
gdf['kmeans_10'] = km.labels_

fig, ax = _save_cluster_map(
    gdf,
    'kmeans_10',
    'KMeans (k=10)',
    '../outputs/figures/kmeans_10_map.png',
)


# DBSCAN
print('Running DBSCAN...')
db = DBSCAN(eps=20000, min_samples=5).fit(coords)
gdf['dbscan_20k_5'] = db.labels_

fig, ax = _save_cluster_map(
    gdf,
    'dbscan_20k_5',
    'DBSCAN (eps=20km, min_samples=5)',
    '../outputs/figures/dbscan_20k_5_map.png',
)


# HDBSCAN
print('Running HDBSCAN...')
hdb = hdbscan.HDBSCAN(min_cluster_size=10)
labels = hdb.fit_predict(coords)
gdf['hdbscan_10'] = labels

fig, ax = _save_cluster_map(
    gdf,
    'hdbscan_10',
    'HDBSCAN (min_cluster_size=10)',
    '../outputs/figures/hdbscan_10_map.png',
)


# Metrics
def evaluate(coords, labels):
    """Summarise a clustering: cluster count, noise rate and quality scores.

    The label ``-1`` is treated as noise. Noise points count towards
    ``noise_rate`` but are left out of the three scores; passed through
    unchanged, scikit-learn would score them as one more cluster.

    Args:
        coords: Array-like of sample coordinates, one row per label, in the
            form accepted by the scikit-learn metric functions.
        labels: Array-like of cluster labels (list, ndarray or Series).

    Returns:
        dict with keys ``n_clusters``, ``noise_rate``, ``silhouette``,
        ``davies`` and ``calinski``. The three scores are ``None`` when they
        are undefined: fewer than two clusters, or no more non-noise samples
        than clusters.
    """
    # Normalise first so plain lists work; `list == -1` is a single bool.
    labels = np.asarray(labels)
    is_noise = labels == -1
    clustered = ~is_noise

    n_clusters = len(np.unique(labels[clustered]))
    n_clustered = int(clustered.sum())

    res = {
        'n_clusters': n_clusters,
        # Guard the empty case instead of dividing by zero.
        'noise_rate': float(is_noise.mean()) if labels.size else 0.0,
    }

    # The sklearn scores require 2 <= n_labels <= n_samples - 1.
    if 1 < n_clusters < n_clustered:
        points = np.asarray(coords)[clustered]
        assigned = labels[clustered]
        res['silhouette'] = silhouette_score(points, assigned)
        res['davies'] = davies_bouldin_score(points, assigned)
        res['calinski'] = calinski_harabasz_score(points, assigned)
    else:
        res['silhouette'] = res['davies'] = res['calinski'] = None
    return res


# One row per clustering method, scored on the label column it wrote to gdf.
label_columns = {
    'kmeans': 'kmeans_10',
    'dbscan': 'dbscan_20k_5',
    'hdbscan': 'hdbscan_10',
}

metrics = []
for method, column in label_columns.items():
    row = {'method': method}
    row.update(evaluate(coords, gdf[column]))
    metrics.append(row)


# Persist the comparison table; the bare name on the last line displays it.
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('../outputs/clustering_metrics.csv', index=False)
metrics_df