# Clustering a UMAP-Reduced Sample with DBSCAN and HDBSCAN, with Scoring

In [None]:
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Read the raw feature table from disk.
data = pd.read_csv('data.csv')

# Rescale the features with StandardScaler before computing the embedding.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Project the scaled features down to two dimensions with UMAP.
# random_state is pinned to a fixed seed.
umap_reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
umap_embedding = umap_reducer.fit_transform(data_scaled)

# Cluster the 2-D embedding with DBSCAN.
dbscan = DBSCAN(
    min_samples=5,
    eps=0.5,
)
dbscan_labels = dbscan.fit_predict(umap_embedding)

# Cluster the same embedding with HDBSCAN.
hdbscan_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,
)
hdbscan_labels = hdbscan_clusterer.fit_predict(umap_embedding)

# Calculate clustering scores
def _score_clustering(embedding, labels):
    """Return ``(silhouette, davies_bouldin)`` for one clustering result.

    DBSCAN and HDBSCAN mark noise points with the label ``-1``. Those points
    do not form a cluster, so they are dropped before scoring; otherwise the
    metrics would treat all noise as one (usually very scattered) cluster.

    Both metrics are only defined when the number of distinct clusters is at
    least 2 and strictly smaller than the number of scored points. When that
    does not hold (e.g. everything is noise, or a single cluster was found),
    ``(nan, nan)`` is returned instead of letting scikit-learn raise
    ``ValueError``.

    Args:
        embedding: Array-like of shape (n_samples, n_features) that was
            clustered.
        labels: Array-like of shape (n_samples,) with one cluster label per
            row of ``embedding``; ``-1`` denotes noise.

    Returns:
        Tuple ``(silhouette_score, davies_bouldin_score)`` as floats, or
        ``(nan, nan)`` when the scores are undefined for this labelling.
    """
    embedding = np.asarray(embedding)
    labels = np.asarray(labels)

    # Keep only points that were actually assigned to a cluster.
    clustered = labels != -1
    n_points = int(clustered.sum())
    n_clusters = np.unique(labels[clustered]).size

    # scikit-learn requires 2 <= n_clusters <= n_points - 1 for both metrics.
    if not 2 <= n_clusters < n_points:
        return float('nan'), float('nan')

    points = embedding[clustered]
    assignments = labels[clustered]
    return (
        silhouette_score(points, assignments),
        davies_bouldin_score(points, assignments),
    )


dbscan_silhouette, dbscan_davies_bouldin = _score_clustering(
    umap_embedding, dbscan_labels
)
hdbscan_silhouette, hdbscan_davies_bouldin = _score_clustering(
    umap_embedding, hdbscan_labels
)

# Print the scores (nan means the score is undefined for that clustering)
print(f'DBSCAN Silhouette Score: {dbscan_silhouette}')
print(f'DBSCAN Davies-Bouldin Score: {dbscan_davies_bouldin}')
print(f'HDBSCAN Silhouette Score: {hdbscan_silhouette}')
print(f'HDBSCAN Davies-Bouldin Score: {hdbscan_davies_bouldin}')

# Draw the embedding twice, side by side, coloured by each algorithm's labels.
plt.figure(figsize=(12, 6))
for _panel_index, (_panel_labels, _panel_title) in enumerate(
    [
        (dbscan_labels, 'DBSCAN Clustering'),
        (hdbscan_labels, 'HDBSCAN Clustering'),
    ],
    start=1,
):
    # One row, two columns; the panel index selects the left or right axes.
    plt.subplot(1, 2, _panel_index)
    plt.scatter(
        umap_embedding[:, 0],
        umap_embedding[:, 1],
        c=_panel_labels,
        cmap='viridis',
    )
    plt.title(_panel_title)
plt.show()
