# Clustering a t-SNE-Reduced Sample with DBSCAN and HDBSCAN, with Scoring

This notebook demonstrates the use of DBSCAN and HDBSCAN clustering on a t-SNE reduced sample and evaluates the clustering performance using various scoring metrics.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
%matplotlib inline

## Load the Data

Load the dataset to be reduced and clustered.

In [None]:
# Read the input CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first five rows as the cell output
data.head(n=5)

## Preprocess the Data

Standardize the features before applying t-SNE.

In [None]:
# Fit the scaler on the raw features, then apply it to the same data so the
# scaled output is computed from the statistics learned in the fit step.
scaler = StandardScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)

## Apply t-SNE

Reduce the dimensionality of the data using t-SNE.

In [None]:
# Project the scaled features down to two dimensions (seeded for repeatability)
tsne = TSNE(n_components=2, random_state=42)
tsne_data = tsne.fit_transform(data_scaled)

# Wrap the two embedding axes in a DataFrame with named columns
tsne_df = pd.DataFrame({
    't-SNE1': tsne_data[:, 0],
    't-SNE2': tsne_data[:, 1],
})
tsne_df.head()

## Apply DBSCAN

Cluster the t-SNE reduced data using DBSCAN.

In [None]:
# Apply DBSCAN to the two t-SNE coordinates only.
# The columns are selected explicitly because this cell (and the HDBSCAN cell)
# adds label columns to tsne_df; passing the whole frame would feed those
# labels back in as features whenever the cell is re-run.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(tsne_df[['t-SNE1', 't-SNE2']])

# Add the cluster labels to the t-SNE data (DBSCAN marks noise points as -1)
tsne_df['DBSCAN_Cluster'] = dbscan_labels

## Apply HDBSCAN

Cluster the t-SNE reduced data using HDBSCAN.

In [None]:
# Apply HDBSCAN to the two t-SNE coordinates only.
# tsne_df already carries the 'DBSCAN_Cluster' column at this point, so the
# feature columns must be selected explicitly; otherwise the DBSCAN labels
# would be treated as a third feature and bias the HDBSCAN result.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
hdbscan_labels = hdbscan_clusterer.fit_predict(tsne_df[['t-SNE1', 't-SNE2']])

# Add the cluster labels to the t-SNE data (HDBSCAN marks noise points as -1)
tsne_df['HDBSCAN_Cluster'] = hdbscan_labels

## Evaluate Clustering Performance

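Evaluate the clustering performance using the Silhouette Score, Davies-Bouldin Score, and Calinski-Harabasz Score. Higher is better for the Silhouette and Calinski-Harabasz scores; lower is better for the Davies-Bouldin score.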

In [None]:
# Evaluate DBSCAN clustering performance.
# DBSCAN labels noise points as -1. Noise is not a cluster, so those points are
# dropped before scoring instead of being scored as one scattered "cluster".
dbscan_clustered = np.asarray(dbscan_labels) != -1
dbscan_eval_points = tsne_df.loc[dbscan_clustered, ['t-SNE1', 't-SNE2']]
dbscan_eval_labels = np.asarray(dbscan_labels)[dbscan_clustered]
dbscan_n_clusters = len(np.unique(dbscan_eval_labels))

# The metrics need at least 2 clusters and fewer clusters than points;
# scikit-learn raises ValueError otherwise, so fall back to NaN in that case.
if 2 <= dbscan_n_clusters < len(dbscan_eval_labels):
    dbscan_silhouette = silhouette_score(dbscan_eval_points, dbscan_eval_labels)
    dbscan_davies_bouldin = davies_bouldin_score(dbscan_eval_points, dbscan_eval_labels)
    dbscan_calinski_harabasz = calinski_harabasz_score(dbscan_eval_points, dbscan_eval_labels)
else:
    dbscan_silhouette = dbscan_davies_bouldin = dbscan_calinski_harabasz = np.nan
    print(f'DBSCAN found {dbscan_n_clusters} cluster(s) after removing noise; scores are undefined.')

print(f'DBSCAN Silhouette Score: {dbscan_silhouette}')
print(f'DBSCAN Davies-Bouldin Score: {dbscan_davies_bouldin}')
print(f'DBSCAN Calinski-Harabasz Score: {dbscan_calinski_harabasz}')

In [None]:
# Evaluate HDBSCAN clustering performance.
# HDBSCAN labels noise points as -1. Noise is not a cluster, so those points are
# dropped before scoring instead of being scored as one scattered "cluster".
hdbscan_clustered = np.asarray(hdbscan_labels) != -1
hdbscan_eval_points = tsne_df.loc[hdbscan_clustered, ['t-SNE1', 't-SNE2']]
hdbscan_eval_labels = np.asarray(hdbscan_labels)[hdbscan_clustered]
hdbscan_n_clusters = len(np.unique(hdbscan_eval_labels))

# The metrics need at least 2 clusters and fewer clusters than points;
# scikit-learn raises ValueError otherwise, so fall back to NaN in that case.
if 2 <= hdbscan_n_clusters < len(hdbscan_eval_labels):
    hdbscan_silhouette = silhouette_score(hdbscan_eval_points, hdbscan_eval_labels)
    hdbscan_davies_bouldin = davies_bouldin_score(hdbscan_eval_points, hdbscan_eval_labels)
    hdbscan_calinski_harabasz = calinski_harabasz_score(hdbscan_eval_points, hdbscan_eval_labels)
else:
    hdbscan_silhouette = hdbscan_davies_bouldin = hdbscan_calinski_harabasz = np.nan
    print(f'HDBSCAN found {hdbscan_n_clusters} cluster(s) after removing noise; scores are undefined.')

print(f'HDBSCAN Silhouette Score: {hdbscan_silhouette}')
print(f'HDBSCAN Davies-Bouldin Score: {hdbscan_davies_bouldin}')
print(f'HDBSCAN Calinski-Harabasz Score: {hdbscan_calinski_harabasz}')

## Visualize the Clustering Results

Visualize the clustering results for both DBSCAN and HDBSCAN.

In [None]:
# Scatter the t-SNE embedding, colouring each point by its DBSCAN label
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    tsne_df['t-SNE1'],
    tsne_df['t-SNE2'],
    c=tsne_df['DBSCAN_Cluster'],
    cmap='viridis',
)
ax.set_xlabel('t-SNE1')
ax.set_ylabel('t-SNE2')
ax.set_title('DBSCAN Clustering on t-SNE Reduced Data')
plt.show()

In [None]:
# Scatter the t-SNE embedding, colouring each point by its HDBSCAN label
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    tsne_df['t-SNE1'],
    tsne_df['t-SNE2'],
    c=tsne_df['HDBSCAN_Cluster'],
    cmap='plasma',
)
ax.set_xlabel('t-SNE1')
ax.set_ylabel('t-SNE2')
ax.set_title('HDBSCAN Clustering on t-SNE Reduced Data')
plt.show()