# HDBSCAN Clustering on RS and Applying the Cluster Centroids to WS

This notebook demonstrates HDBSCAN clustering on a reference sample (RS) and then assigns each point of a working sample (WS) to the nearest RS cluster centroid.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import hdbscan
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

## Load the Data

Load the reference sample (RS) and working sample (WS) data.

In [None]:
# Load the reference sample (RS) and the working sample (WS) from CSV files
# in the notebook's working directory.
RS_data = pd.read_csv('RS_data.csv')
WS_data = pd.read_csv('WS_data.csv')

# Preview both samples. `display` renders each frame as a rich table; a bare
# tuple of frames as the last expression falls back to plain-text reprs.
display(RS_data.head(), WS_data.head())

## Preprocess the Data

Standardize the features before applying HDBSCAN.

In [None]:
# Standardize the features.
# Later cells add label columns to the frames in place ('Cluster' on RS_data,
# 'Assigned Cluster' on WS_data). Selecting the feature columns explicitly
# keeps this cell idempotent: re-running it never scales a label column.
feature_cols = [col for col in RS_data.columns if col != 'Cluster']

# Fit the scaler on RS only and reuse the same mean/std for WS so that both
# samples live in the same standardized space.
scaler = StandardScaler()
RS_scaled = scaler.fit_transform(RS_data[feature_cols])
WS_scaled = scaler.transform(WS_data[feature_cols])

## Apply HDBSCAN on RS

Cluster the reference sample using HDBSCAN.

In [None]:
# Fit HDBSCAN on the standardized reference sample
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
clusterer.fit(RS_scaled)
RS_labels = clusterer.labels_

# Store each RS row's cluster label next to its features
RS_data['Cluster'] = RS_labels

## Calculate Centroids

Calculate the centroids of the clusters in the reference sample.

In [None]:
# Calculate centroids as the per-cluster mean of every feature column.
# HDBSCAN labels noise points with -1; they do not form a cluster, so they
# are left out before averaging.
clustered_points = RS_data[RS_data['Cluster'] != -1]

# groupby() moves 'Cluster' into the index of the result, so no explicit drop
# of a 'Cluster' column is needed afterwards.
# The centroids are in the original feature units, not the standardized ones.
centroids = clustered_points.groupby('Cluster').mean()
centroids

## Apply Centroids to WS

Assign each point in the working sample to the nearest centroid.

In [None]:
# Assign each point in WS to the nearest centroid
def assign_to_centroid(point, centroids):
    """Return the row position of the centroid closest to `point`.

    Parameters
    ----------
    point : 1-D array of feature values.
    centroids : 2-D array with one centroid per row, in the same feature
        order and scale as `point`.

    Returns
    -------
    The position (row number in `centroids`, not a cluster label) of the
    centroid with the smallest Euclidean distance; the first one wins on ties.
    """
    offsets = centroids - point
    nearest = np.argmin(np.linalg.norm(offsets, axis=1))
    return nearest

# Noise (label -1) is not a cluster, so it must never be an assignment target.
cluster_centroids = centroids[centroids.index != -1]
if cluster_centroids.empty:
    raise ValueError('No clusters available: cannot assign WS points to a centroid')

# The centroids are means of the unscaled RS features, while WS_scaled is
# standardized. Bring the centroids into the same space before measuring
# distances (standardization is linear, so this equals the mean of the
# scaled cluster members).
centroids_scaled = scaler.transform(cluster_centroids)

# WS_scaled is a NumPy array (the output of scaler.transform), so its rows are
# iterated directly; arrays have no DataFrame-style `.apply`.
nearest_positions = np.asarray(
    [assign_to_centroid(point, centroids_scaled) for point in WS_scaled],
    dtype=int,
)

# assign_to_centroid returns a row position; translate it into the actual
# cluster label carried by the centroid index.
WS_data['Assigned Cluster'] = cluster_centroids.index.to_numpy()[nearest_positions]
WS_data.head()

## Visualize the Results

Visualize the clustering results.

In [None]:
# Plot RS (default markers, coloured by HDBSCAN label) and WS (crosses,
# coloured by assigned cluster) using the first two columns of each frame
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(RS_data.iloc[:, 0], RS_data.iloc[:, 1], c=RS_labels, cmap='viridis', label='RS Clusters')
ax.scatter(WS_data.iloc[:, 0], WS_data.iloc[:, 1], c=WS_data['Assigned Cluster'], cmap='plasma', marker='x', label='WS Assigned Clusters')
ax.legend()
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('HDBSCAN Clustering on RS and Applying Centroids to WS')
plt.show()