# 🧠 Symptom Clustering with UMAP + HDBSCAN
Welcome to the Lifespan Health AI Workshop notebook!

In this notebook, you'll explore synthetic Long COVID symptom data using unsupervised machine learning.
We'll use UMAP to reduce the data to two dimensions and HDBSCAN to cluster the resulting embedding.

In [None]:
import pandas as pd
import umap
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns

# For consistent plots: apply seaborn's 'whitegrid' style once, up front, so every
# figure drawn later in the notebook shares the same look.
sns.set(style='whitegrid')

## 📥 Step 1: Load the dataset

In [None]:
# Read the workshop CSV into a DataFrame (point this at a different file to try
# another dataset).
df = pd.read_csv(filepath_or_buffer='synthetic_dataset_1.csv')

# Sanity check: show the first five rows
df.head(n=5)

## 🔧 Step 2: Prepare the data for clustering

In [None]:
# Build the feature matrix: every column except 'PatientID' and 'TrueCluster'.
# 'TrueCluster' holds the reference labels compared against in Step 6, so it must
# not be fed to the clustering; 'PatientID' is presumably a row identifier rather
# than a symptom feature (confirm against the dataset).
X = df.drop(labels=['PatientID', 'TrueCluster'], axis='columns')

## 📍 Step 3: Run UMAP for dimensionality reduction

In [None]:
# Reduce the feature matrix to two dimensions. n_components=2 is UMAP's default,
# spelled out here because the plotting cell names exactly two embedding columns.
# The fixed random_state is there to make the embedding repeatable across runs.
reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(X)

## 📍 Step 4: Apply HDBSCAN for clustering

In [None]:
# Cluster the points in the UMAP embedding (not the raw features).
# min_cluster_size=20 is the smallest group HDBSCAN will report as a cluster.
clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
clusterer.fit(embedding)

# One integer label per row of `embedding`; the plotting cell treats -1 as the
# special "noise / unassigned" label rather than a real cluster.
labels = clusterer.labels_

## 📊 Step 5: Visualise the clusters

In [None]:
# Create DataFrame with results: the two UMAP coordinates of every row plus the
# HDBSCAN label assigned to it.
results = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])
results['Cluster'] = labels

# Build an explicit label -> colour mapping with exactly one entry per hue level.
# A palette *list* sized to the number of real clusters is one colour short
# whenever the noise label (-1) is present, because 'Cluster' is used as the hue
# and -1 is one of its levels; seaborn then either rejects the palette or recycles
# a colour, depending on the release. Noise gets a neutral grey so it is not
# mistaken for a cluster.
NOISE_LABEL = -1
present_labels = set(results['Cluster'].tolist())
cluster_ids = sorted(present_labels - {NOISE_LABEL})
if cluster_ids:
    cluster_colours = sns.color_palette('Spectral', n_colors=len(cluster_ids))
    palette = dict(zip(cluster_ids, cluster_colours))
else:
    # Every point was labelled as noise: there are no cluster colours to generate.
    palette = {}
if NOISE_LABEL in present_labels:
    palette[NOISE_LABEL] = 'lightgrey'

# Plot on an explicit Axes so the title and legend are attached to this figure
# rather than to whatever pyplot currently considers "active".
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=results, x='UMAP1', y='UMAP2', hue='Cluster', palette=palette, s=40, ax=ax)
ax.set_title('Symptom Clusters Identified by HDBSCAN')
ax.legend(title='Cluster')
plt.show()

## 🧪 Step 6 (Optional): Compare with true cluster labels

In [None]:
# Put the reference labels from the dataset next to the HDBSCAN assignments.
# The assignment aligns on the row index of `results` and `df`.
results['TrueCluster'] = df['TrueCluster']

# One group of bars per true cluster; within each group, one bar per HDBSCAN
# label showing how many rows received that label.
ax = sns.countplot(x='TrueCluster', hue='Cluster', data=results)
ax.set_title('True Cluster vs. HDBSCAN Labels')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()