In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import umap
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load dataset
df = pd.read_excel("sample_customer_database_5000_singapore.xlsx")
print("✅ Dataset loaded:", df.shape)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


✅ Dataset loaded: (5000, 9)


In [6]:
df['Date Joined'] = pd.to_datetime(df['Date Joined'], errors='coerce')

# Today's date
today = pd.to_datetime("today")

# Feature Engineering from Date_Joined
df['Days_Since_Joined'] = (today - df['Date Joined']).dt.days
df['Join_Year'] = df['Date Joined'].dt.year
df['Join_Month'] = df['Date Joined'].dt.month
df['Join_Quarter'] = df['Date Joined'].dt.quarter
df['Is_New'] = (df['Days_Since_Joined'] <= 180).astype(int)

In [3]:
#text preporcessing 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load data
df = pd.read_excel("sample_customer_database_5000_singapore.xlsx")

# Clean Notes (for all methods)
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

df['Cleaned_Notes'] = df['Notes'].apply(preprocess_text)

# Save for reuse
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']

#Preview
print(df[['Notes', 'Cleaned_Notes']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               Notes  \
0                        Together range line beyond.   
1  Language ball floor meet usually board necessary.   
2                 Support time operation wear often.   
3                                  Stage plant view.   
4          Job article level others record hospital.   

                                      Cleaned_Notes  
0                        together range line beyond  
1  language ball floor meet usually board necessary  
2                 support time operation wear often  
3                                  stage plant view  
4          job article level others record hospital  


In [4]:
# Column Setup
categorical_cols = ['Location', 'Gender', 'Loyalty Tier', 'Join_Year', 'Join_Month', 'Join_Quarter', 'Is_New']
numerical_cols = ['Days_Since_Joined']

In [7]:
 #Model 1: Spectral Clustering (Text + Categorical)
# --- Text Embedding using Word2Vec ---
from gensim.models import Word2Vec
from sklearn.cluster import SpectralClustering

stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Cleaned_Notes'].apply(lambda x: [t for t in word_tokenize(str(x)) if t.lower() not in stop_words])
w2v_model = Word2Vec(df['Tokens'], vector_size=150, window=5, min_count=1, workers=4)

def average_vector(tokens, model, size=150):
    valid = [t for t in tokens if t in model.wv]
    return np.mean([model.wv[t] for t in valid], axis=0) if valid else np.zeros(size)

df['Text_Embeddings'] = df['Tokens'].apply(lambda x: average_vector(x, w2v_model))
X_text = np.vstack(df['Text_Embeddings'])
X_umap = umap.UMAP(n_neighbors=30, min_dist=0.1, n_components=10).fit_transform(X_text)

df['Spectral_Text_Label'] = SpectralClustering(n_clusters=7, affinity='nearest_neighbors').fit_predict(X_umap)

# --- Metrics ---
print("✅ Model 1 (Text):")
print("Silhouette:", silhouette_score(X_umap, df['Spectral_Text_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(X_umap, df['Spectral_Text_Label']))
print("Davies-Bouldin:", davies_bouldin_score(X_umap, df['Spectral_Text_Label']))

# --- Categorical Spectral ---
from sklearn.metrics.pairwise import rbf_kernel
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])
affinity_matrix = rbf_kernel(cat_encoded, gamma=0.5)
df['Spectral_Cat_Label'] = SpectralClustering(n_clusters=5, affinity='precomputed').fit_predict(affinity_matrix)

print("✅ Model 1 (Cat):")
print("Silhouette:", silhouette_score(cat_encoded, df['Spectral_Cat_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(cat_encoded, df['Spectral_Cat_Label']))
print("Davies-Bouldin:", davies_bouldin_score(cat_encoded, df['Spectral_Cat_Label']))




✅ Model 1 (Text):
Silhouette: 0.49039653
Calinski-Harabasz: 17360.67001811075
Davies-Bouldin: 0.500203221099842
✅ Model 1 (Cat):
Silhouette: 0.149256281535586
Calinski-Harabasz: 418.8640435916867
Davies-Bouldin: 2.2547869981359354


In [9]:
print("📊 Model 1 (Spectral) - Text: ", len(np.unique(df['Spectral_Text_Label'])))
print("📊 Model 1 (Spectral) - Categorical: ", len(np.unique(df['Spectral_Cat_Label'])))


📊 Model 1 (Spectral) - Text:  7
📊 Model 1 (Spectral) - Categorical:  5


In [10]:
import numpy as np
import time
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score

# --- Helper for CV + Timing ---
def evaluate_with_timing(func, n_runs=5):
    sil_scores = []
    ch_scores = []
    db_scores = []
    durations = []
    for i in range(n_runs):
        start = time.time()
        sil, ch, db = func()
        end = time.time()
        sil_scores.append(sil)
        ch_scores.append(ch)
        db_scores.append(db)
        durations.append(end - start)
    return (
        np.mean(sil_scores),
        np.mean(ch_scores),
        np.mean(db_scores),
        np.mean(durations),
        np.std(sil_scores),
        np.std(ch_scores),
        np.std(db_scores)
    )

In [11]:
# --- Model 1: Spectral Clustering (Text) ---
def run_model1_spectral_text():
    labels = SpectralClustering(n_clusters=7, affinity='nearest_neighbors', random_state=42).fit_predict(X_umap)
    sil = silhouette_score(X_umap, labels)
    ch = calinski_harabasz_score(X_umap, labels)
    db = davies_bouldin_score(X_umap, labels)
    return sil, ch, db

# --- Model 1: Spectral Clustering (Categorical) ---
def run_model1_spectral_cat():
    labels = SpectralClustering(n_clusters=5, affinity='precomputed').fit_predict(affinity_matrix)
    sil = silhouette_score(cat_encoded, labels)
    ch = calinski_harabasz_score(cat_encoded, labels)
    db = davies_bouldin_score(cat_encoded, labels)
    return sil, ch, db

# Run CV for both
text_sil, text_ch, text_db, text_time, text_sil_std, text_ch_std, text_db_std = evaluate_with_timing(run_model1_spectral_text, n_runs=5)
cat_sil, cat_ch, cat_db, cat_time, cat_sil_std, cat_ch_std, cat_db_std = evaluate_with_timing(run_model1_spectral_cat, n_runs=5)

# Print results
print("✅ Model 1 - Text (Spectral)")
print(f"• Avg Silhouette Score: {text_sil:.4f} (±{text_sil_std:.4f})")
print(f"• Avg Calinski-Harabasz Score: {text_ch:.2f} (±{text_ch_std:.2f})")
print(f"• Avg Davies-Bouldin Score: {text_db:.4f} (±{text_db_std:.4f})")
print(f"• Avg Time: {text_time:.2f} seconds")

print("\n✅ Model 1 - Categorical (Spectral)")
print(f"• Avg Silhouette Score: {cat_sil:.4f} (±{cat_sil_std:.4f})")
print(f"• Avg Calinski-Harabasz Score: {cat_ch:.2f} (±{cat_ch_std:.2f})")
print(f"• Avg Davies-Bouldin Score: {cat_db:.4f} (±{cat_db_std:.4f})")
print(f"• Avg Time: {cat_time:.2f} seconds")


✅ Model 1 - Text (Spectral)
• Avg Silhouette Score: 0.4902 (±0.0000)
• Avg Calinski-Harabasz Score: 17328.68 (±0.00)
• Avg Davies-Bouldin Score: 0.5001 (±0.0000)
• Avg Time: 1.58 seconds

✅ Model 1 - Categorical (Spectral)
• Avg Silhouette Score: 0.1493 (±0.0000)
• Avg Calinski-Harabasz Score: 418.86 (±0.00)
• Avg Davies-Bouldin Score: 2.2548 (±0.0000)
• Avg Time: 3.70 seconds


In [12]:
text_sil1, text_ch1, text_db1, text_time1, text_sil_std1, text_ch_std1, text_db_std1 = evaluate_with_timing(run_model1_spectral_text, n_runs=5)
cat_sil1, cat_ch1, cat_db1, cat_time1, cat_sil_std1, cat_ch_std1, cat_db_std1 = evaluate_with_timing(run_model1_spectral_cat, n_runs=5)

text_results1 = (text_sil1, text_ch1, text_db1, text_time1)
cat_results1 = (cat_sil1, cat_ch1, cat_db1, cat_time1)


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from hdbscan.validity import validity_index
import hdbscan
import umap
import numpy as np

# ----------- TEXT CLUSTERING ----------- #
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Cleaned_Notes'].astype(str))
X_embed_text = umap.UMAP(n_neighbors=15, min_dist=0.1).fit_transform(X_tfidf)

text_clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
text_labels = text_clusterer.fit_predict(X_embed_text)
df['HDBSCAN_Text_Label'] = text_labels

# METRICS (TEXT)
mask_text = text_labels != -1
if len(np.unique(text_labels[mask_text])) > 1:
    X_embed_text_64 = X_embed_text.astype(np.float64)
    dbcv_text = validity_index(X_embed_text_64, text_labels)
    ch_text = calinski_harabasz_score(X_embed_text[mask_text], text_labels[mask_text])
    db_text = davies_bouldin_score(X_embed_text[mask_text], text_labels[mask_text])
    print("✅ Model 2 (Text - HDBSCAN):")
    print(f"• DBCV Score: {dbcv_text:.4f}")
    print(f"• Calinski-Harabasz Score: {ch_text:.2f}")
    print(f"• Davies-Bouldin Score: {db_text:.4f}")
else:
    print("⚠️ Not enough valid clusters in HDBSCAN Text for metrics.")


# ----------- CATEGORICAL CLUSTERING ----------- #
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])
cat_embed = umap.UMAP(n_neighbors=15, min_dist=0.1).fit_transform(cat_encoded)

cat_clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
cat_labels = cat_clusterer.fit_predict(cat_embed)
df['HDBSCAN_Cat_Label'] = cat_labels

# METRICS (CATEGORICAL)
mask_cat = cat_labels != -1
if len(np.unique(cat_labels[mask_cat])) > 1:
    cat_embed_64 = cat_embed.astype(np.float64)
    dbcv_cat = validity_index(cat_embed_64, cat_labels)
    ch_cat = calinski_harabasz_score(cat_embed[mask_cat], cat_labels[mask_cat])
    db_cat = davies_bouldin_score(cat_embed[mask_cat], cat_labels[mask_cat])
    print("✅ Model 2 (Cat - HDBSCAN):")
    print(f"• DBCV Score: {dbcv_cat:.4f}")
    print(f"• Calinski-Harabasz Score: {ch_cat:.2f}")
    print(f"• Davies-Bouldin Score: {db_cat:.4f}")
else:
    print("⚠️ Not enough valid clusters in HDBSCAN Cat for metrics.")




✅ Model 2 (Text - HDBSCAN):
• DBCV Score: -0.6268
• Calinski-Harabasz Score: 61.28
• Davies-Bouldin Score: 0.5939




✅ Model 2 (Cat - HDBSCAN):
• DBCV Score: 0.9174
• Calinski-Harabasz Score: 2200417.56
• Davies-Bouldin Score: 0.1252


In [14]:
# Text
n_clusters_text = len(np.unique(df['HDBSCAN_Text_Label'])) - (1 if -1 in df['HDBSCAN_Text_Label'] else 0)
print("📊 Model 2 (HDBSCAN) - Text: ", n_clusters_text)

# Categorical
n_clusters_cat = len(np.unique(df['HDBSCAN_Cat_Label'])) - (1 if -1 in df['HDBSCAN_Cat_Label'] else 0)
print("📊 Model 2 (HDBSCAN) - Categorical: ", n_clusters_cat)


📊 Model 2 (HDBSCAN) - Text:  5
📊 Model 2 (HDBSCAN) - Categorical:  210


In [15]:
# --- Model 2: HDBSCAN (Text) ---
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Cleaned_Notes'].astype(str))
X_embed_text = umap.UMAP(n_neighbors=15, min_dist=0.1).fit_transform(X_tfidf)

def run_model2_hdbscan_text():
    labels = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(X_embed_text)
    mask = labels != -1
    if len(np.unique(labels[mask])) > 1:
        dbcv = validity_index(X_embed_text.astype(np.float64), labels)
        ch = calinski_harabasz_score(X_embed_text[mask], labels[mask])
        db = davies_bouldin_score(X_embed_text[mask], labels[mask])
        return dbcv, ch, db
    return 0, 0, 0

# --- Model 2: HDBSCAN (Categorical) ---
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[['Location', 'Gender', 'Loyalty Tier', 'Join_Year', 'Join_Month', 'Join_Quarter', 'Is_New']])
cat_embed = umap.UMAP(n_neighbors=15, min_dist=0.1).fit_transform(cat_encoded)

def run_model2_hdbscan_cat():
    labels = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(cat_embed)
    mask = labels != -1
    if len(np.unique(labels[mask])) > 1:
        dbcv = validity_index(cat_embed.astype(np.float64), labels)
        ch = calinski_harabasz_score(cat_embed[mask], labels[mask])
        db = davies_bouldin_score(cat_embed[mask], labels[mask])
        return dbcv, ch, db
    return 0, 0, 0

# Run CV for Model 2
text_sil2, text_ch2, text_db2, text_time2, text_sil_std2, text_ch_std2, text_db_std2 = evaluate_with_timing(run_model2_hdbscan_text, n_runs=5)
cat_sil2, cat_ch2, cat_db2, cat_time2, cat_sil_std2, cat_ch_std2, cat_db_std2 = evaluate_with_timing(run_model2_hdbscan_cat, n_runs=5)

print("\n✅ Model 2 - Text (HDBSCAN)")
print(f"• Avg DBCV Score: {text_sil2:.4f} (±{text_sil_std2:.4f})")
print(f"• Avg Calinski-Harabasz Score: {text_ch2:.2f} (±{text_ch_std2:.2f})")
print(f"• Avg Davies-Bouldin Score: {text_db2:.4f} (±{text_db_std2:.4f})")
print(f"• Avg Time: {text_time2:.2f} seconds")

print("\n✅ Model 2 - Categorical (HDBSCAN)")
print(f"• Avg DBCV Score: {cat_sil2:.4f} (±{cat_sil_std2:.4f})")
print(f"• Avg Calinski-Harabasz Score: {cat_ch2:.2f} (±{cat_ch_std2:.2f})")
print(f"• Avg Davies-Bouldin Score: {cat_db2:.4f} (±{cat_db_std2:.4f})")
print(f"• Avg Time: {cat_time2:.2f} seconds")





✅ Model 2 - Text (HDBSCAN)
• Avg DBCV Score: -0.5680 (±0.0000)
• Avg Calinski-Harabasz Score: 58.97 (±0.00)
• Avg Davies-Bouldin Score: 0.6259 (±0.0000)
• Avg Time: 11.08 seconds

✅ Model 2 - Categorical (HDBSCAN)
• Avg DBCV Score: 0.9071 (±0.0000)
• Avg Calinski-Harabasz Score: 1698410.62 (±0.00)
• Avg Davies-Bouldin Score: 0.1189 (±0.0000)
• Avg Time: 9.59 seconds


In [16]:
# Get final labels for cluster count
hdbscan_labels_text = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(X_embed_text)
hdbscan_labels_cat = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(cat_embed)

# Count number of clusters (excluding noise label -1)
n_clusters_text2 = len(set(hdbscan_labels_text)) - (1 if -1 in hdbscan_labels_text else 0)
n_clusters_cat2 = len(set(hdbscan_labels_cat)) - (1 if -1 in hdbscan_labels_cat else 0)

print(f"Model 2 - Text: {n_clusters_text2} clusters")
print(f"Model 2 - Categorical: {n_clusters_cat2} clusters")




Model 2 - Text: 4 clusters
Model 2 - Categorical: 209 clusters




In [17]:
# Ensure this runs before Model 3
df['Date Joined'] = pd.to_datetime(df['Date Joined'])
df['Days_Since_Joined'] = (pd.Timestamp.today() - df['Date Joined']).dt.days


In [18]:
#model 3
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

# --- BERT for text ---
bert = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeds = bert.encode(df['Cleaned_Notes'].astype(str).tolist())
sim_text = cosine_similarity(bert_embeds)

# --- Jaccard for categorical ---
inter = np.dot(cat_encoded, cat_encoded.T)
rowsum = cat_encoded.sum(axis=1)
union = rowsum[:, None] + rowsum - inter
sim_cat = inter / np.maximum(union, 1e-10)

# --- Combine Similarity Matrices ---
alpha = 0.5
S = alpha * sim_text + (1 - alpha) * sim_cat
dist = 1 - S

# --- Agglomerative Clustering ---
agglo = AgglomerativeClustering(n_clusters=5, metric='precomputed', linkage='average')
df['Hybrid_Agglo_Label'] = agglo.fit_predict(dist)

# --- Evaluation on Combined Encoded Features ---
scaled_num = StandardScaler().fit_transform(df[numerical_cols])
combined = np.hstack([cat_encoded, scaled_num])
print("✅ Model 3 (Hybrid):")
print("Silhouette:", silhouette_score(combined, df['Hybrid_Agglo_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(combined, df['Hybrid_Agglo_Label']))
print("Davies-Bouldin:", davies_bouldin_score(combined, df['Hybrid_Agglo_Label']))


✅ Model 3 (Hybrid):
Silhouette: 0.12741052792524693
Calinski-Harabasz: 420.8231533774878
Davies-Bouldin: 2.3096629716962687


In [19]:
print("📊 Model 3 (Hybrid Agglomerative): ", len(np.unique(df['Hybrid_Agglo_Label'])))


📊 Model 3 (Hybrid Agglomerative):  5


In [20]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import time

# Function to run model
def run_model3_agglo():
    labels = AgglomerativeClustering(n_clusters=5, metric='precomputed', linkage='average').fit_predict(dist)
    scaled_num = StandardScaler().fit_transform(df[numerical_cols])
    combined = np.hstack([cat_encoded, scaled_num])
    sil = silhouette_score(combined, labels)
    ch = calinski_harabasz_score(combined, labels)
    db = davies_bouldin_score(combined, labels)
    return sil, ch, db

# CV Evaluation function
def evaluate_with_timing(func, n_runs=5):
    sils, chs, dbs, times = [], [], [], []
    for _ in range(n_runs):
        start = time.time()
        sil, ch, db = func()
        end = time.time()
        sils.append(sil)
        chs.append(ch)
        dbs.append(db)
        times.append(end - start)
    return (
        np.mean(sils), np.mean(chs), np.mean(dbs),
        np.mean(times),
        np.std(sils), np.std(chs), np.std(dbs)
    )

# Run CV
sil3, ch3, db3, time3, sil3_std, ch3_std, db3_std = evaluate_with_timing(run_model3_agglo, n_runs=5)

# Print Results
print("\n✅ Model 3 - Hybrid (Agglomerative)")
print(f"• Avg Silhouette Score: {sil3:.4f} (±{sil3_std:.4f})")
print(f"• Avg Calinski-Harabasz Score: {ch3:.2f} (±{ch3_std:.2f})")
print(f"• Avg Davies-Bouldin Score: {db3:.4f} (±{db3_std:.4f})")
print(f"• Avg Time: {time3:.2f} seconds")



✅ Model 3 - Hybrid (Agglomerative)
• Avg Silhouette Score: 0.1274 (±0.0000)
• Avg Calinski-Harabasz Score: 420.82 (±0.00)
• Avg Davies-Bouldin Score: 2.3097 (±0.0000)
• Avg Time: 1.40 seconds


In [21]:
!pip install tabulate





[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
from hdbscan.validity import validity_index
import pandas as pd
from tabulate import tabulate

# Step 1: Model results using your variable names
models_data = [
    ("Model 1 - Text (Spectral)", 7, text_sil, text_sil_std, text_ch, text_ch_std, text_db, text_db_std, text_time, "-"),
    ("Model 1 - Cat (Spectral)", 5, cat_sil, cat_sil_std, cat_ch, cat_ch_std, cat_db, cat_db_std, cat_time, "-"),
    ("Model 2 - Text (HDBSCAN)", len(np.unique(text_labels[text_labels != -1])), text_sil2, text_sil_std2, ch_text, 0, db_text, 0, text_time2, dbcv_text),
    ("Model 2 - Cat (HDBSCAN)", len(np.unique(cat_labels[cat_labels != -1])), cat_sil2, cat_sil_std2, ch_cat, 0, db_cat, 0, cat_time2, dbcv_cat),
    ("Model 3 - Hybrid (Agglomerative)", 5, sil3, sil3_std, ch3, ch3_std, db3, db3_std, time3, "-")
]

# Step 2: Create DataFrame
df_models = pd.DataFrame(models_data, columns=[
    "Model", "No. of Clusters",
    "Silhouette", "Sil Std",
    "Calinski-Harabasz", "CH Std",
    "Davies-Bouldin", "DB Std",
    "Time (s)", "DBCV Score"
])

# Step 3: Format for pretty display
df_models["Silhouette (±SD)"] = df_models.apply(lambda x: f"{x['Silhouette']:.4f} (±{x['Sil Std']:.4f})", axis=1)
df_models["CH Score (±SD)"] = df_models.apply(lambda x: f"{x['Calinski-Harabasz']:.2f} (±{x['CH Std']:.2f})", axis=1)
df_models["DB Score (±SD)"] = df_models.apply(lambda x: f"{x['Davies-Bouldin']:.4f} (±{x['DB Std']:.4f})", axis=1)
df_models["Time (s)"] = df_models["Time (s)"].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float)) else x)
df_models["DBCV Score"] = df_models["DBCV Score"].apply(lambda x: f"{x:.4f}" if isinstance(x, float) else x)

# Step 4: Final formatted display
final_table = df_models[[  
    "Model", "No. of Clusters",  
    "Silhouette (±SD)", "CH Score (±SD)", "DB Score (±SD)",  
    "DBCV Score", "Time (s)"  
]]

print("📊 Model Comparison Table:\n")
print(tabulate(final_table, headers='keys', tablefmt='pretty'))


📊 Model Comparison Table:

+---+----------------------------------+-----------------+-------------------+--------------------+------------------+------------+----------+
|   |              Model               | No. of Clusters | Silhouette (±SD)  |   CH Score (±SD)   |  DB Score (±SD)  | DBCV Score | Time (s) |
+---+----------------------------------+-----------------+-------------------+--------------------+------------------+------------+----------+
| 0 |    Model 1 - Text (Spectral)     |        7        | 0.4902 (±0.0000)  |  17328.68 (±0.00)  | 0.5001 (±0.0000) |     -      |   1.58   |
| 1 |     Model 1 - Cat (Spectral)     |        5        | 0.1493 (±0.0000)  |   418.86 (±0.00)   | 2.2548 (±0.0000) |     -      |   3.70   |
| 2 |     Model 2 - Text (HDBSCAN)     |        4        | -0.5680 (±0.0000) |   61.28 (±0.00)    | 0.5939 (±0.0000) |  -0.6268   |  11.08   |
| 3 |     Model 2 - Cat (HDBSCAN)      |       209       | 0.9071 (±0.0000)  | 2200417.56 (±0.00) | 0.1252 (±0.0000

In [23]:
import pandas as pd
from tabulate import tabulate

# Model results: (Model Name, No. of Clusters, Silhouette, Sil Std, CH, CH Std, DB, DB Std, Time)
models_data = [
    ("Model 1 - Text (Spectral)", 7, text_sil, text_sil_std, text_ch, text_ch_std, text_db, text_db_std, text_time),
    ("Model 1 - Cat (Spectral)", 5, cat_sil, cat_sil_std, cat_ch, cat_ch_std, cat_db, cat_db_std, cat_time),
    ("Model 2 - Text (HDBSCAN)", n_clusters_text2, text_sil2, text_sil_std2, text_ch2, text_ch_std2, text_db2, text_db_std2, text_time2),
    ("Model 2 - Cat (HDBSCAN)", n_clusters_cat2, cat_sil2, cat_sil_std2, cat_ch2, cat_ch_std2, cat_db2, cat_db_std2, cat_time2),
    ("Model 3 - Hybrid (Agglomerative)", 5, sil3, sil3_std, ch3, ch3_std, db3, db3_std, time3)
]

# Create DataFrame
df_models = pd.DataFrame(models_data, columns=[
    "Model", "No. of Clusters",
    "Silhouette", "Sil Std",
    "Calinski-Harabasz", "CH Std",
    "Davies-Bouldin", "DB Std",
    "Time (s)"
])

# Format for pretty display
df_models["Silhouette (±SD)"] = df_models.apply(lambda x: f"{x['Silhouette']:.4f} (±{x['Sil Std']:.4f})", axis=1)
df_models["CH Score (±SD)"] = df_models.apply(lambda x: f"{x['Calinski-Harabasz']:.2f} (±{x['CH Std']:.2f})", axis=1)
df_models["DB Score (±SD)"] = df_models.apply(lambda x: f"{x['Davies-Bouldin']:.4f} (±{x['DB Std']:.4f})", axis=1)
df_models["Time (s)"] = df_models["Time (s)"].apply(lambda x: f"{x:.2f}")

# Final formatted display
final_table = df_models[[
    "Model", "No. of Clusters",
    "Silhouette (±SD)", "CH Score (±SD)", "DB Score (±SD)", "Time (s)"
]]

# Display with tabulate
print("📊 Model Comparison Table:\n")
print(tabulate(final_table, headers='keys', tablefmt='pretty'))


📊 Model Comparison Table:

+---+----------------------------------+-----------------+-------------------+--------------------+------------------+----------+
|   |              Model               | No. of Clusters | Silhouette (±SD)  |   CH Score (±SD)   |  DB Score (±SD)  | Time (s) |
+---+----------------------------------+-----------------+-------------------+--------------------+------------------+----------+
| 0 |    Model 1 - Text (Spectral)     |        7        | 0.4902 (±0.0000)  |  17328.68 (±0.00)  | 0.5001 (±0.0000) |   1.58   |
| 1 |     Model 1 - Cat (Spectral)     |        5        | 0.1493 (±0.0000)  |   418.86 (±0.00)   | 2.2548 (±0.0000) |   3.70   |
| 2 |     Model 2 - Text (HDBSCAN)     |        4        | -0.5680 (±0.0000) |   58.97 (±0.00)    | 0.6259 (±0.0000) |  11.08   |
| 3 |     Model 2 - Cat (HDBSCAN)      |       209       | 0.9071 (±0.0000)  | 1698410.62 (±0.00) | 0.1189 (±0.0000) |   9.59   |
| 4 | Model 3 - Hybrid (Agglomerative) |        5        | 0.12