In [2]:
import pandas as pd

# Load the dataset
df = pd.read_excel('sample_customer_database_5000_singapore.xlsx')


In [3]:
# First look at the data
print(df.head())  # First 5 rows

  Customer ID        Full Name                Email Address  Phone Number  \
0       C0001     Norma Fisher          ysullivan@yahoo.com      82421948   
1       C0002      Levi Durham            qgrimes@gmail.com      97535139   
2       C0003   Kimberly Olsen  sean96@johnston-roberts.com      71122018   
3       C0004   Matthew Davies    nguyendarrell@hotmail.com      41352560   
4       C0005  Angela Martinez    myersmitchell@johnson.com        869141   

  Date Joined     Location  Gender Loyalty Tier  \
0  2023-08-11     Tampines  Female     Platinum   
1  2022-11-24      Geylang  Female     Platinum   
2  2023-06-19     Tampines  Female     Platinum   
3  2025-04-04   Ang Mo Kio    Male       Silver   
4  2025-01-15  Bukit Batok  Female     Platinum   

                                               Notes  
0                        Together range line beyond.  
1  Language ball floor meet usually board necessary.  
2                 Support time operation wear often.  
3         

In [4]:
import numpy
import pandas

print("NumPy version:", numpy.__version__)
print("Pandas version:", pandas.__version__)


NumPy version: 1.26.4
Pandas version: 2.2.3


In [5]:
# Shape of the dataset
print("Shape of dataset:", df.shape)

Shape of dataset: (5000, 9)


In [6]:
# Columns and Data types
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Customer ID    5000 non-null   object        
 1   Full Name      5000 non-null   object        
 2   Email Address  5000 non-null   object        
 3   Phone Number   5000 non-null   int64         
 4   Date Joined    5000 non-null   datetime64[ns]
 5   Location       5000 non-null   object        
 6   Gender         5000 non-null   object        
 7   Loyalty Tier   5000 non-null   object        
 8   Notes          5000 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 351.7+ KB
None


In [7]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 Customer ID      0
Full Name        0
Email Address    0
Phone Number     0
Date Joined      0
Location         0
Gender           0
Loyalty Tier     0
Notes            0
dtype: int64


In [8]:
# Check for duplicate rows
print("Number of duplicate rows:", df.duplicated().sum())

Number of duplicate rows: 0


In [9]:
# Unique values per column
print("Unique values per column:\n", df.nunique())

Unique values per column:
 Customer ID      5000
Full Name        4835
Email Address    4983
Phone Number     4998
Date Joined      1084
Location           27
Gender              2
Loyalty Tier        3
Notes            5000
dtype: int64


In [10]:
#text preporcessing 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load data
df = pd.read_excel("sample_customer_database_5000_singapore.xlsx")

# Clean Notes (for all methods)
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

df['Cleaned_Notes'] = df['Notes'].apply(preprocess_text)

# Save for reuse
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']

#Preview
print(df[['Notes', 'Cleaned_Notes']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               Notes  \
0                        Together range line beyond.   
1  Language ball floor meet usually board necessary.   
2                 Support time operation wear often.   
3                                  Stage plant view.   
4          Job article level others record hospital.   

                                      Cleaned_Notes  
0                        together range line beyond  
1  language ball floor meet usually board necessary  
2                 support time operation wear often  
3                                  stage plant view  
4          job article level others record hospital  


In [None]:
#model 1:Model 1: Spectral Clustering with Word2Vec for Text + One-Hot for Categorical (Separate)
# #text clustering
from gensim.models import Word2Vec
import numpy as np
from sklearn.cluster import SpectralClustering

df['Tokens'] = df['Cleaned_Notes'].apply(word_tokenize)
w2v_model = Word2Vec(df['Tokens'], vector_size=100, window=5, min_count=1, workers=4)

def average_vector(tokens, model, size=100):
    valid_tokens = [token for token in tokens if token in model.wv]
    if not valid_tokens:
        return np.zeros(size)
    return np.mean([model.wv[token] for token in valid_tokens], axis=0)

df['Text_Embeddings'] = df['Tokens'].apply(lambda x: average_vector(x, w2v_model))

X_text = np.vstack(df['Text_Embeddings'].values)
spectral = SpectralClustering(n_clusters=5, affinity='nearest_neighbors', random_state=42)
df['Text_Spectral_Label'] = spectral.fit_predict(X_text)


In [56]:
from gensim.models import Word2Vec
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import umap
import numpy as np

# Step 1: Token cleaning
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Cleaned_Notes'].apply(lambda x: [t for t in word_tokenize(x) if t not in stop_words])

# Step 2: Word2Vec (baseline)
w2v_model = Word2Vec(df['Tokens'], vector_size=150, window=5, min_count=1, workers=4)

def average_vector(tokens, model, size=150):
    valid_tokens = [token for token in tokens if token in model.wv]
    if not valid_tokens:
        return np.zeros(size)
    return np.mean([model.wv[token] for token in valid_tokens], axis=0)

df['Text_Embeddings'] = df['Tokens'].apply(lambda x: average_vector(x, w2v_model))
X_text = np.vstack(df['Text_Embeddings'].values)

# Step 3: UMAP
X_umap = umap.UMAP(n_neighbors=30, min_dist=0.1, n_components=10, random_state=42).fit_transform(X_text)

# Step 4: Spectral Clustering
spectral = SpectralClustering(n_clusters=7, affinity='nearest_neighbors', random_state=42)
df['Text_Spectral_Label'] = spectral.fit_predict(X_umap)

# Step 5: Evaluate
score = silhouette_score(X_umap, df['Text_Spectral_Label'])
print("✅ Model 1 - Spectral + UMAP (Improved)")
print("Silhouette Score:", score)


  warn(


✅ Model 1 - Spectral + UMAP (Improved)
Silhouette Score: 0.49111015


In [57]:
#attempt 3 for improbement 
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import umap
import numpy as np

# --- Step 1: Preprocess Tokens ---
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Cleaned_Notes'].apply(lambda x: [t.lower() for t in word_tokenize(x) if t.isalpha() and t.lower() not in stop_words])

# --- Step 2: Train Word2Vec Model ---
w2v_model = Word2Vec(sentences=df['Tokens'], vector_size=100, window=5, min_count=1, workers=4)

# --- Step 3: Compute TF-IDF Matrix ---
texts = df['Cleaned_Notes'].astype(str).tolist()
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
idf_dict = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# --- Step 4: Compute Weighted Average Embeddings ---
def weighted_avg_vector(tokens, model, idf_dict, size=100):
    vecs = []
    for token in tokens:
        if token in model.wv:
            weight = idf_dict.get(token, 1.0)
            vecs.append(model.wv[token] * weight)
    if not vecs:
        return np.zeros(size)
    return np.mean(vecs, axis=0)

df['Text_Embeddings'] = df['Tokens'].apply(lambda x: weighted_avg_vector(x, w2v_model, idf_dict))
X_text = np.vstack(df['Text_Embeddings'].values)

# --- Step 5: UMAP Dimensionality Reduction ---
X_umap = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=10, random_state=42).fit_transform(X_text)

# --- Step 6: Spectral Clustering ---
spectral = SpectralClustering(n_clusters=5, affinity='nearest_neighbors', random_state=42)
df['Text_Spectral_Label'] = spectral.fit_predict(X_umap)

# --- Step 7: Evaluation ---
score = silhouette_score(X_umap, df['Text_Spectral_Label'])
print("✅ Model 1 – Spectral with TF-IDF Weighted Word2Vec")
print("Silhouette Score:", score)


  warn(


✅ Model 1 – Spectral with TF-IDF Weighted Word2Vec
Silhouette Score: 0.419095


In [None]:
#improvised categorical clustering
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])

from sklearn.metrics.pairwise import rbf_kernel
affinity_matrix = rbf_kernel(cat_encoded, gamma=0.5)
df['Cat_Spectral_Label'] = SpectralClustering(n_clusters=5, affinity='precomputed').fit_predict(affinity_matrix)


In [13]:
# Display Spectral Clustering results for TEXT
print("📊 Spectral Clustering (Text) — Cluster Counts")
print(df['Text_Spectral_Label'].value_counts())

print("\n📝 Sample Notes per Text Cluster")
for i in sorted(df['Text_Spectral_Label'].unique()):
    print(f"\n--- Cluster {i} ---")
    print(df[df['Text_Spectral_Label'] == i]['Cleaned_Notes'].head(3).to_string(index=False))

# Display Spectral Clustering results for CATEGORICAL
print("\n📊 Spectral Clustering (Categorical) — Cluster Counts")
print(df['Cat_Spectral_Label'].value_counts())

print("\n🏷️ Sample Records per Categorical Cluster")
for i in sorted(df['Cat_Spectral_Label'].unique()):
    print(f"\n--- Cluster {i} ---")
    print(df[df['Cat_Spectral_Label'] == i][['Location', 'Gender', 'Loyalty Tier']].head(3).to_string(index=False))


📊 Spectral Clustering (Text) — Cluster Counts
Text_Spectral_Label
1    2174
4    1526
0     910
2     254
3     136
Name: count, dtype: int64

📝 Sample Notes per Text Cluster

--- Cluster 0 ---
                                  stage plant view
response purpose character would partner hit an...
                        know series lay smile away

--- Cluster 1 ---
              together range line beyond
job article level others record hospital
                           part cup read

--- Cluster 2 ---
        movie end discussion budget situation run
                                  time firm water
recently prepare scene house central baby picture

--- Cluster 3 ---
suffer without rather
     pm election case
 rather spend similar

--- Cluster 4 ---
language ball floor meet usually board necessary
               support time operation wear often
                            animal exactly drive

📊 Spectral Clustering (Categorical) — Cluster Counts
Cat_Spectral_Label
4    2809
1    199

In [22]:
#attemp for improv model 1 
# Text Embeddings with BERT
from sentence_transformers import SentenceTransformer
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_text = bert_model.encode(df['Cleaned_Notes'].astype(str).tolist(), show_progress_bar=True)

# Spectral Clustering on BERT Embeddings
from sklearn.cluster import SpectralClustering
spectral_text = SpectralClustering(n_clusters=5, affinity='nearest_neighbors', random_state=42)
df['Text_Spectral_Label'] = spectral_text.fit_predict(X_text)

# One-Hot Encoding + PCA for Categorical
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])

pca = PCA(n_components=10, random_state=42)
cat_pca = pca.fit_transform(cat_encoded)

# Spectral Clustering on Reduced Categorical Data
spectral_cat = SpectralClustering(n_clusters=5, affinity='nearest_neighbors', random_state=42)
df['Cat_Spectral_Label'] = spectral_cat.fit_predict(cat_pca)


Batches: 100%|██████████| 157/157 [00:06<00:00, 24.81it/s]


In [49]:
#Model 2: HDBSCAN with UMAP Embedding for Text + Cat (Separate)
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import hdbscan

# ---------- TEXT CLUSTERING ----------
# Step 1: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned_Notes'].astype(str))

# Step 2: UMAP Dimensionality Reduction
text_embed = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42).fit_transform(tfidf_matrix)

# Step 3: HDBSCAN Clustering
df['HDBSCAN_Text_Label'] = hdbscan.HDBSCAN(min_cluster_size=15).fit_predict(text_embed)

# ---------- CATEGORICAL CLUSTERING ----------
# Step 1: UMAP on One-Hot Encoded Data
cat_embed = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42).fit_transform(cat_encoded)

# Step 2: HDBSCAN Clustering
df['HDBSCAN_Cat_Label'] = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(cat_embed)

# ---------- Preview ----------
print(df[['Cleaned_Notes', 'HDBSCAN_Text_Label', 'HDBSCAN_Cat_Label']].head())



  warn(
  warn(


                                      Cleaned_Notes  HDBSCAN_Text_Label  \
0                        together range line beyond                   1   
1  language ball floor meet usually board necessary                   1   
2                 support time operation wear often                   1   
3                                  stage plant view                   1   
4          job article level others record hospital                   1   

   HDBSCAN_Cat_Label  
0                147  
1                155  
2                147  
3                 17  
4                137  




In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import umap
import hdbscan
import numpy as np

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned_Notes'].astype(str))

# Parameter grids
n_neighbors_list = [10, 15, 20]
min_dist_list = [0.05, 0.1, 0.2]
min_cluster_size_list = [8, 10, 12, 15]
min_samples_list = [1, 3, 5]

best_score = -1
best_params = None

print("🔍 Running auto-tuning for HDBSCAN text clustering...\n")

for n_neighbors in n_neighbors_list:
    for min_dist in min_dist_list:
        for min_cluster_size in min_cluster_size_list:
            for min_samples in min_samples_list:
                try:
                    # UMAP
                    text_embed = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=10, random_state=42).fit_transform(tfidf_matrix)

                    # HDBSCAN
                    labels = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples).fit_predict(text_embed)

                    # Skip if only one cluster or all noise
                    if len(set(labels)) <= 1 or np.sum(labels != -1) < 10:
                        continue

                    # Evaluate
                    score = silhouette_score(text_embed, labels)

                    # Track best
                    if score > best_score:
                        best_score = score
                        best_params = {
                            'n_neighbors': n_neighbors,
                            'min_dist': min_dist,
                            'min_cluster_size': min_cluster_size,
                            'min_samples': min_samples,
                            'silhouette_score': score
                        }

                        print(f"✅ New best: {best_params}")
                except Exception as e:
                    continue

print("\n🏁 Auto-tuning complete.")
print("Best config:", best_params)


🔍 Running auto-tuning for HDBSCAN text clustering...



  warn(


✅ New best: {'n_neighbors': 10, 'min_dist': 0.05, 'min_cluster_size': 8, 'min_samples': 1, 'silhouette_score': 0.3339218}


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(



🏁 Auto-tuning complete.
Best config: {'n_neighbors': 10, 'min_dist': 0.05, 'min_cluster_size': 8, 'min_samples': 1, 'silhouette_score': 0.3339218}


In [62]:
# Remove noise points (-1) before calculating metrics
text_mask = df['HDBSCAN_Text_Label'] != -1
cat_mask = df['HDBSCAN_Cat_Label'] != -1

print("Model 2 - Text (HDBSCAN)")
print("Silhouette:", silhouette_score(text_embed[text_mask], df.loc[text_mask, 'HDBSCAN_Text_Label']))

print("Model 2 - Cat (HDBSCAN)")
print("Silhouette:", silhouette_score(cat_embed[cat_mask], df.loc[cat_mask, 'HDBSCAN_Cat_Label']))


Model 2 - Text (HDBSCAN)
Silhouette: 0.015781965
Model 2 - Cat (HDBSCAN)
Silhouette: 0.9875996


In [63]:
#improvemetattemplt 
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan

# ========== TEXT CLUSTERING ==========
# Step 1: BERT Embeddings
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = bert_model.encode(df['Cleaned_Notes'].astype(str).tolist(), show_progress_bar=True)

# Step 2: UMAP on BERT Embeddings
text_umap = UMAP(n_neighbors=15, min_dist=0.1, n_components=10, random_state=42).fit_transform(bert_embeddings)

# Step 3: HDBSCAN Clustering (tuned)
hdbscan_text = hdbscan.HDBSCAN(min_cluster_size=8, min_samples=5)
df['HDBSCAN_Text_Label'] = hdbscan_text.fit_predict(text_umap)

# ========== CATEGORICAL CLUSTERING ==========
# Step 1: UMAP on One-Hot Encoded Categorical Data
cat_umap = UMAP(n_neighbors=15, min_dist=0.1, n_components=10, random_state=42).fit_transform(cat_encoded)

# Step 2: HDBSCAN Clustering (tuned)
hdbscan_cat = hdbscan.HDBSCAN(min_cluster_size=8, min_samples=5)
df['HDBSCAN_Cat_Label'] = hdbscan_cat.fit_predict(cat_umap)

# ========== Preview ==========
print(df[['Cleaned_Notes', 'HDBSCAN_Text_Label', 'HDBSCAN_Cat_Label']].head())


Batches: 100%|██████████| 157/157 [00:06<00:00, 24.46it/s]
  warn(
  warn(
[1.80615544e-13 4.38334655e-07 2.32858277e-07 5.76855268e-08
 1.33375463e-07 2.55785454e-07 4.69392355e-07 2.33287542e-06
 2.75068699e-06 3.96828707e-06 1.77789068e-06 1.42717889e-06]
not reaching the requested tolerance 1.6093254089355469e-06.
Use iteration 160 instead with accuracy 
7.767505721879352e-07.

  _, diffusion_map = lobpcg(
[1.69625601e-13 4.38587314e-07 2.33174881e-07 5.80628243e-08
 1.33687311e-07 2.57301163e-07 5.33090857e-07 1.68702430e-06
 1.24446951e-06 1.50467675e-06 1.00009090e-06 2.23097423e-06]
not reaching the requested tolerance 1.6093254089355469e-06.
  _, diffusion_map = lobpcg(


                                      Cleaned_Notes  HDBSCAN_Text_Label  \
0                        together range line beyond                 152   
1  language ball floor meet usually board necessary                  25   
2                 support time operation wear often                 132   
3                                  stage plant view                  -1   
4          job article level others record hospital                 109   

   HDBSCAN_Cat_Label  
0                107  
1                149  
2                107  
3                 74  
4                138  




In [38]:
#updated method 3:#model 3 :Agglomerative Clustering with Gower Distance on Combined Features (With BERT)
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# BERT Embeddings
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = bert_model.encode(df['Cleaned_Notes'].astype(str).tolist())

# Cosine Similarity from BERT
A = cosine_similarity(bert_embeddings)

# One-hot encode categorical data
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])

# Jaccard Similarity for categorical data
intersection = np.dot(cat_encoded, cat_encoded.T)
row_sums = cat_encoded.sum(axis=1)
union = row_sums[:, None] + row_sums - intersection
B = intersection / np.maximum(union, 1e-10)

# Combine similarities
alpha = 0.3
S = alpha * A + (1 - alpha) * B
distance_matrix = 1 - S

# Agglomerative Clustering
agglo = AgglomerativeClustering(n_clusters=5, metric='precomputed', linkage='average')
df['Hybrid_Agglo_Label'] = agglo.fit_predict(distance_matrix)

# View sample
print(df[['Location', 'Gender', 'Loyalty Tier', 'Cleaned_Notes', 'Hybrid_Agglo_Label']].head())


      Location  Gender Loyalty Tier  \
0     Tampines  Female     Platinum   
1      Geylang  Female     Platinum   
2     Tampines  Female     Platinum   
3   Ang Mo Kio    Male       Silver   
4  Bukit Batok  Female     Platinum   

                                      Cleaned_Notes  Hybrid_Agglo_Label  
0                        together range line beyond                   0  
1  language ball floor meet usually board necessary                   0  
2                 support time operation wear often                   0  
3                                  stage plant view                   3  
4          job article level others record hospital                   0  


In [58]:
#gower method  3
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import AgglomerativeClustering
import gower

# Step 1: Generate BERT embeddings for text
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = bert_model.encode(df['Cleaned_Notes'].astype(str).tolist())

# Step 2: One-hot encode categorical columns
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])

# Step 3: Combine BERT and categorical data
combined_features = np.hstack((cat_encoded, bert_embeddings))

# Step 4: Compute Gower distance on combined features
combined_df = pd.DataFrame(combined_features)
gower_dist = gower.gower_matrix(combined_df)

# Step 5: Perform Agglomerative Clustering
agglo = AgglomerativeClustering(n_clusters=5, metric='precomputed', linkage='average')
df['Gower_Agglo_Label'] = agglo.fit_predict(gower_dist)

# Step 6: Preview clusters
print(df[['Location', 'Gender', 'Loyalty Tier', 'Cleaned_Notes', 'Gower_Agglo_Label']].head())


      Location  Gender Loyalty Tier  \
0     Tampines  Female     Platinum   
1      Geylang  Female     Platinum   
2     Tampines  Female     Platinum   
3   Ang Mo Kio    Male       Silver   
4  Bukit Batok  Female     Platinum   

                                      Cleaned_Notes  Gower_Agglo_Label  
0                        together range line beyond                  0  
1  language ball floor meet usually board necessary                  0  
2                 support time operation wear often                  0  
3                                  stage plant view                  0  
4          job article level others record hospital                  0  


In [26]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Text
print("Model 1 - Text (Spectral Clustering)")
print("Silhouette:", silhouette_score(X_text, df['Text_Spectral_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(X_text, df['Text_Spectral_Label']))
print("Davies-Bouldin:", davies_bouldin_score(X_text, df['Text_Spectral_Label']))

# Categorical
print("Model 1 - Cat (Spectral Clustering)")
print("Silhouette:", silhouette_score(cat_encoded, df['Cat_Spectral_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(cat_encoded, df['Cat_Spectral_Label']))
print("Davies-Bouldin:", davies_bouldin_score(cat_encoded, df['Cat_Spectral_Label']))


Model 1 - Text (Spectral Clustering)
Silhouette: 0.060273815
Calinski-Harabasz: 1409.43931092896
Davies-Bouldin: 10.273685066261894
Model 1 - Cat (Spectral Clustering)
Silhouette: 0.2794308491074161
Calinski-Harabasz: 1201.2297248557607
Davies-Bouldin: 1.3863497788194767


In [23]:
# ✅ Use PCA-transformed data for categorical evaluation
print("Model 1 - Cat (Spectral Clustering)")
print("Silhouette:", silhouette_score(cat_pca, df['Cat_Spectral_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(cat_pca, df['Cat_Spectral_Label']))
print("Davies-Bouldin:", davies_bouldin_score(cat_pca, df['Cat_Spectral_Label']))


Model 1 - Cat (Spectral Clustering)
Silhouette: -0.1505217461303243
Calinski-Harabasz: 34.23525918003809
Davies-Bouldin: 7.503501089416867


In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Text
print("Model 1 - Text (Spectral Clustering)")
print("Silhouette:", silhouette_score(X_text, df['Text_Spectral_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(X_text, df['Text_Spectral_Label']))
print("Davies-Bouldin:", davies_bouldin_score(X_text, df['Text_Spectral_Label']))

# Categorical
print("Model 1 - Cat (Spectral Clustering)")
print("Silhouette:", silhouette_score(cat_encoded, df['Cat_Spectral_Label']))
print("Calinski-Harabasz:", calinski_harabasz_score(cat_encoded, df['Cat_Spectral_Label']))
print("Davies-Bouldin:", davies_bouldin_score(cat_encoded, df['Cat_Spectral_Label']))


Model 1 - Text (Spectral Clustering)
Silhouette: -0.0119842235
Calinski-Harabasz: 11.892784821413603
Davies-Bouldin: 2.75172344657728
Model 1 - Cat (Spectral Clustering)
Silhouette: -0.05330253070615256
Calinski-Harabasz: 35.36139140225387
Davies-Bouldin: 7.107905010122954


In [64]:
# Remove noise points (-1) before calculating metrics
text_mask = df['HDBSCAN_Text_Label'] != -1
cat_mask = df['HDBSCAN_Cat_Label'] != -1

print("Model 2 - Text (HDBSCAN)")
print("Silhouette:", silhouette_score(text_embed[text_mask], df.loc[text_mask, 'HDBSCAN_Text_Label']))

print("Model 2 - Cat (HDBSCAN)")
print("Silhouette:", silhouette_score(cat_embed[cat_mask], df.loc[cat_mask, 'HDBSCAN_Cat_Label']))


Model 2 - Text (HDBSCAN)
Silhouette: -0.1938553
Model 2 - Cat (HDBSCAN)
Silhouette: 0.9837479


In [59]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

# Reuse combined features (used to create similarity matrix)
# This includes BERT embeddings + one-hot categorical encoding
combined_features = np.hstack((cat_encoded, bert_embeddings))

# Get labels from Model 3
labels = df['Hybrid_Agglo_Label']

# Calinski-Harabasz Score (higher is better)
ch_score = calinski_harabasz_score(combined_features, labels)

# Davies-Bouldin Score (lower is better)
db_score = davies_bouldin_score(combined_features, labels)

print("Model 3 - Calinski-Harabasz Score:", ch_score)
print("Model 3 - Davies-Bouldin Score:", db_score)


Model 3 - Calinski-Harabasz Score: 622.1816412837292
Model 3 - Davies-Bouldin Score: 2.125900885767681


In [60]:
#UPDATED EVAL
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import AgglomerativeClustering
import gower
import numpy as np
import pandas as pd

# BERT Embeddings
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = bert_model.encode(df['Cleaned_Notes'].astype(str).tolist())

# One-hot encode categorical data
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])

# Combine features
combined_features = np.hstack((cat_encoded, bert_embeddings))

# Gower distance matrix
gower_dist = gower.gower_matrix(pd.DataFrame(combined_features))

# Agglomerative Clustering
agglo = AgglomerativeClustering(n_clusters=5, metric='precomputed', linkage='average')
labels = agglo.fit_predict(gower_dist)

# Evaluate on combined feature input
ch_score = calinski_harabasz_score(combined_features, labels)
db_score = davies_bouldin_score(combined_features, labels)

print("Model 3 - Calinski-Harabasz Score:", ch_score)
print("Model 3 - Davies-Bouldin Score:", db_score)


Model 3 - Calinski-Harabasz Score: 3.25451362325673
Model 3 - Davies-Bouldin Score: 5.999095970478388
