In [1]:
import pandas as pd

# Read the customer workbook into the working DataFrame
df = pd.read_excel(io='sample_customer_database_5000_singapore.xlsx')


In [2]:
# Quick peek at the top of the table (first 5 rows)
print(df.head(n=5))

  Customer ID        Full Name                Email Address  Phone Number  \
0       C0001     Norma Fisher          ysullivan@yahoo.com      82421948   
1       C0002      Levi Durham            qgrimes@gmail.com      97535139   
2       C0003   Kimberly Olsen  sean96@johnston-roberts.com      71122018   
3       C0004   Matthew Davies    nguyendarrell@hotmail.com      41352560   
4       C0005  Angela Martinez    myersmitchell@johnson.com        869141   

  Date Joined     Location  Gender Loyalty Tier  \
0  2023-08-11     Tampines  Female     Platinum   
1  2022-11-24      Geylang  Female     Platinum   
2  2023-06-19     Tampines  Female     Platinum   
3  2025-04-04   Ang Mo Kio    Male       Silver   
4  2025-01-15  Bukit Batok  Female     Platinum   

                                               Notes  
0                        Together range line beyond.  
1  Language ball floor meet usually board necessary.  
2                 Support time operation wear often.  
3         

In [3]:
import numpy
import pandas

# Report the library versions this notebook is running against
print(f"NumPy version: {numpy.__version__}")
print(f"Pandas version: {pandas.__version__}")


NumPy version: 1.26.4
Pandas version: 2.2.3


In [4]:
# Dataset dimensions as a (rows, columns) tuple
print(f"Shape of dataset: {df.shape}")

Shape of dataset: (5000, 9)


In [5]:
# Columns and data types.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Customer ID    5000 non-null   object        
 1   Full Name      5000 non-null   object        
 2   Email Address  5000 non-null   object        
 3   Phone Number   5000 non-null   int64         
 4   Date Joined    5000 non-null   datetime64[ns]
 5   Location       5000 non-null   object        
 6   Gender         5000 non-null   object        
 7   Loyalty Tier   5000 non-null   object        
 8   Notes          5000 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 351.7+ KB
None


In [6]:
# Check for missing values.
# sep="\n" puts the per-column counts on their own lines; with the default
# separator a space follows the newline and pushes the first row out of
# alignment with the rest.
print("Missing values per column:", df.isnull().sum(), sep="\n")

Missing values per column:
 Customer ID      0
Full Name        0
Email Address    0
Phone Number     0
Date Joined      0
Location         0
Gender           0
Loyalty Tier     0
Notes            0
dtype: int64


In [7]:
# Count rows that exactly repeat an earlier row
print(f"Number of duplicate rows: {df.duplicated().sum()}")

Number of duplicate rows: 0


In [8]:
# Unique values per column.
# sep="\n" puts the per-column counts on their own lines; with the default
# separator a space follows the newline and pushes the first row out of
# alignment with the rest.
print("Unique values per column:", df.nunique(), sep="\n")

Unique values per column:
 Customer ID      5000
Full Name        4835
Email Address    4983
Phone Number     4998
Date Joined      1084
Location           27
Gender              2
Loyalty Tier        3
Notes            5000
dtype: int64


In [9]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt_tab' resource before tokenizing anything
nltk.download('punkt_tab')

# Sanity check: tokenize one short sentence and show the resulting pieces
tokens = word_tokenize(text="This is a test sentence.")
print(tokens)


['This', 'is', 'a', 'test', 'sentence', '.']


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources: 'punkt_tab' and 'stopwords'.
# The second call is the last expression of the cell, so its return value is
# what the notebook echoes as the cell result.
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# English stopword list from NLTK, held as a set for membership tests
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize a free-text note before vectorization.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, the result is tokenized with
    ``word_tokenize`` and tokens found in ``stop_words`` are dropped.

    Args:
        text: The raw note as a string.

    Returns:
        The remaining tokens joined by single spaces (may be an empty string).
    """
    # Lowercase
    text = text.lower()
    # Replace (rather than delete) punctuation, digits and other symbols with a
    # space: deleting them fuses the words on either side ("well-known" ->
    # "wellknown", "don't" -> "dont"). The tokenizer collapses the extra
    # whitespace this introduces.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Join back into string
    return ' '.join(filtered_tokens)

# Build the cleaned-text column from Notes (values are cast to str first)
df['Cleaned_Notes'] = df['Notes'].astype(str).map(preprocess_text)

# Show raw and cleaned notes side by side for the first rows
print(df.loc[:, ['Notes', 'Cleaned_Notes']].head())


                                               Notes  \
0                        Together range line beyond.   
1  Language ball floor meet usually board necessary.   
2                 Support time operation wear often.   
3                                  Stage plant view.   
4          Job article level others record hospital.   

                                      Cleaned_Notes  
0                        together range line beyond  
1  language ball floor meet usually board necessary  
2                 support time operation wear often  
3                                  stage plant view  
4          job article level others record hospital  


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer with default settings
tfidf = TfidfVectorizer()

# Learn the vocabulary from the cleaned notes and vectorize them in one call
tfidf_matrix = tfidf.fit_transform(raw_documents=df['Cleaned_Notes'])

# Report the matrix dimensions (rows = customers, columns = tokens)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


TF-IDF matrix shape: (5000, 869)


In [13]:
import pandas as pd

# Vocabulary terms reported by the fitted vectorizer, used as column labels
feature_names = tfidf.get_feature_names_out()

# Densify the sparse matrix so it can be inspected as an ordinary DataFrame
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)

# Show the first rows
print(tfidf_df.head())


   ability  able  accept  according  account  across  act  action  activity  \
0      0.0   0.0     0.0        0.0      0.0     0.0  0.0     0.0       0.0   
1      0.0   0.0     0.0        0.0      0.0     0.0  0.0     0.0       0.0   
2      0.0   0.0     0.0        0.0      0.0     0.0  0.0     0.0       0.0   
3      0.0   0.0     0.0        0.0      0.0     0.0  0.0     0.0       0.0   
4      0.0   0.0     0.0        0.0      0.0     0.0  0.0     0.0       0.0   

   actually  ...  would  write  writer  wrong  yard  yeah  year  yes  yet  \
0       0.0  ...    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0  0.0   
1       0.0  ...    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0  0.0   
2       0.0  ...    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0  0.0   
3       0.0  ...    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0  0.0   
4       0.0  ...    0.0    0.0     0.0    0.0   0.0   0.0   0.0  0.0  0.0   

   young  
0    0.0  
1    0.0  
2    0.0  
3    0.0  
4    0.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
import pandas as pd

# --- Vectorize: fit a fresh TF-IDF model on the cleaned notes ---
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=df['Cleaned_Notes'])

# --- Cluster: spectral clustering with a nearest-neighbours affinity ---
num_clusters = 5  # number of clusters to extract; tune as required
spectral = SpectralClustering(
    random_state=42,
    assign_labels='kmeans',
    affinity='nearest_neighbors',  # 'rbf' is the alternative noted by the author
    n_clusters=num_clusters,
)

# The sparse TF-IDF matrix is densified before being handed to the estimator
labels = spectral.fit_predict(X=tfidf_matrix.toarray())

# --- Store: attach each note's cluster id to the DataFrame ---
df['Text_Cluster_Label'] = labels

# --- Inspect: first rows of cleaned text with their assigned cluster ---
print(df.loc[:, ['Cleaned_Notes', 'Text_Cluster_Label']].head())


                                      Cleaned_Notes  Text_Cluster_Label
0                        together range line beyond                   0
1  language ball floor meet usually board necessary                   3
2                 support time operation wear often                   4
3                                  stage plant view                   4
4          job article level others record hospital                   4


In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import SpectralClustering

# Columns treated as categorical features
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']

# One-hot encode them; the encoder's sparse setting is left at its default
encoder = OneHotEncoder(handle_unknown='ignore')
cat_encoded_sparse = encoder.fit_transform(df[categorical_cols])

# Densify only when the encoder actually returned something with .toarray()
if hasattr(cat_encoded_sparse, "toarray"):
    cat_encoded = cat_encoded_sparse.toarray()
else:
    cat_encoded = cat_encoded_sparse

# Spectral clustering over the encoded categorical features
spectral = SpectralClustering(
    random_state=42,
    assign_labels='kmeans',
    affinity='nearest_neighbors',
    n_clusters=5,
)

labels = spectral.fit_predict(X=cat_encoded)

# Store the cluster id of every row
df['Categorical_Cluster_Label'] = labels

# Show the categorical columns next to their assigned cluster
print(df[categorical_cols + ['Categorical_Cluster_Label']].head())


      Location  Gender Loyalty Tier  Categorical_Cluster_Label
0     Tampines  Female     Platinum                          4
1      Geylang  Female     Platinum                          4
2     Tampines  Female     Platinum                          4
3   Ang Mo Kio    Male       Silver                          4
4  Bukit Batok  Female     Platinum                          4




from sklearn.cluster import SpectralClustering
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only rows whose cleaned note is non-empty. The explicit .copy() makes the
# filtered result an independent DataFrame, so the column assignment further
# down does not write into a slice of the previous frame (the pattern pandas
# flags with SettingWithCopyWarning).
df = df[df['Cleaned_Notes'].str.strip() != ''].copy()

# Refit TF-IDF so the matrix rows line up with the filtered DataFrame
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Cleaned_Notes'])

spectral = SpectralClustering(
    n_clusters=5,
    affinity='nearest_neighbors',
    assign_labels='kmeans',
    random_state=42
)
# The sparse TF-IDF matrix is densified before clustering
labels = spectral.fit_predict(tfidf_matrix.toarray())

# Assign cluster labels to the dataframe (one label per remaining row)
df['Text_Cluster_Label'] = labels

# Preview the clustering output
print(df[['Cleaned_Notes', 'Text_Cluster_Label']].head())


In [16]:
import hdbscan
import numpy


In [17]:
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Density-based clusterer using Euclidean distance and a minimum cluster size of 5
clusterer = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=5)

# Cluster the TF-IDF matrix left in scope by an earlier cell
labels = clusterer.fit_predict(tfidf_matrix)

# Record each note's cluster id in the DataFrame
df['HDBSCAN_Text_Label'] = labels
# Inspect the first ten assignments
print(df.loc[:, ['Cleaned_Notes', 'HDBSCAN_Text_Label']].head(n=10))




                                      Cleaned_Notes  HDBSCAN_Text_Label
0                        together range line beyond                  -1
1  language ball floor meet usually board necessary                  -1
2                 support time operation wear often                  -1
3                                  stage plant view                  -1
4          job article level others record hospital                  -1
5                              animal exactly drive                   0
6               sign remember close ask reduce land                  -1
7                                     part cup read                  -1
8      republican total policy head mrs debate onto                  -1
9           rock structure federal board night loss                  -1


In [18]:
from sklearn.preprocessing import OneHotEncoder

import hdbscan

# Columns treated as categorical features
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']

# One-hot encode them, keeping the result sparse (sparse_output=True)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
categorical_matrix = encoder.fit_transform(df[categorical_cols])

# Fit HDBSCAN on the encoded matrix and collect one label per row
hdbscan_cat = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=5)
cat_labels = hdbscan_cat.fit_predict(categorical_matrix)

# Store the labels alongside the source columns
df['HDBSCAN_Categorical_Label'] = cat_labels

print(df[categorical_cols + ['HDBSCAN_Categorical_Label']].head(n=10))




      Location  Gender Loyalty Tier  HDBSCAN_Categorical_Label
0     Tampines  Female     Platinum                         23
1      Geylang  Female     Platinum                         24
2     Tampines  Female     Platinum                         23
3   Ang Mo Kio    Male       Silver                         29
4  Bukit Batok  Female     Platinum                         25
5     Tampines    Male       Silver                          6
6  Bukit Batok  Female       Silver                         26
7        Bedok  Female       Silver                          4
8    Woodlands  Female       Silver                          5
9    Pasir Ris    Male       Silver                         55


In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering
import gower
import numpy as np



# Step 1: Draw the working sample
# NOTE(review): no visible cell defines `df_sample`, so this cell raised
# NameError on a fresh kernel. The sample size and seed below are assumptions;
# confirm them against the original analysis. Working on a sample keeps the
# pairwise distance matrix built in Step 6 small.
sample_size = min(1000, len(df))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Step 2: Encode categorical columns
categorical_cols = ['Location', 'Gender', 'Loyalty Tier']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df_sample[categorical_cols])

# Step 3: TF-IDF for text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_sample['Cleaned_Notes'].astype(str))

# Step 4: Reduce TF-IDF dimensions
svd = TruncatedSVD(n_components=10, random_state=42)
text_reduced = svd.fit_transform(tfidf_matrix)

# Step 5: Combine categorical and text features
combined_features = np.hstack([cat_encoded, text_reduced])

# Step 6: Compute Gower distance
# NOTE(review): the input here is entirely numeric (one-hot columns plus SVD
# components), so gower presumably treats every column as numeric. Passing the
# raw categorical columns instead may be closer to the intent; confirm.
gower_dist = gower.gower_matrix(pd.DataFrame(combined_features))

# Step 7: Agglomerative clustering on the precomputed distance matrix
agglo = AgglomerativeClustering(n_clusters=5, linkage='average', metric='precomputed')
labels = agglo.fit_predict(gower_dist)

# Step 8: Add to DataFrame
df_sample['Gower_Agglo_Label'] = labels

# Step 9: Preview
print(df_sample[['Location', 'Gender', 'Loyalty Tier', 'Cleaned_Notes', 'Gower_Agglo_Label']].head())


        Location Gender Loyalty Tier                          Cleaned_Notes  \
0      Serangoon   Male       Silver                            outside low   
1        Kallang   Male         Gold  morning region industry term director   
2  Choa Chu Kang   Male       Silver  degree executive attention argue hold   
3     Queenstown   Male         Gold                    standard race least   
4   Central Area   Male     Platinum                     put bag seven stay   

   Gower_Agglo_Label  
0                  1  
1                  2  
2                  1  
3                  2  
4                  4  
