In [1]:
# pip install --upgrade scikit-learn

In [2]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, Birch
from sklearn.metrics import jaccard_score, adjusted_rand_score, calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
print('Import successful')

Import successful


In [3]:
# Load data
df = pd.read_csv('Mall_Customers.csv')
# Fixing the 'Gender' column name
df.rename(columns={'Genre': 'Gender'}, inplace=True)

In [4]:
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [5]:
# Change from Male and Female to 0-1 
df.Gender = [1 if each == "Female" else 0 for each in df.Gender ]

In [6]:
# Extract features for clustering (ignoring 'CustomerID' and 'Gender' for now)
features = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

In [7]:
# Standardize the data
# A preprocessing step that transforms the features of a dataset to have a mean of 0 and a standard deviation of 1. 
# This is important in machine learning, especially for algorithms that rely on distance metrics or gradient-based optimization.
# It ensures that all features contribute equally to the analysis. It is particularly useful when features have different units or scales. 
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

In [8]:
# Apply PCA for dimensionality reduction
# PCA is a dimensionality reduction technique that aims to transform the original features of a dataset into a new set of 
# uncorrelated features, called principal components. These principal components capture the maximum variance in the data.
# PCA is often used for reducing the number of features in a dataset while retaining as much of the original variability as possible. 
# Especially useful when dealing with high-dimensional data or when there is multicollinearity among features.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_data)
# 'n_components' specifies the number of principal components to retain. 

In [9]:
# Apply clustering algorithms
kmeans = KMeans(n_clusters=2, random_state=42)
agglomerative = AgglomerativeClustering(n_clusters=2)
dbscan = DBSCAN(eps=0.5, min_samples=5)
birch = Birch(n_clusters=2)

In [10]:
# Fit the models
kmeans_labels = kmeans.fit_predict(scaled_data)
agglomerative_labels = agglomerative.fit_predict(scaled_data)
dbscan_labels = dbscan.fit_predict(scaled_data)
birch_labels = birch.fit_predict(scaled_data)

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# Evaluate the clustering results using metrics
silhouette = [silhouette_score(scaled_data, predicted_labels) for predicted_labels in [kmeans_labels, agglomerative_labels, dbscan_labels, birch_labels]]
davies_bouldin = [davies_bouldin_score(scaled_data, predicted_labels) for predicted_labels in [kmeans_labels, agglomerative_labels, dbscan_labels, birch_labels]]

In [13]:
# Print or store the results
print("Clustering Methods    : kmeans_labels, agglomerative_lables, dbscan_labels, birch_labels")
print("Silhouette Scores     :", silhouette)
print("Davies Bouldin Indices:", davies_bouldin)

Clustering Methods    : kmeans_labels, agglomerative_lables, dbscan_labels, birch_labels
Silhouette Scores     : [0.25418996301027574, 0.2511952763244898, -0.128701372094438, 0.1940733413172533]
Davies Bouldin Indices: [1.5412318706224073, 1.5635131897716499, 1.5094575126273528, 1.8460238296285925]


In [None]:
# Silhouette scores 
# It indicate the overall quality of clustering, with higher values being desirable.
# A silhouette score ranges from -1 to 1.

In [15]:
# Davies Bouldin Index
# Davies Bouldin Index assesses the compactness and separation of clusters, with lower values being preferable.
# A lower score indicates better clustering.