In [1]:
# Load & Inspect Data
import pandas as pd

df = pd.read_csv("../data/Mall_Customers.csv")
print(df.head())
print(df.info())
print(df.describe())



   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
None
       CustomerID         Age  Annual Income (k$)  

Step 2: Preprocess

Encode Gender → numeric.

Standardize features → clustering is distance-based.

In [None]:
## since the data has no missing values, we can proceed to preprocessing
# Preprocess Data
from sklearn.preprocessing import LabelEncoder, StandardScaler

df["Gender"] = LabelEncoder().fit_transform(df["Gender"])  # Male=1, Female=0

X = df.drop("CustomerID", axis=1)  # keep only useful cols

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [20]:
# Step 3: Decide the Number of Clusters
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import numpy as np

results = []
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    labels = km.labels_
    sil = silhouette_score(X_scaled, labels)
    ch  = calinski_harabasz_score(X_scaled, labels)
    db  = davies_bouldin_score(X_scaled, labels)
    results.append([k, sil, ch, db])

results_df = pd.DataFrame(results, columns=["k","silhouette","calinski_harabasz","davies_bouldin"])
print(results_df)




    k  silhouette  calinski_harabasz  davies_bouldin
0   2    0.251815          71.020516        1.613751
1   3    0.259513          66.772770        1.357408
2   4    0.298397          69.125800        1.280654
3   5    0.304060          68.964568        1.167230
4   6    0.331074          73.496235        1.017667
5   7    0.357377          76.778271        0.980158
6   8    0.387993          82.422775        0.944530
7   9    0.403092          85.746762        0.874952
8  10    0.420764          89.978196        0.833103




In [21]:
# Pick the k with high silhouette, high Calinski, low Davies–Bouldin.
k = 8

In [22]:
# Step 4: Check Stability

from sklearn.metrics import adjusted_rand_score
import itertools

def stability_at_k(X, k, seeds=[1,7,42,99,123]):
    labels_list = []
    for rs in seeds:
        km = KMeans(n_clusters=k, n_init=10, random_state=rs).fit(X)
        labels_list.append(km.labels_)
    pairs = itertools.combinations(range(len(labels_list)), 2)
    aris = [adjusted_rand_score(labels_list[i], labels_list[j]) for i,j in pairs]
    return np.mean(aris), np.std(aris)

mean_ari, std_ari = stability_at_k(X_scaled, k=8)
print("Stability ARI:", mean_ari, "±", std_ari)

Stability ARI: 0.988156870330436 ± 0.007427759220200706




In [23]:
# Step 5: Fit Final Model
kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
df["Cluster"] = kmeans.fit_predict(X_scaled)



In [33]:
# Step 6: see cluster distribution not just means
summary = df.groupby("Cluster").agg({
    "Gender": ["mean","count"],
    "Age": ["mean","median","std"],
    "Annual Income (k$)": ["mean","median","std"],
    "Spending Score (1-100)": ["mean","median","std"]
})
summary


Unnamed: 0_level_0,Gender,Gender,Age,Age,Age,Annual Income (k$),Annual Income (k$),Annual Income (k$),Spending Score (1-100),Spending Score (1-100),Spending Score (1-100)
Unnamed: 0_level_1,mean,count,mean,median,std,mean,median,std,mean,median,std
Cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
0,1.0,18,33.277778,32.0,4.389902,87.111111,82.5,18.929867,82.666667,85.5,11.103788
1,0.0,37,50.918919,49.0,8.917456,47.189189,48.0,13.844048,40.756757,45.0,15.400948
2,1.0,19,38.473684,40.0,12.001706,85.894737,81.0,16.110365,14.210526,13.0,9.801241
3,1.0,24,25.25,24.0,6.860409,41.25,44.0,17.192643,60.916667,57.5,15.838634
4,0.0,34,26.0,24.0,5.342795,39.529412,39.5,17.052922,59.5,54.5,20.136278
5,0.0,22,32.545455,32.0,3.432718,85.272727,78.5,14.286796,80.590909,81.0,9.158182
6,1.0,27,58.037037,59.0,8.942042,47.62963,49.0,15.279821,38.851852,46.0,18.129245
7,0.0,19,41.210526,41.0,9.925159,87.894737,79.0,17.045347,24.578947,24.0,10.631796


In [None]:
# Step 7: extract prototypes
# Find closest sample to each centroid (in scaled space)

dists = kmeans.transform(X_scaled)
prototype_idx = dists.argmin(axis=0)  # one per cluster
prototypes = df.iloc[prototype_idx]
prototypes


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),Cluster
179,180,1,35,93,90,0
54,55,0,50,43,45,1
170,171,1,40,87,13,2
68,69,1,19,48,59,3
45,46,0,24,39,65,4
175,176,0,30,88,86,5
74,75,1,59,54,47,6
168,169,0,36,87,27,7


In [37]:
# Step 7: Name Clusters
Q = df[["Annual Income (k$)", "Spending Score (1-100)"]].quantile([0.25,0.75])

def label_cluster(row):
    inc, spend = row["Annual Income (k$)"], row["Spending Score (1-100)"]
    if inc >= Q["Annual Income (k$)"][0.75] and spend >= Q["Spending Score (1-100)"][0.75]:
        return "High-Value Shoppers"
    if inc >= Q["Annual Income (k$)"][0.75] and spend <= Q["Spending Score (1-100)"][0.25]:
        return "Under-engaged High-Income"
    if inc <= Q["Annual Income (k$)"][0.25] and spend >= Q["Spending Score (1-100)"][0.75]:
        return "Value Seekers"
    return "General Segment"

df["Cluster_Label"] = df.apply(label_cluster, axis=1)
df


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),Cluster,Cluster_Label
0,1,1,19,15,39,3,General Segment
1,2,1,21,15,81,3,Value Seekers
2,3,0,20,16,6,4,General Segment
3,4,0,23,16,77,4,Value Seekers
4,5,0,31,17,40,4,General Segment
...,...,...,...,...,...,...,...
195,196,0,35,120,79,5,High-Value Shoppers
196,197,0,45,126,28,7,Under-engaged High-Income
197,198,1,32,126,74,0,High-Value Shoppers
198,199,1,32,137,18,2,Under-engaged High-Income


In [38]:
# export results
df.to_csv("clustered_customers.csv", index=False)