In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as mno
# Notebook-wide seaborn theme: "darkgrid" base style with the grid colour and
# grid line style overridden through the rc dictionary.
sns.set_style("darkgrid", {"grid.color": ".2", "grid.linestyle": ":"})

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by pandas / seaborn / scikit-learn.
# Consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

In [None]:
# Read the mall-customers CSV into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')

### Rudimentary inspection of dataset.

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Statistical summary (count, mean, std, quartiles, min/max) of the numerical columns.
df.describe()

In [None]:
# Replace the column names positionally with shorter ones.
# NOTE(review): assumes the CSV has exactly five columns in this order — confirm
# against the df.head() output above.
df.columns = ['CustomerID', 'Gender', 'Age', 'Annual Income', 'Spending Score']

In [None]:
# Missing values per column: absolute count and percentage of rows,
# both sorted from most to least affected.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending = False)
percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending = False)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

## Exploratory Data Analysis

In [None]:
# Number of customers per gender.
df.Gender.value_counts()

In [None]:
# Gender composition of the customers, drawn as a donut chart.
ge = df['Gender'].value_counts()
# px.pie is given a frame with one column of category labels and one of counts.
ge_df = pd.DataFrame({'labels': ge.index,'values': ge.values})
px.pie(ge_df, names ='labels', values='values', title='Gender Composition', hole = 0.35)

In [None]:
# Distribution of the numerical variables (pairwise scatter plots + histograms).
# CustomerID is an identifier, not a feature, so it is left out.
num_cols = [col for col in df.select_dtypes(include = np.number).columns if col != 'CustomerID']
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made here (it would only emit an extra empty figure).
# sns.axes_style replaces the matplotlib style name 'seaborn-whitegrid', which
# was deprecated in matplotlib 3.6 and later removed.
with sns.axes_style('whitegrid'):
    sns.pairplot(df[num_cols], corner = True, plot_kws = {'alpha': 0.5})

In [None]:
# Distribution of the numerical variables, split by gender.
cols = [col for col in df.columns if col != 'CustomerID']
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made here (it would only emit an extra empty figure).
# sns.axes_style replaces the matplotlib style name 'seaborn-whitegrid', which
# was deprecated in matplotlib 3.6 and later removed.
with sns.axes_style('whitegrid'):
    sns.pairplot(df[cols], hue = 'Gender' ,corner = True, plot_kws = {'alpha': 0.5}, palette = 'Set1')

In [None]:
# Number of numerical features; used as the panel count in the next cell.
len(num_cols)

In [None]:
# One boxen plot per numerical feature, side by side, each split by gender.
plt.figure(figsize = [20, 10])
n_panels = len(num_cols)
for i, column in enumerate(num_cols):
    plt.subplot(1, n_panels, i + 1)
    with plt.style.context('ggplot'):
        ax = sns.boxenplot(x = 'Gender', y = column, data = df)
        # black outline around the plotting area
        frame = ax.patch
        frame.set_edgecolor('black')
        frame.set_linewidth(1.5)
        # the feature name goes into the title instead of the y-axis label
        ax.set_ylabel('')
        ax.set_title('{}'.format(column))

In [None]:
# Quantiles of annual income used to pick sensible bin edges:
# minimum, 20th percentile, median, 75th and 95th percentiles, maximum.
# (The third entry was `0.`, which just repeated the minimum.)
L = [0, 0.2, 0.5, 0.75, 0.95, 1]
df['Annual Income'].quantile(L)

In [None]:
# Bin age and annual income into labelled groups stored as new columns of df.
# NOTE(review): pd.cut is called with its default bin closure, so the labels
# are approximate at the edges (e.g. whether age 25 lands in '<25' or '25-35'
# depends on that default) — confirm this matches the intended grouping.
df['age_group'] = pd.cut(df.Age, [0,25,35,45,55,65,200], 
                            labels = ['<25','25-35','35-45','45-55','55-65','>65'])
df['income_group'] = pd.cut(df['Annual Income'], [0,30,50,75,100,200], 
                            labels = ['low','low-medium','medium','high-medium','high'])
df.head()

In [None]:
# Number of customers per (gender, age group); CustomerID is only used as the
# column to count.
gag = df.groupby(['Gender', 'age_group']).CustomerID.count()
gag = gag.reset_index()

# Bar chart of the counts, coloured by gender, with age groups in ascending order.
fig = px.bar(gag, x='age_group', y='CustomerID', color='Gender',
             labels={'age_group':'Age Group', 'CustomerID':'Count'}, title = 'Composition of Age-groups', 
             category_orders = {'age_group': ['<25','25-35','35-45','45-55','55-65','>65']}, width = 1000)
fig.show()

In [None]:
# Average spending score per (gender, age group), drawn as grouped bars.
gag = df.groupby(['Gender', 'age_group'])['Spending Score'].mean()
gag = gag.reset_index()
fig = px.bar(gag, x='age_group', y='Spending Score', color='Gender', barmode = 'group',
             labels={'age_group':'Age Group', 'Spending Score':'Average'}, title = 'Average Spending Score', 
             category_orders = {'age_group': ['<25','25-35','35-45','45-55','55-65','>65']}, color_discrete_sequence = ['darkcyan', 'darkseagreen'], width = 1200)
fig.show()

In [None]:
# Number of customers per (gender, income group); CustomerID is only used as
# the column to count.
iag = df.groupby(['Gender', 'income_group']).CustomerID.count()
iag = iag.reset_index()

# category_orders must be keyed on the plotted column ('income_group') for the
# low -> high ordering to be applied; the title names the income groups.
fig = px.bar(iag, x='income_group', y='CustomerID', color='Gender',
             labels={'income_group':'Income Group', 'CustomerID':'Count'}, title = 'Composition of Income-groups',
             category_orders = {'income_group': ['low','low-medium','medium','high-medium','high']}, width = 1000)
fig.show()

In [None]:
# Average spending score per (gender, income group), drawn as grouped bars.
iag = df.groupby(['Gender', 'income_group'])['Spending Score'].mean()
iag = iag.reset_index()
# category_orders must be keyed on the plotted column ('income_group') for the
# low -> high ordering to be applied.
fig = px.bar(iag, x='income_group', y='Spending Score', color='Gender', barmode = 'group',
             labels={'income_group':'Income Group', 'Spending Score':'Average'}, title = 'Average Spending Score',
             category_orders = {'income_group': ['low','low-medium','medium','high-medium','high']}, color_discrete_sequence = ['darkcyan', 'darkseagreen'], width = 1200)
fig.show()

## Clustering

### Preprocessing

In [None]:
# Numerical feature columns used for clustering (the CustomerID identifier is
# excluded). Note that this rebinds `cols`, which earlier also held 'Gender'.
cols = [col for col in df.select_dtypes(include = np.number).columns if col != 'CustomerID']
cols

In [None]:
# Frame holding only the numerical features; it is rescaled in the next cell.
df_num = df[cols]
df_num.head()

In [None]:
# Rescaling: fit a StandardScaler on the numerical features and wrap the
# resulting array back into a DataFrame carrying the original column names.
scaler = StandardScaler()
df_num = pd.DataFrame(scaler.fit_transform(df_num), columns = cols)

In [None]:
# Give additional weight to Annual Income and Spending Score: the pair plots
# showed natural clusters in these two variables, so they appear to matter
# more than Age for this problem.
# Weights are Annual Income : Spending Score : Age = 2 : 2 : 1.
feature_weights = pd.Series({'Age': 1, 'Annual Income': 2, 'Spending Score': 2})
# The weighted frame is rebuilt from the fitted scaler instead of multiplying
# df_num in place, so re-running this cell cannot double the weights again.
df_num = pd.DataFrame(scaler.transform(df[cols]), columns = cols, index = df.index) * feature_weights
df_num.head()

In [None]:
# Mean and standard deviation of every (weighted) feature. String names are
# used because passing the NumPy callables to agg is deprecated in recent pandas.
df_num.agg(['mean', 'std']).round(2)

### Clustering Tendency of data

In [None]:
#Function for Hopkin's statistic 
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
from math import isnan
 
def hhopkins(X):
    """Estimate the Hopkins statistic of a data set (its clustering tendency).

    A sample of m = 10% of the rows (at least one) is drawn. For each sampled
    row the distance to its nearest *other* row is recorded (w), and for an
    equal number of points drawn uniformly from the data's bounding box the
    distance to the nearest row is recorded (u). The statistic is

        H = sum(u) / (sum(u) + sum(w))

    Values close to 1 indicate clustered data; values around 0.5 indicate
    data that looks uniformly random.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_rows, n_features)
        Numeric data; needs at least two rows.

    Returns
    -------
    float
        The Hopkins statistic, or 0 when every recorded distance is zero
        (e.g. all rows identical), matching the previous fallback value.

    Raises
    ------
    ValueError
        If X has fewer than two rows.
    """
    data = np.asarray(X.values if hasattr(X, "values") else X, dtype=float)
    n, d = data.shape  # rows, columns
    if n < 2:
        raise ValueError("hhopkins needs at least two rows, got {}".format(n))

    # int(0.1 * n) is 0 for fewer than 10 rows, which used to end in 0 / 0
    m = max(1, int(0.1 * n))
    nbrs = NearestNeighbors(n_neighbors=1).fit(data)

    rand_X = sample(range(n), m)
    lower = data.min(axis=0)
    upper = data.max(axis=0)

    ujd = []
    wjd = []
    for j in range(m):
        # A uniform probe is not part of the data set, so its nearest
        # neighbour is the first hit (index 0), not the second.
        probe = uniform(lower, upper, d).reshape(1, -1)
        u_dist, _ = nbrs.kneighbors(probe, 1, return_distance=True)
        ujd.append(u_dist[0][0])
        # A sampled row is its own nearest neighbour (distance 0), so the
        # second hit (index 1) is its nearest *other* row.
        w_dist, _ = nbrs.kneighbors(data[rand_X[j]].reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])

    denominator = sum(ujd) + sum(wjd)
    if denominator == 0 or isnan(denominator):
        # degenerate input: no spread to measure
        return 0

    return sum(ujd) / denominator

In [None]:
##Other method to compute hopkin's statistic
#from pyclustertend import hopkins ## the hopkins test
#1 - hopkins(df_num[['Annual Income', 'Spending Score']].values, df_num[['Annual Income', 'Spending Score']].shape[0])

In [None]:
# Hopkins statistic of the weighted, standardised features. The function
# samples at random, so the value varies slightly between runs.
hhopkins(df_num)

## K-Means

In [None]:
# elbow-curve: within-cluster sum of squared distances (inertia) for k = 2..8
ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    # A fixed seed and an explicit n_init keep the curve reproducible across
    # runs and scikit-learn versions.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, n_init=10, random_state=21)
    # Fit on the feature columns only, so that label columns added to df_num
    # by later cells cannot leak in when this cell is re-run.
    kmeans.fit(df_num[cols])

    ssd.append(kmeans.inertia_)

# plot the SSDs for each n_clusters
# sns.axes_style replaces the removed matplotlib style name 'seaborn-whitegrid';
# x and y are passed by keyword, which current seaborn releases require.
with sns.axes_style('whitegrid'):
    plt.figure(figsize = [12, 6])
    sns.lineplot(x = range_n_clusters, y = ssd, marker='o', markersize = 10)
    plt.xlabel('number of clusters')
    # candidate elbow positions
    plt.axvline(4, ls="--", c="red")
    plt.axvline(5, ls="--", c="red")
    plt.ylabel('ssd')

In [None]:
# silhouette analysis: average silhouette score for k = 2..8
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
sil = []
for num_clusters in range_n_clusters:

    # A fixed seed and an explicit n_init keep the scores reproducible across
    # runs and scikit-learn versions.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, n_init=10, random_state=21)
    # Feature columns only, so that label columns added to df_num by later
    # cells cannot leak in when this cell is re-run.
    kmeans.fit(df_num[cols])

    cluster_labels = kmeans.labels_

    # silhouette score
    silhouette_avg = silhouette_score(df_num[cols], cluster_labels)
    sil.append(silhouette_avg)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

In [None]:
# Silhouette score against the number of clusters.
# sns.axes_style replaces the removed matplotlib style name 'seaborn-whitegrid';
# x and y are passed by keyword, which current seaborn releases require.
with sns.axes_style('whitegrid'):
    plt.figure(figsize = [12, 6])
    sns.lineplot(x = range_n_clusters, y = sil, marker='o', markersize = 10)
    plt.xlabel('number of clusters')
    # candidate values of k
    plt.axvline(5, ls="--", c="red")
    plt.axvline(6, ls="--", c="red")
    plt.ylabel('sil_score')

Considering both the elbow curve and the silhouette analysis, let's choose k = 5 and see if the clusters make sense.

In [None]:
# Final k-means model with k = 5.
num_cols = ['Age', 'Annual Income', 'Spending Score']
# A fixed seed makes the cluster ids stable between runs; the explicit n_init
# keeps the number of restarts independent of the scikit-learn version.
kmeans = KMeans(n_clusters = 5, init='k-means++', max_iter = 500, n_init = 10, random_state = 21)
kmeans.fit(df_num[num_cols])

In [None]:
# Cluster id assigned to each row by the fitted k-means model.
kmeans.labels_

In [None]:
# Copy of the unscaled features; the cluster labels of every method are
# attached to it for interpretation in original units.
df_clusters = df[num_cols].copy()

In [None]:
# Attach the k-means labels to both frames.
# NOTE: from here on df_num carries a label column in addition to the
# features, so later model fits must select the feature columns explicitly.
df_clusters['kmeans_id'] = kmeans.labels_
df_num['kmeans_id'] = kmeans.labels_

df_clusters.head()

### Interpreting the Clusters

In [None]:
def box_plot(x, y):
    """Show a box plot of one feature per cluster.

    Reads the notebook-level ``df_clusters`` frame, draws the distribution of
    column ``y`` for every value of the label column ``x`` on the current
    figure, and displays it with ``plt.show()``.

    Parameters
    ----------
    x : str
        Name of the cluster-label column in ``df_clusters``.
    y : str
        Name of the feature column to plot; also used in the title and as the
        y-axis label.
    """
    axes = sns.boxplot(data = df_clusters, x = x, y = y)
    axes.set_title('{} distribution of clusters'.format(y), fontsize = 16, fontweight = 'bold')
    axes.set_xlabel('Cluster ID', fontsize = 14)
    axes.set_ylabel(y, fontsize = 14)
    # black outline around the plotting area
    frame = axes.patch
    frame.set_linewidth(1.5)
    frame.set_edgecolor('black')
    plt.show()

In [None]:
# Age distribution per k-means cluster (the figure is created here so that
# box_plot draws onto a 12x8 canvas).
plt.figure(figsize = [12,8])
box_plot('kmeans_id', 'Age')

In [None]:
# Annual income distribution per k-means cluster.
plt.figure(figsize = [12,8])
box_plot('kmeans_id', 'Annual Income')

In [None]:
# Spending score distribution per k-means cluster.
plt.figure(figsize = [12,8])
box_plot('kmeans_id', 'Spending Score')

In [None]:
# k-means clusters in the weighted feature space, with the centroids overlaid.
# Centroid columns follow the order of num_cols used for fitting:
# 0 = Age, 1 = Annual Income, 2 = Spending Score.
# x and y are passed by keyword, which current seaborn releases require.
plt.figure(figsize = [18, 8])

plt.subplot(121)
ax1 = sns.scatterplot(data = df_num, x = 'Annual Income', y = 'Spending Score', hue = 'kmeans_id', palette = 'tab10', s = 60)
sns.scatterplot(x = kmeans.cluster_centers_[:,1], y = kmeans.cluster_centers_[:,2], ax = ax1, color = 'midnightblue', marker = 'X', s = 180)
ax1.patch.set_edgecolor('black')
ax1.patch.set_linewidth(1.5)

plt.subplot(122)
ax2 = sns.scatterplot(data = df_num, x = 'Age', y = 'Annual Income', hue = 'kmeans_id', palette = 'tab10', s = 60)
sns.scatterplot(x = kmeans.cluster_centers_[:,0], y = kmeans.cluster_centers_[:,1], ax = ax2, color = 'midnightblue', marker = 'X', s = 180)
ax2.patch.set_edgecolor('black')
ax2.patch.set_linewidth(1.5)

In [None]:
# Store the labels as strings so the 3-D plot below can match them against the
# string values listed in category_orders.
df_clusters['kmeans_id'] = df_clusters['kmeans_id'].astype('str')

In [None]:
# Visualise all three variables at once, coloured by k-means cluster.
# NOTE(review): kmeans_id is a string column here, so plotly presumably treats
# it as categorical and color_continuous_scale likely has no effect — confirm.
fig = px.scatter_3d(df_clusters, x = 'Age', y = 'Annual Income', z = 'Spending Score',
              color='kmeans_id', color_continuous_scale = 'hsv', category_orders = {'kmeans_id': ['0', '1', '2', '3', '4']})
fig.update_layout(legend=dict(orientation="h"))
fig.show()

## Hierarchical Clustering

In [None]:
# Agglomerative clustering using complete linkage.
plt.figure(figsize = (16, 9))

# Cluster on the feature columns only: by this point df_num also carries the
# 'kmeans_id' label column, which must not enter the distance computation
# (otherwise the hierarchy is biased towards reproducing the k-means result).
Z = linkage(df_num[cols], method="complete", metric='euclidean')
dendrogram(Z)
# Guide line at the visually chosen cut height; re-check it against the
# dendrogram whenever the features or weights change.
plt.axhline(6, ls="--", c = "r")
plt.title('Dendrogram', fontsize = 16, fontweight = 'bold')
plt.show()



> If we cut the dendrogram at y = 6, then we will get 5 clusters.



In [None]:
# 5 clusters: ask cut_tree for the cluster count directly instead of a cut
# height, because the cells below assume exactly five labels (0..4).
hc_labels = cut_tree(Z, n_clusters = 5).reshape(-1, )
hc_labels

In [None]:
# Attach the hierarchical-clustering labels to both frames.
df_clusters['hc_id'] = hc_labels
df_num['hc_id'] = hc_labels

df_clusters.head()

### Interpreting the Clusters

In [None]:
# Age distribution per hierarchical cluster.
plt.figure(figsize = [12,8])
box_plot('hc_id', 'Age')

In [None]:
# Annual income distribution per hierarchical cluster.
plt.figure(figsize = [12,8])
box_plot('hc_id', 'Annual Income')

In [None]:
# Spending score distribution per hierarchical cluster.
plt.figure(figsize = [12,8])
box_plot('hc_id', 'Spending Score')

In [None]:
# Centroid of every hierarchical cluster: the mean of the weighted features of
# its members. groupby returns one row per label in ascending label order, so
# this works for any number of clusters, not only five.
hc_centroids = df_num.groupby('hc_id')[num_cols].mean().to_numpy()
hc_centroids

In [None]:
# Hierarchical clusters in the weighted feature space, with centroids overlaid.
# Centroid columns follow num_cols: 0 = Age, 1 = Annual Income, 2 = Spending Score.
# x and y are passed by keyword, which current seaborn releases require.
plt.figure(figsize = [18, 8])

plt.subplot(121)
ax1 = sns.scatterplot(data = df_num, x = 'Annual Income', y = 'Spending Score', hue = 'hc_id', palette = 'tab10', s = 60)
sns.scatterplot(x = hc_centroids[:,1], y = hc_centroids[:,2], ax = ax1, color = 'midnightblue', marker = 'X', s = 180)
ax1.patch.set_edgecolor('black')
ax1.patch.set_linewidth(1.5)

plt.subplot(122)
ax2 = sns.scatterplot(data = df_num, x = 'Age', y = 'Annual Income', hue = 'hc_id', palette = 'tab10', s = 60)
sns.scatterplot(x = hc_centroids[:,0], y = hc_centroids[:,1], ax = ax2, color = 'midnightblue', marker = 'X', s = 180)
ax2.patch.set_edgecolor('black')
ax2.patch.set_linewidth(1.5)



> We have obtained almost similar clusters with agglomerative hierarchical clustering.



In [None]:
# Store the labels as strings so the 3-D plot below can match them against the
# string values listed in category_orders.
df_clusters['hc_id'] = df_clusters['hc_id'].astype('str')

In [None]:
# Visualise all three variables at once, coloured by hierarchical cluster.
# NOTE(review): hc_id is a string column here, so plotly presumably treats it
# as categorical and color_continuous_scale likely has no effect — confirm.
fig = px.scatter_3d(df_clusters, x = 'Age', y = 'Annual Income', z = 'Spending Score',
              color='hc_id', color_continuous_scale = 'hsv', category_orders = {'hc_id': ['0', '1', '2', '3', '4']})
fig.update_layout(legend=dict(orientation="h"))
fig.show()

## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
# Density-based clustering on the weighted feature columns only (label columns
# in df_num are excluded through `cols`).
# NOTE(review): eps and min_samples are hard-coded; how they were chosen is not
# shown in this notebook.
dbscan = DBSCAN(eps = 0.85, min_samples = 3)
dbscan.fit(df_num[cols])

In [None]:
# Cluster id per row (scikit-learn marks noise points with the label -1).
dbscan.labels_

In [None]:
# Attach the DBSCAN labels to both frames.
df_clusters['dbscan_id'] = dbscan.labels_
df_num['dbscan_id'] = dbscan.labels_

df_clusters.head()

In [None]:
# DBSCAN clusters in the weighted feature space. No centroids are drawn here,
# unlike the k-means, hierarchical and GMM plots.
plt.figure(figsize = [18, 8])

# left panel: Annual Income vs Spending Score
plt.subplot(121)
ax1 = sns.scatterplot(data = df_num, x = 'Annual Income', y = 'Spending Score', hue = 'dbscan_id', palette = 'tab10', s = 60)
ax1.patch.set_edgecolor('black')
ax1.patch.set_linewidth(1.5)

# right panel: Age vs Annual Income
plt.subplot(122)
ax2 = sns.scatterplot(data = df_num, x = 'Age', y = 'Annual Income', hue = 'dbscan_id', palette = 'tab10', s = 60)
ax2.patch.set_edgecolor('black')
ax2.patch.set_linewidth(1.5)

## GMM

In [None]:
from sklearn.mixture import GaussianMixture
# Five-component Gaussian mixture with full covariance matrices, fitted on the
# weighted feature columns only; random_state fixes the initialisation.
gmm = GaussianMixture(n_components=5, covariance_type = 'full', max_iter = 1000, random_state=21)
gmm_labels = gmm.fit_predict(df_num[cols])

In [None]:
# Attach the GMM component labels to both frames.
df_clusters['gmm_id'] = gmm_labels
df_num['gmm_id'] = gmm_labels

df_clusters.head()

In [None]:
# GMM components in the weighted feature space, with the component means overlaid.
# Mean columns follow the order of `cols` used for fitting:
# 0 = Age, 1 = Annual Income, 2 = Spending Score.
# x and y are passed by keyword, which current seaborn releases require.
plt.figure(figsize = [18, 8])

plt.subplot(121)
ax1 = sns.scatterplot(data = df_num, x = 'Annual Income', y = 'Spending Score', hue = 'gmm_id', palette = 'tab10', s = 60)
sns.scatterplot(x = gmm.means_[:,1], y = gmm.means_[:,2], ax = ax1, color = 'midnightblue', marker = 'X', s = 180)
ax1.patch.set_edgecolor('black')
ax1.patch.set_linewidth(1.5)

plt.subplot(122)
ax2 = sns.scatterplot(data = df_num, x = 'Age', y = 'Annual Income', hue = 'gmm_id', palette = 'tab10', s = 60)
sns.scatterplot(x = gmm.means_[:,0], y = gmm.means_[:,1], ax = ax2, color = 'midnightblue', marker = 'X', s = 180)
ax2.patch.set_edgecolor('black')
ax2.patch.set_linewidth(1.5)

In [None]:
# Store the labels as strings so the 3-D plot below can match them against the
# string values listed in category_orders.
df_clusters['gmm_id'] = df_clusters['gmm_id'].astype('str')

In [None]:
# Visualise all three variables at once, coloured by GMM component.
# NOTE(review): gmm_id is a string column here, so plotly presumably treats it
# as categorical and color_continuous_scale likely has no effect — confirm.
fig = px.scatter_3d(df_clusters, x = 'Age', y = 'Annual Income', z = 'Spending Score',
              color='gmm_id', color_continuous_scale = 'hsv', category_orders = {'gmm_id': ['0', '1', '2', '3', '4']})
fig.update_layout(legend=dict(orientation="h"))
fig.show()