Inital imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import plotly
import plotly.graph_objects as go
import plotly.express as px
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, normalize

plotly.offline.init_notebook_mode(connected=True)

Load data

In [2]:
cc_df = pd.read_csv(os.path.join('data', 'CC GENERAL.csv'))
cc_df.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


Clean-up data

In [3]:
sanitized_df = cc_df.drop(columns=['CUST_ID']).fillna(0)  # drop customer id as it is unlikely to contain any useful data and fill missing values
sanitized_df['MINIMUM_PAYMENTS'] = sanitized_df['MINIMUM_PAYMENTS'].fillna(0)
sanitized_df['CREDIT_LIMIT'] = sanitized_df['CREDIT_LIMIT'].median()

scaler = StandardScaler()
scaled_df = scaler.fit_transform(sanitized_df)

Determine how many clusters are needed

In [7]:
k_summary = {}
k_values = []
silhouette_values = []

for k in range(2, 12):  # check between 2 and 12 clusters
    k_means_fit = KMeans(n_clusters=k, max_iter=300)
    k_means_fit.fit(scaled_df)
    k_values.append(k)
    silhouette_values.append(silhouette_score(scaled_df, k_means_fit.labels_, metric='euclidean'))

k_summary['k-values'] = k_values
k_summary['silhouette-values'] = silhouette_values
pd.DataFrame(k_summary)

Unnamed: 0,k-values,silhouette-values
0,2,0.221767
1,3,0.199824
2,4,0.211266
3,5,0.205883
4,6,0.219866
5,7,0.225589
6,8,0.24215
7,9,0.236122
8,10,0.240051
9,11,0.236136


The silhouette values show that we should consider 8 or 10 clusters. As a rule of thumb we will take the second highest silhouette value which would give us 10 clusters. We can now compare this result to the elbow method below.

In [8]:
num_of_clusters = range(1, 12)
k_means = [KMeans(n_clusters=cluster).fit(scaled_df) for cluster in num_of_clusters]
centroids = [k.cluster_centers_ for k in k_means]
distances = [cdist(scaled_df, centroid, 'euclidean') for centroid in centroids]
min_distances = [np.min(distance, axis=1) for distance in distances]
avg_within_cluster_sum_squares = [sum(distance) / scaled_df.shape[0] for distance in min_distances]
elbow_df = pd.DataFrame({'number_of_clusters': num_of_clusters, 'avg_within_cluster_sum_squares': avg_within_cluster_sum_squares})

fig = go.Figure(data=go.Scatter(x=elbow_df['number_of_clusters'], y=elbow_df['avg_within_cluster_sum_squares']), 
                layout=go.Layout(title='Average Within Cluster Sum of Squares', plot_bgcolor='rgba(0,0,0,0)', xaxis=dict(title='K means', spikedash='dash'), yaxis=dict(title='Sum of Squares', spikedash='dash')))
fig.show()

We can see an elbow point at 10 clusters where the slope levels off. We could also look at the percentage of variation explained instead of the sum of squares. This should give us the same answer.

In [9]:
total_sum_squares = sum(pdist(scaled_df)**2) / scaled_df.shape[0]
within_cluster_sum_squares = [sum(distance**2) for distance in min_distances]
between_cluster_sum_squares = total_sum_squares - within_cluster_sum_squares
var_explained_df = pd.DataFrame({'num_of_clusters': num_of_clusters, 'percent_var_explained': between_cluster_sum_squares / total_sum_squares * 100})

fig = go.Figure(data=go.Scatter(x=var_explained_df['num_of_clusters'], y=var_explained_df['percent_var_explained']), 
                layout=go.Layout(title='Percentage of Variance Explained', plot_bgcolor='rgba(0,0,0,0)', xaxis=dict(title='K means', spikedash='dash'), yaxis=dict(title='Percentage of Variance Explained', spikedash='dash')))
fig.show()