In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings(action="ignore")

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


## Customers Segmentation based on their Credit Card usage behaviour

Dataset for this notebook consists of credit card usage behavior of customers with 18 behavioral features. Segmentation of customers can be used to define marketing strategies.


**Content of this Kernel:**

* Data Preprocessing
* Clustering using KMeans
* Interpretation of Clusters
* Visualization of Clusters using PCA

In [22]:
df= pd.read_csv("CC GENERAL.csv")
data=df
print(df.shape)
data.head()

(8950, 18)


Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


Data Preprocessing

Descriptive Statistics of Data

In [23]:
data.describe()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
count,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8949.0,8950.0,8637.0,8950.0,8950.0
mean,1564.474828,0.877271,1003.204834,592.437371,411.067645,978.871112,0.490351,0.202458,0.364437,0.135144,3.248827,14.709832,4494.44945,1733.143852,864.206542,0.153715,11.517318
std,2081.531879,0.236904,2136.634782,1659.887917,904.338115,2097.163877,0.401371,0.298336,0.397448,0.200121,6.824647,24.857649,3638.815725,2895.063757,2372.446607,0.292499,1.338331
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.019163,0.0,6.0
25%,128.281915,0.888889,39.635,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,1.0,1600.0,383.276166,169.123707,0.0,12.0
50%,873.385231,1.0,361.28,38.0,89.0,0.0,0.5,0.083333,0.166667,0.0,0.0,7.0,3000.0,856.901546,312.343947,0.0,12.0
75%,2054.140036,1.0,1110.13,577.405,468.6375,1113.821139,0.916667,0.3,0.75,0.222222,4.0,17.0,6500.0,1901.134317,825.485459,0.142857,12.0
max,19043.13856,1.0,49039.57,40761.25,22500.0,47137.21176,1.0,1.0,1.0,1.5,123.0,358.0,30000.0,50721.48336,76406.20752,1.0,12.0


In [None]:
# Customer ID definitely has no meaning at all in terms of building a clustering model (or any prediction model).
# So the first thing we want to do is to drop this customer Id column.
data.drop(['CUST_ID'], axis=1, inplace=True)

In [25]:
# As a next step, we would also like to see how many null values are there, and if there is one, we will need to handle them.
data.isnull().sum()

BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64

In [26]:
# The easiest way to handle the columns with null values is either to drop those rows, or fill with mean value.
# Here, we would just fill it with mean value
data = data.fillna(data.mean())

# Verify all the columns are filled
data.isnull().sum()

BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
TENURE                              0
dtype: int64

In [27]:
# K-mean clustering model uses "distances" between data points to put them into groups
# Logically, for it to work well, we will need to standardize the unit of the "distances"
# Therefore, we would like to scale the dataset. Here, again, we would use the one of the most commonly used method - StandardScaler.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)

In [28]:
#PART 3

In [None]:
from sklearn.cluster import KMeans

# Let's start with 4 clusters
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(data_scaled)

In [16]:
# Usually after building the model, you would like to evaluate it with different metrics
# In general:
# Decreasing the WCSS is the key objective
# Silhouette coefficient should be nearer to +1
# Lower the value of Davies-Bouldin Index would improve the performance.

# We are not going to go through these metrics here. If you are interested, please do read about the math behind these metrics.

from sklearn.metrics import silhouette_score,calinski_harabasz_score,davies_bouldin_score

labels = kmeans.fit_predict(data)

print("Silhouette Coefficient: %0.3f" % silhouette_score(data, labels))
print("Calinski-Harabasz Index: %0.3f" % calinski_harabasz_score(data, labels))
print("Davies-Bouldin Index: %0.3f" % davies_bouldin_score(data, labels))

Silhouette Coefficient: 0.316
Calinski-Harabasz Index: 2015.345
Davies-Bouldin Index: 1.150


In [29]:
#PART 4

In [None]:
# Usually people would use Elbow Method with WCSS (Within-Cluster Sum of Square).
# We would also start with that

# To do that, we would need to build multiple models with varying "number of clusters (i.e. K)"
# We can extract the wcss value from the model by using the inertia_ field
# We would then put value of WCSS in each of model into an array and visualize them later

wcss = []

for i in range(2, 16): 
    kmeans = KMeans(n_clusters = i, random_state = 0)
    kmeans.fit(data) 
    wcss.append(kmeans.inertia_)
    

In [None]:
# After having the series of WCSS values, we can then plot it out, and see what is the optimal K value.

import matplotlib.pyplot as plt

kvalue = range(2, 16)
plt.plot(kvalue, wcss, marker='x')
plt.show()

In [None]:
# Let's try to plot other metrics in the same chart as well

silhouette = []
davies_bouldin = []

for i in range(2, 16): 
    kmeans = KMeans(n_clusters = i, random_state = 0)
    labels = kmeans.fit_predict(data)
    silhouette.append(silhouette_score(data, labels))
    davies_bouldin.append(davies_bouldin_score(data, labels))

In [None]:
kvalue = range(2, 16)
plt.plot(kvalue, silhouette, marker='o')
plt.plot(kvalue, davies_bouldin, marker='*')
plt.show()