<a href="https://colab.research.google.com/github/DLPY/Unsupervised-Learning-Session-1/blob/main/K_Means_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Clustering Customers based on Bank Account Data**

Detail on Data: https://www.kaggle.com/shrutimechlearn/churn-modelling

## **TODO: Download source data from GitHub**
!wget https://github.com/DLPY/Classification_Session_1/blob/815d80d7c1367925bc148cf698738537d7bdc1c0/Churn_Modelling.csv

In [None]:
# Install the third-party helpers imported below (kneed, colorama) via shell escapes.
!pip install --upgrade kneed
!pip install colorama
# Fetch the raw CSV so that pd.read_csv('Churn_Modelling.csv') can find it locally.
!wget https://raw.githubusercontent.com/DLPY/Classification_Session_1/main/Churn_Modelling.csv

### **1. Import necessary packages for performing K-Means Clustering**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from kneed import KneeLocator
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin, silhouette_score
import seaborn as sns
from sklearn.preprocessing import StandardScaler,normalize, MinMaxScaler
from yellowbrick.cluster import SilhouetteVisualizer
from termcolor import colored
import missingno as msno 
import colorama
from colorama import Fore, Style  # maakes strings colored
from termcolor import colored, cprint

%matplotlib inline

# pd.set_option('display.max_colwidth', None)

### **2. Read data from csv file into Pandas dataframe**

In [None]:
# Load the downloaded churn dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

In [None]:
# Preview the first five rows of the raw data (head() defaults to 5 rows).
df.head()

In [None]:
def first_looking(df):
    """Print a quick overview of ``df`` and normalise its column names in place.

    The report contains, in order: the shape, the ``df.info()`` summary, the
    number of unique values per column, the original column list and the
    column list after renaming. Sections are separated by a bold red rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Its ``columns`` attribute is reassigned (lower-cased,
        with ``&`` and spaces replaced by underscores), so the caller's object
        sees the renamed columns after this call.

    Returns
    -------
    None
    """
    # Build the section separator once instead of repeating the expression.
    separator = colored('-'*79, 'red', attrs=['bold'])

    print(colored("Shape:", attrs=['bold']), df.shape,'\n',
          "There is ", df.shape[0], " observation and ", df.shape[1], " columns in the dataset.", '\n',
          separator,
          colored("\nInfo:\n", attrs=['bold']), sep='')
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be passed to print(): that would emit a stray "None" line.
    df.info()
    print(separator)
    print(colored("Number of Uniques:\n", attrs=['bold']), df.nunique(),'\n',
          separator, sep='')
    print(colored("All Columns:", attrs=['bold']), list(df.columns),'\n',
          separator, sep='')

    # In-place rename: later cells select columns by these lower-case names.
    df.columns= df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

    print(colored("Columns after rename:", attrs=['bold']), list(df.columns),'\n',
          separator, sep='')
###############################################################################

In [None]:
# Print the overview report. Note: this call also renames df's columns in place
# (lower-case, '&' and spaces -> '_'), which the column selections below rely on.
first_looking(df)

### **3. Clustering of Customers based on Estimated Salary and Balance in the Bank**

In [None]:
# Preview the first five rows again, now with the renamed (lower-case) columns.
df.head()

In [None]:
# Wider candidate feature set, kept for reference:
#features = df[['creditscore', 'countrycode', 'gendercode',  'age', 'tenure', 'balance', 'numofproducts', 'hascrcard', 'isactivemember', 'estimatedsalary']]
# Cluster on account balance and estimated salary only. Take an explicit copy:
# a 'cluster' column is assigned to this frame later on, and assigning to a
# slice of `df` would raise pandas' SettingWithCopyWarning.
features = df[['balance', 'estimatedsalary']].copy()

In [None]:
# Preview the first five rows of the two selected features.
features.head()

In [None]:
# Both columns span a wide numeric range, so standardise them before
# clustering. (MinMaxScaler() is the alternative that was considered.)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    features.loc[:, ['balance', 'estimatedsalary']]
)

In [None]:
# Elbow search: fit K-Means for k = 1..10 and record the inertia (SSE) of each.
# Settings shared by every candidate model.
kmeans_kwargs = dict(
    init='k-means++',
    n_init=10,
    max_iter=200,
    random_state=42,
)

# sse[j] holds the SSE of the model fitted with k = j + 1 clusters.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    sse.append(kmeans.inertia_)

In [None]:
# Plot SSE against the number of clusters to look for the "elbow".
plt.style.use('fivethirtyeight')
plt.plot(range(1, 11), sse)
plt.gca().set(
    xticks=range(1, 11),
    title='Elbow Method',
    xlabel='Number of Clusters',
    ylabel='SSE',
)
plt.show()

In [None]:
# Locate the elbow of the (convex, decreasing) SSE curve programmatically with kneed.
kl = KneeLocator(x=range(1, 11), y=sse, curve='convex', direction='decreasing')

print(f'Optimal value for K using this method is {kl.elbow}')

In [None]:
# Silhouette analysis: one silhouette plot per candidate k (2..11) on a 5x2 grid.
fig, ax = plt.subplots(5,2, figsize=(10,15))
fig.tight_layout()
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)

# ax.flat walks the grid row by row, so k=2 lands top-left, k=3 top-right, etc.
for i, panel in zip(range(2, 12), ax.flat):
    # Create a KMeans instance for this number of clusters. Reuse the settings
    # of the elbow search (fixed random_state, n_init, ...) so that the plots
    # are reproducible and comparable with the SSE curve.
    km = KMeans(n_clusters=i, **kmeans_kwargs)

    labels = km.fit(scaled_features).labels_
    score_i = round(metrics.silhouette_score(scaled_features, labels, metric='euclidean'), 2)

    # Create a SilhouetteVisualizer around the KMeans instance, fit it and
    # put the mean silhouette score in the panel title.
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
    visualizer.fit(scaled_features)
    visualizer.ax.set_title(str(i)  + " clusters - Silhouette score " + str(score_i))

In [None]:
# Note that the optimal value for K using this method is 4.
# Ultimately, the decision on the number of clusters to use should be
#   guided by a combination of domain knowledge and clustering evaluation metrics.

In [None]:
# Final model with K=4, the value chosen from the analysis above.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300, random_state=42)

In [None]:
# Fit the final 4-cluster model on the standardised balance/salary features.
kmeans.fit(scaled_features)

In [None]:
# Inertia (SSE) of the fitted model: the sum of squared distances from each
# sample to its closest centroid. Lower is better, but inertia tends to shrink
# as K grows, so a good model balances low inertia against a small K.
kmeans.inertia_

In [None]:
# Final centroid locations, one row per cluster. The model was fitted on
# scaled_features, so these coordinates are in standardised units.
kmeans.cluster_centers_

In [None]:
# Number of iterations the fitted model ran (the cap set above is max_iter=300).
kmeans.n_iter_

In [None]:
# Review the first five cluster labels out of the full array of per-row labels.
kmeans.labels_[:5]

In [None]:
# Attach each row's cluster label as a new 'cluster' column (modifies `features` in place).
features['cluster'] = kmeans.labels_

In [None]:
# Inspect the features alongside their assigned cluster labels.
features

In [None]:
# Group the rows by their assigned cluster label.
features_agg = features.groupby(by="cluster")

In [None]:
# Per-cluster averages of every feature column ...
features_agg_avg = features_agg.mean()
# ... with 'cluster' moved from the index back into an ordinary column.
features_agg_avg = features_agg_avg.reset_index()
features_agg_avg

In [None]:
# Compare the per-cluster averages of estimated salary and balance as paired
# bars. Each metric gets its own y-axis on a shared x-axis.
fig = plt.figure(figsize=(15, 5))

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

width = 0.3

# position=1 / position=0 align the two series on opposite sides of each tick
# so the salary and balance bars sit next to each other instead of overlapping.
features_agg_avg.estimatedsalary.plot(kind='bar', ax=ax, width=width, position=1, color='red', label = 'Estimated Salary')
features_agg_avg.balance.plot(kind='bar', color='blue', ax=ax2, width=width, position=0, label = 'Balance')


# NOTE(review): grid() without arguments toggles the current grid state rather
# than forcing it on; with a style that enables grids by default this may hide
# ax's grid. Confirm the intended result before changing it.
ax.grid()
ax.set_xlabel('Cluster')
ax.tick_params(axis='x', rotation=0)
ax.set_ylabel('Estimated Salary')
ax2.set_ylabel('Balance')
ax.set_ylim(10,200000)
ax2.set_ylim(100,200000)
plt.title("Cluster Vs ( Avg Balance & Estimated Salary)")

# Ask matplotlib for the plotted objects and their labels on both axes, then
# draw ONE combined legend. A separate ax.legend() call would add a second,
# partial legend underneath it. Legend-specific names are used so the cluster
# `labels` array from the silhouette analysis is not overwritten.
handles, legend_labels = ax.get_legend_handles_labels()
handles2, legend_labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles + handles2, legend_labels + legend_labels2, loc=0)
plt.show()

## **4. Summary of K-Means Clustering**


1. Cluster 0 has average estimatedsalary of 149743.41 and balance of 121991.56.
2. Cluster 1 has average estimatedsalary of 50472.47 and balance of 121880.26.
3. Cluster 2 has average estimatedsalary of 149225.68 and balance of 2682.50.
4. Cluster 3 has average estimatedsalary of 49750.21 and balance of 2293.09.

Based on their requirements, banks can target the customer personas represented by the different clusters.