In [1]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import pickle

In [2]:
database = pd.read_csv('../../../database/credit_db.csv',header = 1)

In [3]:
database.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [4]:
database.shape

(30000, 25)

In [5]:
with open('../../../database/credit_db.pkl', mode='rb') as f:
    credit_db = pickle.load(f)

In [6]:
credit_db

array([[-1.13672015, -1.24601985,  1.87637834, ..., -0.58646786,
        -0.5595361 , -0.63662272],
       [-0.3659805 , -1.02904717,  1.87637834, ..., -0.5294098 ,
        -0.53857195, -0.62268393],
       [-0.59720239, -0.16115646, -0.53294156, ..., -0.35612176,
        -0.38415872, -0.40636546],
       ...,
       [-1.05964618,  0.16430256,  1.87637834, ..., -0.27959295,
        -0.28926308, -0.54800865],
       [-0.67427636,  0.59824792,  1.87637834, ..., -1.26536789,
         0.22416754, -0.32707512],
       [-0.90549825,  1.14067961,  1.87637834, ..., -0.06744598,
        -0.32158213, -0.04307832]], shape=(30000, 11))

In [7]:
credit_db.shape

(30000, 11)

MANUALLY PREPROCESSED DATA

WCSS METRIC

In [8]:
m1_wcss = []
for i in range (2,11):
    m1_kmeans_credit = KMeans(n_clusters=i, n_init="auto", random_state=42)
    m1_kmeans_credit.fit(credit_db)
    m1_wcss.append(m1_kmeans_credit.inertia_)

In [9]:
m1_wcss_graph = px.line(x = range(2,11), y = m1_wcss)
m1_wcss_graph.show()

In [10]:
m1_wcss

[188848.56234133846,
 154795.82753488157,
 130080.03975895536,
 114989.63137273306,
 102882.70621595556,
 93215.21273630933,
 88192.2661793787,
 84716.25382828334,
 82477.16276078837]

SILHOUETTE SCORE METRIC

In [11]:
m1_s_score = []
for i in range (2,11):
    m1_kmeans_credit = KMeans(n_clusters=i,n_init='auto', random_state=42)
    m1_labels = m1_kmeans_credit.fit_predict(credit_db)
    m1_score = silhouette_score(credit_db, m1_labels)
    m1_s_score.append(m1_score)

In [12]:
m1_s_score_graph = px.line(x = range(2,11), y = m1_s_score)
m1_s_score_graph.show()

In [13]:
m1_s_score

[np.float64(0.5487401975667379),
 np.float64(0.39483313016664695),
 np.float64(0.3197235425914109),
 np.float64(0.2737008440346446),
 np.float64(0.27276630782490696),
 np.float64(0.27434944694896846),
 np.float64(0.26653219716060933),
 np.float64(0.2429929680455627),
 np.float64(0.24492272075017354)]

PCA DIMENTIONALITY REDUCTION

In [14]:
pca_db_scaled = database
scaler_credit = StandardScaler()
pca_db_scaled = scaler_credit.fit_transform(pca_db_scaled)

In [15]:
pca_db_scaled.shape

(30000, 25)

WCSS METRIC

In [16]:
m2_wcss = []
wcss_comp = [2,6,10,14]
wcss_clus = [2,5,8,10]
for i in wcss_comp:
    m2_wcss_pca = PCA(n_components=i, random_state=42)
    m2_wcss_db = m2_wcss_pca.fit_transform(pca_db_scaled)
    variance = sum(m2_wcss_pca.explained_variance_ratio_)
    for j in wcss_clus:
        m2_km_credit = KMeans(n_clusters=j, n_init='auto', random_state=42)
        m2_km_credit.fit(m2_wcss_db)
        
        m2_wcss.append({
            'components':i,
            'clusters':j,
            'wcss':m2_km_credit.inertia_,
            'variance_explained':variance,
        })

In [17]:
m2_wcss

[{'components': 2,
  'clusters': 2,
  'wcss': 201612.70342040053,
  'variance_explained': np.float64(0.430202225810999)},
 {'components': 2,
  'clusters': 5,
  'wcss': 68445.54426910987,
  'variance_explained': np.float64(0.430202225810999)},
 {'components': 2,
  'clusters': 8,
  'wcss': 45479.91560098445,
  'variance_explained': np.float64(0.430202225810999)},
 {'components': 2,
  'clusters': 10,
  'wcss': 38903.24611521313,
  'variance_explained': np.float64(0.430202225810999)},
 {'components': 6,
  'clusters': 2,
  'wcss': 353513.5987426845,
  'variance_explained': np.float64(0.6333309516696514)},
 {'components': 6,
  'clusters': 5,
  'wcss': 217900.163126432,
  'variance_explained': np.float64(0.6333309516696514)},
 {'components': 6,
  'clusters': 8,
  'wcss': 172824.31553637623,
  'variance_explained': np.float64(0.6333309516696514)},
 {'components': 6,
  'clusters': 10,
  'wcss': 160397.36011810182,
  'variance_explained': np.float64(0.6333309516696514)},
 {'components': 10,
  'c

In [18]:
m2_wcss_df = pd.DataFrame(m2_wcss)

In [19]:
m2_wcss_df

Unnamed: 0,components,clusters,wcss,variance_explained
0,2,2,201612.70342,0.430202
1,2,5,68445.544269,0.430202
2,2,8,45479.915601,0.430202
3,2,10,38903.246115,0.430202
4,6,2,353513.598743,0.633331
5,6,5,217900.163126,0.633331
6,6,8,172824.315536,0.633331
7,6,10,160397.360118,0.633331
8,10,2,463815.51369,0.780429
9,10,5,337286.758094,0.780429


In [20]:
m2_wcss_graph = px.scatter_matrix(m2_wcss_df, dimensions=[m2_wcss_df['clusters'],m2_wcss_df['components']],color=m2_wcss_df['wcss'])
m2_wcss_graph.show()

SILHOUETTE SCORE METRIC

In [21]:
m2_s_score = []
s_score_comp = [2,6,10,14]
s_score_clust = [2,5,8,10]
for i in s_score_comp:
    m2_s_score_pca = PCA(n_components=i, random_state=42)
    m2_s_score_db = m2_s_score_pca.fit_transform(pca_db_scaled)
    for j in s_score_clust:
        m2_km_credit_2 = KMeans(n_clusters = j, n_init='auto', random_state=42)
        m2_km_credit_2.fit(m2_s_score_db)
        m2_labels = m2_km_credit_2.fit_predict(m2_s_score_db)
        m2_score = silhouette_score(m2_s_score_db, m2_labels)

        m2_s_score.append({
            'components':i,
            'clusters':j,
            'silhouette_score': m2_score
            
        })

In [22]:
m2_s_score

[{'components': 2,
  'clusters': 2,
  'silhouette_score': np.float64(0.510283945253583)},
 {'components': 2,
  'clusters': 5,
  'silhouette_score': np.float64(0.46626476479679174)},
 {'components': 2,
  'clusters': 8,
  'silhouette_score': np.float64(0.4109750494389824)},
 {'components': 2,
  'clusters': 10,
  'silhouette_score': np.float64(0.3992269118949452)},
 {'components': 6,
  'clusters': 2,
  'silhouette_score': np.float64(0.39050600093379445)},
 {'components': 6,
  'clusters': 5,
  'silhouette_score': np.float64(0.2317416134043566)},
 {'components': 6,
  'clusters': 8,
  'silhouette_score': np.float64(0.19552589177763322)},
 {'components': 6,
  'clusters': 10,
  'silhouette_score': np.float64(0.18302811388518372)},
 {'components': 10,
  'clusters': 2,
  'silhouette_score': np.float64(0.3556551531228808)},
 {'components': 10,
  'clusters': 5,
  'silhouette_score': np.float64(0.19470309500844452)},
 {'components': 10,
  'clusters': 8,
  'silhouette_score': np.float64(0.1479411568

In [23]:
m2_s_score_df = pd.DataFrame(m2_s_score)

In [24]:
m2_s_score_df

Unnamed: 0,components,clusters,silhouette_score
0,2,2,0.510284
1,2,5,0.466265
2,2,8,0.410975
3,2,10,0.399227
4,6,2,0.390506
5,6,5,0.231742
6,6,8,0.195526
7,6,10,0.183028
8,10,2,0.355655
9,10,5,0.194703


In [25]:
m2_s_score_graph = px.scatter_matrix(m2_s_score_df, dimensions=[m2_s_score_df['clusters'],m2_s_score_df['components']],color=m2_s_score_df['silhouette_score'])
m2_s_score_graph.show()