# Clustering 예제
## 신용카드를 소지한 고객데이터: 17개 변수로 구성 
예: 구매액, 현금서비스, 사용한도금액, 최소결제금액, 완납비율, 보유기간 등

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the customer table and drop the CUST_ID column in one step.
df = pd.read_csv("creditcard.csv").drop(columns='CUST_ID')
# Fill each missing value with the median of its own column.
column_medians = df.median()
df = df.fillna(column_medians)
# Names of all variables kept at this stage.
xvar = df.columns

#### 결측치 확인 (결측치는 위 셀에서 각 변수의 중앙값으로 대체함)

In [None]:
# Total number of missing values remaining in df, summed over all columns.
df.isna().sum().sum()

#### 우측 치우침이 심한 변수가 많음
#### 로그 변환을 수행 (0에는 로그를 취할 수 없으므로, 1을 더한 후 로그변환)

In [None]:
# Work on a copy: a plain `df1 = df` makes both names refer to the same
# DataFrame, so the transform below would overwrite df as well and re-running
# this cell would apply the logarithm a second time.
df1 = df.copy()
# Variables that receive the log transform.
xvarlog = ['BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
           'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS']
# log1p(x) evaluates log(1 + x), which is defined for x = 0.
df1[xvarlog] = np.log1p(df1[xvarlog])

#### 상관관계 강한 변수들 확인하여 필요하면 제거

In [None]:
# Variables excluded from the final data set.
xvar_drop = [
    'PURCHASES',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_FREQUENCY',
]
# df2 is df1 without those columns; xvar2 holds the remaining column names.
df2 = df1.drop(columns=xvar_drop)
xvar2 = df2.columns

#### df2 는 최종 데이터, xvar2는 최종 변수명

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df2.
plt.figure(figsize=(20, 20))
corr_matrix = df2.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

# 신용카드 고객 세분화 (군집분석)
#### 군집분석을 위한 패키지 호출

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

In [None]:
# Standardise the columns of df2 with StandardScaler and wrap the resulting
# array in a DataFrame that carries the original column names.
scaler = StandardScaler()
df2_std = pd.DataFrame(scaler.fit_transform(df2), columns=xvar2)

## K-means clustering

In [None]:
# Silhouette score of K-means for k = 2 .. 10, used to choose the number of clusters.
np.random.seed(0)
silhouette_kmean = []
for n_cluster in range(2, 11):
    # random_state=0 matches the final K-means model fitted later, so the
    # score recorded for a given k describes the same fit that is used there
    # and the table is reproducible on every run.
    clus = KMeans(n_clusters=n_cluster, random_state=0).fit_predict(df2_std)
    sil_score = silhouette_score(df2_std, clus)
    silhouette_kmean.append((n_cluster, sil_score))
df_kmean = pd.DataFrame(silhouette_kmean, columns=['number_of_clusters', 'sil_score'])
# Best-scoring k first.
df_kmean.sort_values('sil_score', ascending=False)

In [None]:
# Customers per cluster
np.random.seed(0)
k = 3
kmean = KMeans(random_state=0, n_clusters=k)
# Fit the model, then read the label assigned to each row.
kmean.fit(df2_std)
clus = kmean.labels_
pd.Series(clus).value_counts()

In [None]:
# Attach the K-means label of each row to the unscaled data (joined on the index).
kmean_labels = pd.DataFrame({'kmean_label': clus})
X = df2.join(kmean_labels)
X

In [None]:
# Density plot of each of the first 13 columns in xvar2, split by K-means
# label, drawn into consecutive cells of a 6 x 3 subplot grid.
plt.figure(figsize=(15, 25))
for panel in range(1, 14):
    plt.subplot(6, 3, panel)
    sns.kdeplot(data=X, x=xvar2[panel - 1], hue="kmean_label", palette="Set2")
plt.show()

In [None]:
# Subset of variables (plus the cluster label) shown in the pair plot.
xvarsub = [
    'BALANCE',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_TRX',
    'kmean_label',
]
dfgraph = X[xvarsub]
# Nearly transparent markers (alpha=0.01) are used for the scatter panels.
pairplot_options = dict(hue="kmean_label", palette="Set2", plot_kws={'alpha': 0.01})
sns.pairplot(dfgraph, **pairplot_options)
# A kind="kde" pair plot with the same hue/palette is an alternative, but it is slow.
plt.show()

In [None]:
# Mean of every selected variable within each K-means cluster.
dfgraph.groupby(by='kmean_label').agg('mean')

### K-means 요약
#### Cluster 0 : 일시불 구매가 많고, 구매횟수 높음
#### Cluster 1 : 청구액 적고, 현금서비스 적음
#### Cluster 2 : 할부금액 적고, 구매횟수 적고, 현금서비스 많음.

## Mean-Shift clustering

In [None]:
# Silhouette score and number of clusters of Mean-Shift for each candidate bandwidth.
np.random.seed(0)
silhouette_meanshift = []
# np.arange(3.3, 3.3, 0.5) is empty because start == stop, so a loop over it
# never runs and the result table stays empty.  The candidates are listed
# explicitly instead; add more values to widen the search (each fit is slow).
bandwidths = [3.3]
for bandwidth in bandwidths:
    meanshift = MeanShift(bandwidth=bandwidth)
    clus = meanshift.fit_predict(df2_std)
    n_clusters = len(set(meanshift.labels_))
    # silhouette_score raises ValueError when fewer than two distinct labels
    # exist, so record NaN for a bandwidth that yields a single cluster.
    sil_score = silhouette_score(df2_std, clus) if n_clusters > 1 else np.nan
    silhouette_meanshift.append((bandwidth, sil_score, n_clusters))
df_meanshift = pd.DataFrame(silhouette_meanshift, columns=['bandwidth', 'sil_score', 'number_of_clusters'])
# Best-scoring bandwidth first.
df_meanshift.sort_values('sil_score', ascending=False)

In [None]:
# Final Mean-Shift model: fit with the chosen bandwidth and count rows per cluster.
np.random.seed(0)
bandwidth = 3.3
# This fit is very slow.  According to the original note, sklearn v0.24 and
# later also accept e.g. MeanShift(bandwidth=bandwidth, max_iter=100).
meanshift = MeanShift(bandwidth=bandwidth)
meanshift.fit(df2_std)
clus = meanshift.labels_
pd.Series(clus).value_counts()

In [None]:
# Attach the Mean-Shift label of each row to the unscaled data (joined on the index).
meanshift_labels = pd.DataFrame({'meanshift_label': clus})
X = df2.join(meanshift_labels)
# Density plot of each of the first 13 columns in xvar2, split by Mean-Shift
# label, drawn into consecutive cells of a 6 x 3 subplot grid.
plt.figure(figsize=(15, 25))
for panel in range(1, 14):
    plt.subplot(6, 3, panel)
    sns.kdeplot(data=X, x=xvar2[panel - 1], hue="meanshift_label", palette="Set2")
plt.show()

### Mean-Shift 요약
#### 의미있는 군집이 만들어지지 않음

## DBSCAN

In [None]:
# Silhouette score and number of labels of DBSCAN over a small (eps, min_samples) grid.
np.random.seed(0)
siliuette_dbscan = []
# np.linspace names both endpoints and the number of points explicitly.  With
# a float step, np.arange(1.9, 2.1, 0.1) reaches 2.1 only through
# floating-point rounding, which is fragile for the eps value used afterwards.
for eps in np.linspace(1.9, 2.1, 3):
    for min_sample in range(10, 14):
        dbscan = DBSCAN(eps=eps, min_samples=min_sample)
        clus = dbscan.fit_predict(df2_std)
        # Number of distinct labels; DBSCAN's noise label (-1) is counted too.
        n_labels = len(set(dbscan.labels_))
        # silhouette_score raises ValueError when fewer than two distinct
        # labels exist, so record NaN for such a parameter combination.
        sil_score = silhouette_score(df2_std, clus) if n_labels > 1 else np.nan
        siliuette_dbscan.append((eps, min_sample, sil_score, n_labels))
df_dbscan = pd.DataFrame(siliuette_dbscan, columns=['eps', 'min_samples', 'sil_score', 'number_of_clusters'])
# Best-scoring parameter combination first.
df_dbscan.sort_values('sil_score', ascending=False)

In [None]:
# Final DBSCAN model: fit with the chosen parameters and count rows per label.
np.random.seed(0)
eps = 2.1
min_samples = 12
dbscan = DBSCAN(min_samples=min_samples, eps=eps)
dbscan.fit(df2_std)
clus = dbscan.labels_
pd.Series(clus).value_counts()

### DBSCAN 요약
#### 의미있는 군집이 만들어지지 않음

## GMM

In [None]:
# Silhouette score of Gaussian mixtures for 2, 4, 6 and 8 components and
# three covariance structures.
np.random.seed(0)
siliuette_GMM = []
for n_cluster in (2, 4, 6, 8):
    for covariance_type in ('tied', 'diag', 'full'):
        gmm = GaussianMixture(
            n_components=n_cluster,
            covariance_type=covariance_type,
            random_state=0,
        )
        clus = gmm.fit_predict(df2_std)
        sil_score = silhouette_score(df2_std, clus)
        siliuette_GMM.append((n_cluster, sil_score, covariance_type))
gmm_columns = ['number_of_clusters', 'sil_score', 'covariance_type']
df_gmm = pd.DataFrame(siliuette_GMM, columns=gmm_columns)
# Best-scoring configuration first.
df_gmm.sort_values(by='sil_score', ascending=False)

In [None]:
# Final Gaussian mixture: two components with a tied covariance matrix;
# then count the rows assigned to each component.
np.random.seed(0)
gmm = GaussianMixture(
    n_components=2,
    covariance_type='tied',
    random_state=0,
)
clus = gmm.fit_predict(df2_std)
label_series = pd.Series(clus)
label_series.value_counts()

In [None]:
# Attach the GMM label of each row to the unscaled data (joined on the index).
gmm_labels = pd.DataFrame({'GMM_label': clus})
X = df2.join(gmm_labels)
X

In [None]:
# Density plot of each of the first 13 columns in xvar2, split by GMM label,
# drawn into consecutive cells of a 6 x 3 subplot grid.
plt.figure(figsize=(15, 25))
for panel in range(1, 14):
    plt.subplot(6, 3, panel)
    sns.kdeplot(data=X, x=xvar2[panel - 1], hue="GMM_label", palette="Set2")
plt.show()

In [None]:
# Subset of variables (plus the cluster label) shown in the KDE pair plot.
xvarsub = [
    'BALANCE',
    'CASH_ADVANCE',
    'PURCHASES_TRX',
    'GMM_label',
]
dfgraph = X[xvarsub]
pairplot_options = dict(kind="kde", hue="GMM_label", palette="Set2")
sns.pairplot(dfgraph, **pairplot_options)
plt.show()

In [None]:
# Mean of every selected variable within each GMM cluster.
dfgraph.groupby(by='GMM_label').agg('mean')

#### 두 군집의 차이가 가장 큰 변수는 '현금서비스'임. 그 외에 '청구액', '할부구매액', '구매횟수'도 차이를 보임

#### 군집1은 현금서비스 금액이 높은 그룹
#### 군집0은 현금서비스 금액이 낮은 그룹