# 군집분석 (Dimension Reduction 데이터)

## 신용카드를 소지한 고객 데이터: 17개 변수로 구성 
예: 구매액, 현금서비스, 사용한도금액, 최소결제금액, 완납비율, 보유기간 등

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the customer table and drop the CUST_ID column in one chained step.
df = pd.read_csv("creditcard.csv").drop(columns='CUST_ID')

#### 결측치 확인 및 처리

In [None]:
# Fill each missing cell with the median of its own column.
df = df.fillna(value=df.median())
# Total number of missing cells left in the frame (displayed as cell output).
df.isna().sum().sum()

#### 우측 치우침이 심한 변수가 많음
#### 로그 변환을 수행 (0에는 로그를 취할 수 없으므로, 1을 더한 후 로그변환)

In [None]:
# Column labels, kept so later cells can relabel arrays and index variables.
xvar = df.columns
# Work on an independent copy. A plain `df1 = df` only aliases the frame, so
# the in-place transform below would also overwrite `df`, and re-running this
# cell would apply the logarithm a second time.
df1 = df.copy()
# Variables that receive the log(1 + x) transform.
xvarlog = ['BALANCE', 'PURCHASES','ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',     
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS','MINIMUM_PAYMENTS']
# log1p(x) == log(1 + x); adding 1 keeps zero values finite.
df1[xvarlog] = np.log1p(df1[xvarlog])

#### 표준화

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Standardize every column of df1 and wrap the resulting array back into a
# DataFrame that carries the original column labels.
scaler = StandardScaler()
df1_std = pd.DataFrame(scaler.fit_transform(df1), columns=xvar)

## 차원축소

#### 주성분 2개 추출

In [None]:
# Project the standardized data onto two principal components and label
# the resulting columns PC1 / PC2.
pca2 = PCA(n_components=2)
df1_pc2 = pd.DataFrame(pca2.fit_transform(df1_std), columns=['PC1', 'PC2'])
df1_pc2

In [None]:
# Scatter of the two principal components; the low alpha keeps dense
# regions readable.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=df1_pc2, x="PC1", y="PC2", alpha=0.05, ax=ax)
plt.show()

#### t-SNE 방법

In [None]:
from sklearn.manifold import TSNE
# Seed NumPy's global RNG before fitting; the TSNE estimator below is not
# given an explicit random_state.
np.random.seed(0)
tsne = TSNE(n_components=2, perplexity=30) # perplexity sets the range of neighbours considered
# Embed the standardized data into two dimensions (returns an array).
df1_tsne2 = tsne.fit_transform(df1_std)

In [None]:
# Wrap the t-SNE embedding in a DataFrame with named coordinate columns.
df1_tsne2 = pd.DataFrame(df1_tsne2, columns=['tsne1', 'tsne2'])
df1_tsne2

In [None]:
# Scatter of the two t-SNE coordinates, drawn with low alpha.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=df1_tsne2, x="tsne1", y="tsne2", alpha=0.05, ax=ax)
plt.show()

# 신용카드 고객 세분화 (PC 데이터)
#### 군집분석을 위한 패키지 호출

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

## K-means clustering

In [None]:
# Silhouette score of K-means on the PCA data for k = 2..10.
np.random.seed(0)
silhouette_kmean = [] 
for n_cluster in range(2, 11):
    # Pin random_state on every fit. Without it each model draws from the
    # global RNG, whose state advances from one k to the next, so the
    # clustering scored here could differ from the final model that is
    # fitted with random_state=0.
    clus = KMeans(n_clusters=n_cluster, random_state=0).fit_predict(df1_pc2)
    sil_score = silhouette_score(df1_pc2, clus) 
    silhouette_kmean.append((n_cluster, sil_score))
df_kmean = pd.DataFrame(silhouette_kmean, columns=['number_of_clusters', 'sil_score'])
# Best-scoring cluster counts first.
df_kmean.sort_values('sil_score', ascending=False)

In [None]:
# Customers per cluster
np.random.seed(0)
k = 2
kmean = KMeans(n_clusters=k, random_state=0)
# Fit once, then read the training labels from the fitted estimator.
kmean.fit(df1_pc2)
clus = kmean.labels_
pd.Series(clus).value_counts()

In [None]:
# Attach the K-means labels to the PCA coordinates (index-aligned join)
# and colour the scatter by cluster.
X = df1_pc2.join(pd.Series(clus, name='kmean_label'))
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=X, x="PC1", y="PC2", hue="kmean_label", alpha=0.05, ax=ax)
plt.show()

## GMM

In [None]:
# Silhouette score of Gaussian mixtures on the PCA data, over
# 2/4/6/8 components and three covariance structures.
np.random.seed(0)
siliuette_GMM = []
for n_cluster in range(2, 10, 2):
    for covariance_type in ('tied', 'diag', 'full'):
        gmm = GaussianMixture(
            n_components=n_cluster,
            covariance_type=covariance_type,
            random_state=0,
        )
        clus = gmm.fit_predict(df1_pc2)
        sil_score = silhouette_score(df1_pc2, clus)
        siliuette_GMM.append((n_cluster, sil_score, covariance_type))
df_gmm = pd.DataFrame(
    siliuette_GMM,
    columns=['number_of_clusters', 'sil_score', 'covariance_type'],
)
# Best-scoring configurations first.
df_gmm.sort_values(by='sil_score', ascending=False)

In [None]:
# Fit the two-component, full-covariance mixture on the PCA data and
# count how many customers land in each component.
np.random.seed(0)
gmm = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=0,
)
clus = gmm.fit_predict(df1_pc2)
pd.Series(clus).value_counts()

In [None]:
# Attach the GMM labels to the PCA coordinates (index-aligned join)
# and colour the scatter by component.
X = df1_pc2.join(pd.Series(clus, name='GMM_label'))
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=X, x="PC1", y="PC2", hue="GMM_label", alpha=0.05, ax=ax)
plt.show()

In [None]:
# Per-component density of the transformed variables, one panel each.
# NOTE(review): only the first 13 entries of xvar are drawn on the 6x3
# grid -- confirm that the remaining columns are omitted on purpose.
X = df1.join(pd.Series(clus, name='GMM_label'))
plt.figure(figsize=(15, 25))
for i in range(13):
    ax = plt.subplot(6, 3, i + 1)
    sns.kdeplot(data=X, x=xvar[i], hue="GMM_label", palette="Set2", ax=ax)
plt.show()

In [None]:
# CASH_ADVANCE against INSTALLMENTS_PURCHASES, coloured by GMM component.
# X is the labelled frame built in the preceding cell.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    data=X,
    x="CASH_ADVANCE",
    y="INSTALLMENTS_PURCHASES",
    hue="GMM_label",
    alpha=0.05,
    ax=ax,
)
plt.show()

# 신용카드 고객 세분화 (t-SNE 데이터)

## K-means clustering

In [None]:
# Silhouette score of K-means on the t-SNE embedding for k = 2..10.
np.random.seed(0)
silhouette_kmean = [] 
for n_cluster in range(2, 11):
    # Pin random_state on every fit. Without it each model draws from the
    # global RNG, whose state advances from one k to the next, so the
    # clustering scored here could differ from the final model that is
    # fitted with random_state=0.
    clus = KMeans(n_clusters=n_cluster, random_state=0).fit_predict(df1_tsne2)
    sil_score = silhouette_score(df1_tsne2, clus) 
    silhouette_kmean.append((n_cluster, sil_score))
df_kmean = pd.DataFrame(silhouette_kmean, columns=['number_of_clusters', 'sil_score'])
# Best-scoring cluster counts first.
df_kmean.sort_values('sil_score', ascending=False)

In [None]:
# Customers per cluster
np.random.seed(0)
k = 6
kmean = KMeans(n_clusters=k, random_state=0)
# Fit once, then read the training labels from the fitted estimator.
kmean.fit(df1_tsne2)
clus = kmean.labels_
pd.Series(clus).value_counts()

In [None]:
# Attach the K-means labels to the t-SNE coordinates (index-aligned join)
# and colour the scatter by cluster.
X = df1_tsne2.join(pd.Series(clus, name='kmean_label'))
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=X,
    x="tsne1",
    y="tsne2",
    hue="kmean_label",
    alpha=0.05,
    palette='Set2',
    ax=ax,
)
plt.show()

In [None]:
# Per-cluster density of the transformed variables, one panel each.
# NOTE(review): only the first 13 entries of xvar are drawn on the 6x3
# grid -- confirm that the remaining columns are omitted on purpose.
X = df1.join(pd.Series(clus, name='kmean_label'))
plt.figure(figsize=(15, 25))
for i in range(13):
    ax = plt.subplot(6, 3, i + 1)
    sns.kdeplot(data=X, x=xvar[i], hue="kmean_label", palette="Set2", ax=ax)
plt.show()

## GMM

In [None]:
# Silhouette score of Gaussian mixtures on the t-SNE embedding, over
# 2/4/6/8 components and three covariance structures.
np.random.seed(0)
siliuette_GMM = []
for n_cluster in range(2, 10, 2):
    for covariance_type in ('tied', 'diag', 'full'):
        gmm = GaussianMixture(
            n_components=n_cluster,
            covariance_type=covariance_type,
            random_state=0,
        )
        clus = gmm.fit_predict(df1_tsne2)
        sil_score = silhouette_score(df1_tsne2, clus)
        siliuette_GMM.append((n_cluster, sil_score, covariance_type))
df_gmm = pd.DataFrame(
    siliuette_GMM,
    columns=['number_of_clusters', 'sil_score', 'covariance_type'],
)
# Best-scoring configurations first.
df_gmm.sort_values(by='sil_score', ascending=False)

In [None]:
# Fit the six-component, diagonal-covariance mixture on the t-SNE
# embedding and count how many customers land in each component.
np.random.seed(0)
gmm = GaussianMixture(
    n_components=6,
    covariance_type='diag',
    random_state=0,
)
clus = gmm.fit_predict(df1_tsne2)
pd.Series(clus).value_counts()

In [None]:
# Attach the GMM labels to the t-SNE coordinates (index-aligned join)
# and colour the scatter by component.
X = df1_tsne2.join(pd.Series(clus, name='GMM_label'))
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=X,
    x="tsne1",
    y="tsne2",
    hue="GMM_label",
    alpha=0.05,
    palette="Set2",
    ax=ax,
)
plt.show()

In [None]:
# Per-component density of the transformed variables, one panel each.
# NOTE(review): only the first 13 entries of xvar are drawn on the 6x3
# grid -- confirm that the remaining columns are omitted on purpose.
X = df1.join(pd.Series(clus, name='GMM_label'))
plt.figure(figsize=(15, 25))
for i in range(13):
    ax = plt.subplot(6, 3, i + 1)
    sns.kdeplot(data=X, x=xvar[i], hue="GMM_label", palette="Set2", ax=ax)
plt.show()

In [None]:
# CASH_ADVANCE against INSTALLMENTS_PURCHASES, coloured by GMM component.
# X is the labelled frame built in the preceding cell.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    data=X,
    x="CASH_ADVANCE",
    y="INSTALLMENTS_PURCHASES",
    hue="GMM_label",
    alpha=0.2,
    palette="Set2",
    ax=ax,
)
plt.show()