# 1. Iris 데이터 얻기

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris dataset from scikit-learn and wrap its measurements in a
# DataFrame that uses short, identifier-friendly column names.
iris = load_iris()
feature_name = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
iris_df = pd.DataFrame(data=iris.data, columns=feature_name)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# 2. k-means 군집화

In [2]:
from sklearn.cluster import KMeans

# Fit on the four measurement columns only. Later cells append 'cluster' and
# 'silhouette_coeff' columns to iris_df, so fitting on the whole frame would
# silently include those derived columns if this cell were ever re-run.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=10).fit(iris_df[feature_name])

# Cluster index assigned to each sample, in row order.
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)

In [3]:
# Store each sample's k-means cluster label on the frame as a new 'cluster' column.
# NOTE: this mutates iris_df in place, so iris_df no longer holds features only.
iris_df['cluster'] = kmeans.labels_

# 3. 실루엣 계수 확인 및 추가

In [4]:
from sklearn.metrics import silhouette_samples

# Silhouette coefficient of every individual sample under the k-means labels.
score_samples = silhouette_samples(X=iris.data, labels=iris_df['cluster'])
score_samples.shape

(150,)

In [5]:
# Print the per-sample silhouette coefficients computed above.
print(score_samples)

[ 0.85258191  0.8149163   0.82879659  0.80435199  0.84891774  0.7477614
  0.82109757  0.8534934   0.75127806  0.82475199  0.80269525  0.83540396
  0.80996716  0.74535637  0.70224038  0.64338856  0.77526857  0.85063272
  0.70636742  0.81987458  0.78364122  0.82546665  0.79244412  0.79345251
  0.77426479  0.79799564  0.83296108  0.84162136  0.84325046  0.81723402
  0.8145665   0.79849826  0.76227879  0.72184605  0.82822866  0.83179477
  0.79374024  0.84148042  0.76779363  0.84992603  0.84902387  0.63767419
  0.78589195  0.79963167  0.74629437  0.80915172  0.81295565  0.81844011
  0.81783314  0.85168255 -0.02672203  0.36827154  0.08489767  0.59610609
  0.35669884  0.59416206  0.27019063  0.27245453  0.32755642  0.58167084
  0.38107057  0.58665651  0.55085685  0.47851679  0.56757079  0.30479553
  0.5596951   0.61137194  0.46029717  0.61379722  0.32789488  0.58776704
  0.30600028  0.49067062  0.49172115  0.3699518   0.10228902  0.13663605
  0.55169511  0.51160226  0.59709372  0.56598886  0.

# 4. 평균 실루엣 계수 구하기

In [7]:
from sklearn.metrics import silhouette_score

# Mean silhouette coefficient over all samples for the k-means assignment.
avg_score = silhouette_score(X=iris.data, labels=iris_df['cluster'])
print(avg_score)

0.551191604619592


In [8]:
# Store the per-sample silhouette coefficients next to the cluster labels so
# they can be aggregated per cluster. NOTE: mutates iris_df in place.
iris_df['silhouette_coeff'] = score_samples

In [9]:
# Average silhouette coefficient within each cluster.
(
    iris_df
    .groupby('cluster')['silhouette_coeff']
    .mean()
)

cluster
0    0.436842
1    0.797604
2    0.422323
Name: silhouette_coeff, dtype: float64

# 5. 시각화

In [10]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import math
import numpy as np

In [11]:
def get_cluster_labels(n_clusters, X_features):
    """Cluster ``X_features`` with k-means and return the label of each sample.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to ``KMeans``.
    X_features : array-like
        Feature matrix handed to the estimator's ``fit_predict``.

    Returns
    -------
    The per-sample cluster indices produced by ``fit_predict``.
    """
    # Fixed seed (random_state=10) keeps the assignment reproducible across runs.
    estimator = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        random_state=10,
    )
    return estimator.fit_predict(X_features)

In [12]:
def get_silhouette_score(X_features, cluster_labels):
    """Return the mean and the per-sample silhouette coefficients.

    Parameters
    ----------
    X_features : array-like
        Feature matrix the labels were computed from.
    cluster_labels : array-like
        Cluster label of each sample in ``X_features``.

    Returns
    -------
    tuple
        ``(sil_avg, sil_values)``: the mean silhouette coefficient over all
        samples, and the array of per-sample coefficients.
    """
    sil_values = silhouette_samples(X_features, cluster_labels)
    # silhouette_score() is the mean of silhouette_samples(); deriving the
    # average from the values already computed avoids a second full pass over
    # the pairwise distances.
    sil_avg = float(sil_values.mean())
    return sil_avg, sil_values

In [13]:
def set_axs(n_clusters, X_features, sil_avg, axs, idx):
    """Apply title, labels, limits and ticks to ``axs[idx]`` for a silhouette plot.

    Parameters
    ----------
    n_clusters : int
        Cluster count shown in the title and used to size the y-axis.
    X_features : sized
        Only its length is used, to size the y-axis.
    sil_avg : float
        Mean silhouette score; shown rounded to 3 decimals in the title and
        marked with a dashed red vertical line.
    axs : indexable collection of axes
        Container holding the target axes.
    idx : int
        Position of the target axes inside ``axs``.
    """
    ax = axs[idx]

    score_text = str(round(sil_avg, 3))
    heading = f'Number of Cluster : {n_clusters!s}\nSilhouette Score :{score_text}'
    ax.set_title(heading)
    ax.set_xlabel('The silhouette coefficient values')
    ax.set_ylabel('Cluster label')

    # One y-unit per sample plus 10 units of padding per cluster and one extra.
    y_top = len(X_features) + 10 * (n_clusters + 1)
    ax.set_xlim([-0.1, 1])
    ax.set_ylim([0, y_top])

    ax.set_yticks([])
    ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.axvline(x=sil_avg, color='red', linestyle='--')

In [14]:
def show_bargraph(n_clusters, cluster_labels, sil_values, axs, idx):
    """Draw one filled horizontal band per cluster on ``axs[idx]``.

    Each band contains the sorted silhouette coefficients of the samples
    assigned to that cluster, stacked vertically with a fixed gap between
    bands and annotated with the cluster index.

    Parameters
    ----------
    n_clusters : int
        Number of clusters; labels ``0 .. n_clusters - 1`` are drawn.
    cluster_labels : array
        Cluster label of each sample, compared element-wise against each index.
    sil_values : array
        Per-sample silhouette coefficients, aligned with ``cluster_labels``.
    axs : indexable collection of axes
        Container holding the target axes.
    idx : int
        Position of the target axes inside ``axs``.
    """
    ax = axs[idx]
    gap = 10
    band_start = gap

    for label in range(n_clusters):
        # np.sort returns a sorted copy, leaving sil_values untouched.
        band_values = np.sort(sil_values[cluster_labels == label])
        band_size = len(band_values)
        band_end = band_start + band_size

        band_color = cm.nipy_spectral(label / n_clusters)
        ax.fill_betweenx(
            np.arange(band_start, band_end),
            0,
            band_values,
            facecolor=band_color,
            edgecolor=band_color,
            alpha=0.7,
        )
        # Put the cluster index to the left of zero, halfway up the band.
        ax.text(-0.05, band_start + 0.5 * band_size, str(label))

        band_start = band_end + gap

In [None]:
def visualize_silhouette(cluster_list, X_features):
    """Plot silhouette diagrams side by side, one per candidate cluster count.

    For every entry of ``cluster_list`` the data is clustered with k-means,
    the silhouette coefficients are computed, and one subplot is drawn.

    Parameters
    ----------
    cluster_list : sequence of int
        Cluster counts to compare; one subplot column is created per entry.
    X_features : array-like
        Feature matrix to cluster and score.

    Raises
    ------
    ValueError
        If ``cluster_list`` is empty.
    """
    n_cols = len(cluster_list)
    if n_cols == 0:
        raise ValueError('cluster_list must contain at least one cluster count')

    # squeeze=False always yields a 2-D array of Axes. Without it a single
    # column returns a bare Axes object and indexing it below would fail.
    fig, axs = plt.subplots(figsize=(4*n_cols, 4), nrows=1, ncols=n_cols, squeeze=False)
    axs = axs.ravel()

    for ind, n_cluster in enumerate(cluster_list):
        cluster_labels = get_cluster_labels(n_cluster, X_features)
        sil_avg, sil_values = get_silhouette_score(X_features, cluster_labels)
        set_axs(n_cluster, X_features, sil_avg, axs, ind)
        show_bargraph(n_cluster, cluster_labels, sil_values, axs, ind)

> 위 그래프를 보면 클러스터가 2개일 때 실루엣 점수가 가장 높고, 그다음으로 높은 것은 클러스터가 4개인 경우이다.

> iris 데이터의 경우 클러스터가 2개일 때 실루엣 점수가 가장 높고, 클러스터 수가 늘어날수록 점수가 점차 낮아지는 것을 볼 수 있다.