In [1]:
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.datasets import make_blobs
# Build the synthetic data set used throughout the notebook: 500 points in
# 6 dimensions drawn around 5 centers, one standard deviation per center.
# random_state is fixed so the generated data is the same on every run.
X, y = make_blobs(
    n_samples=500,
    n_features=6,
    centers=5,
    cluster_std=[0.4, 0.3, 0.4, 0.3, 0.4],
    random_state=11,
)

### 接着我们看看默认的谱聚类的效果：

In [3]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Baseline: spectral clustering with every hyper-parameter left at its default.
y_pred = SpectralClustering().fit_predict(X)

# `calinski_harabaz_score` was a misspelled alias that scikit-learn deprecated
# and later removed; `calinski_harabasz_score` is the supported name.
# A single pre-formatted string prints identically under the Python 2 print
# statement and the Python 3 print() function.
print("Calinski-Harabasz Score %s" % metrics.calinski_harabasz_score(X, y_pred))

Calinski-Harabasz Score 14908.9325026


### 由于我们使用的是高斯核，一般需要对n_clusters和gamma进行调参，以选择合适的参数值。代码如下：

In [4]:
# Grid-search the Gaussian kernel coefficient (gamma) and the number of
# clusters, reporting the Calinski-Harabasz score for every combination.
# The original wrapped both lists in enumerate() and bound the same unused
# name `index` in both loops (the inner one shadowing the outer); the
# counters are dropped because nothing reads them.
for gamma in [0.01, 0.1, 1, 10]:
    for k in [3, 4, 5, 6]:
        y_pred = SpectralClustering(n_clusters=k, gamma=gamma).fit_predict(X)
        # `calinski_harabasz_score` is the supported spelling; the old
        # `calinski_harabaz_score` alias was deprecated and later removed.
        score = metrics.calinski_harabasz_score(X, y_pred)
        # One pre-formatted string keeps the output text identical under the
        # Python 2 print statement and the Python 3 print() function.
        print("Calinski-Harabasz Score with gamma= %s n_clusters= %s score: %s"
              % (gamma, k, score))

Calinski-Harabasz Score with gamma= 0.01 n_clusters= 3 score: 1979.77096092
Calinski-Harabasz Score with gamma= 0.01 n_clusters= 4 score: 3154.01841219
Calinski-Harabasz Score with gamma= 0.01 n_clusters= 5 score: 23410.63895
Calinski-Harabasz Score with gamma= 0.01 n_clusters= 6 score: 19296.8617974
Calinski-Harabasz Score with gamma= 0.1 n_clusters= 3 score: 1979.77096092
Calinski-Harabasz Score with gamma= 0.1 n_clusters= 4 score: 3154.01841219
Calinski-Harabasz Score with gamma= 0.1 n_clusters= 5 score: 23410.63895
Calinski-Harabasz Score with gamma= 0.1 n_clusters= 6 score: 19427.9618944
Calinski-Harabasz Score with gamma= 1 n_clusters= 3 score: 980.863594044
Calinski-Harabasz Score with gamma= 1 n_clusters= 4 score: 1227.88206114
Calinski-Harabasz Score with gamma= 1 n_clusters= 5 score: 23410.63895
Calinski-Harabasz Score with gamma= 1 n_clusters= 6 score: 1250.76566111




Calinski-Harabasz Score with gamma= 10 n_clusters= 3 score: 34.4675659539
Calinski-Harabasz Score with gamma= 10 n_clusters= 4 score: 37.621130103
Calinski-Harabasz Score with gamma= 10 n_clusters= 5 score: 30.5582744784
Calinski-Harabasz Score with gamma= 10 n_clusters= 6 score: 40.5064012421


### 可见最好的n_clusters是5；当n_clusters=5时，gamma取0.01、0.1或1得到的分数相同（23410.63895），都是最优的高斯核参数，而gamma=10时效果明显变差。
我们可以看看不指定n_clusters（即使用默认值8）、仅使用上述最优gamma之一0.1时的聚类效果，代码如下：

In [5]:
# Keep n_clusters at its default and set only gamma=0.1, to show how much
# the score depends on choosing n_clusters explicitly.
y_pred = SpectralClustering(gamma=0.1).fit_predict(X)

# `calinski_harabasz_score` replaces the deprecated, later removed
# `calinski_harabaz_score` alias. A single pre-formatted string prints
# identically under the Python 2 print statement and Python 3 print().
print("Calinski-Harabasz Score %s" % metrics.calinski_harabasz_score(X, y_pred))

Calinski-Harabasz Score 14950.4939717


### 可见n_clusters一般还是调参选择比较好。