In [23]:
from time import time
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

In [24]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
print('加载的20新闻数据中的数据类别为:',categories)

dataset = fetch_20newsgroups(data_home='datas', subset='all', categories=categories,
                             shuffle=True, random_state=42)
print("%d条数据；%d个新闻类别" % (len(dataset.data), len(dataset.target_names)))

labels = dataset.target

加载的20新闻数据中的数据类别为: ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
3387条数据；4个新闻类别


In [25]:
target_cluster_k = np.unique(labels).shape[0]
features = 2 ** 20
components = 5
mini_batch_km_batchsize = 1000

In [26]:

hasher1 = HashingVectorizer(n_features=features, stop_words='english', non_negative=True, 
                            norm=None, binary=False, token_pattern=u'(?u)\\b\\w\\w+\\b')
tt = TfidfTransformer(norm='l2', use_idf=True)
hasher2 = HashingVectorizer(n_features=features, stop_words='english', non_negative=False,
                            norm='l2', binary=False, token_pattern=u'(?u)\\b\\w\\w+\\b')
tv = TfidfVectorizer(max_df=0.5, max_features=features, min_df=2, stop_words='english', use_idf=True)


vectorizers = [
    ('hashing&tf-idf', make_pipeline(hasher1, tt), False),
    ('hasing', make_pipeline(hasher2), False),
    ('tf-idf', make_pipeline(tv), True)
]

In [30]:
svd = TruncatedSVD(n_components=components)
normalizer = Normalizer(copy=False)
sn = make_pipeline(svd, normalizer)

In [31]:

mbkm = MiniBatchKMeans(n_clusters=target_cluster_k, init='k-means++', n_init=5, 
                       init_size=10 * mini_batch_km_batchsize, batch_size=mini_batch_km_batchsize)

km = KMeans(n_clusters=target_cluster_k, init='k-means++', max_iter=100, n_init=5)

cluster_als = [('Mini-Batch-KMeans', mbkm), ('KMeans', km)]

In [42]:
for vectorizer_name, vectorizer, can_inverse in vectorizers:
    print("============================================")
    print("采用'%s'的方式将文本数据转换为特征矩阵" % vectorizer_name)
    
    t0 = time()
    X = vectorizer.fit_transform(dataset.data)
    print("转换消耗时间:%.3fs" % (time() - t0))
    print("样本数量:%d,特征属性数量:%d" % X.shape)
    
    t0 = time()
    X = sn.fit_transform(X)
    print("SVD分解及归一化消耗时间:%.3fs" % (time() - t0))
    print("降维&归一化操作后，样本数量:%d,特征属性数量:%d" % X.shape)
    
    
    for cluster_name, cluster_al in cluster_als:
        print()
        print("使用算法%s对数据进行建模操作" % cluster_name)
        t0 = time()
        cluster_al.fit(X)
        print("模型构建消耗时间:%.3fs" % (time() - t0))
        print("%s算法效果评估相关系数" % cluster_name)
        print(u"均一性/同质性: %0.3f" % metrics.homogeneity_score(labels, cluster_al.labels_))
        print("完整性: %0.3f" % metrics.completeness_score(labels, cluster_al.labels_))
        print("V-measure: %0.3f" % metrics.v_measure_score(labels, cluster_al.labels_))
        print("Adjusted Rand-Index(ARI): %.3f" % metrics.adjusted_rand_score(labels, cluster_al.labels_))
        print("轮廓系数: %0.3f" % metrics.silhouette_score(X, cluster_al.labels_, sample_size=1000))
        print("聚类中心点为:", cluster_al.cluster_centers_)
        
        if can_inverse:
            print("获取文本转换特征矩阵中，各个分类考虑特征属性的前10个feature特征（10个单词）：")
            
            original_space_centroids = svd.inverse_transform(cluster_al.cluster_centers_)
            
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        
            terms = list(vectorizer.named_steps.items())[0][1].get_feature_names()
           
            for i in range(target_cluster_k):
                print("类别%d:" % i,)
                for ind in order_centroids[i, :10]:
                    print(' %s' % terms[ind],)
                print()
    print()
    print()
print("==================算法完成======================")

采用'hashing&tf-idf'的方式将文本数据转换为特征矩阵
转换消耗时间:0.735s
样本数量:3387,特征属性数量:1048576
SVD分解及归一化消耗时间:4.638s
降维&归一化操作后，样本数量:3387,特征属性数量:5

使用算法Mini-Batch-KMeans对数据进行建模操作
模型构建消耗时间:0.028s
Mini-Batch-KMeans算法效果评估相关系数
均一性/同质性: 0.572
完整性: 0.598
V-measure: 0.585
Adjusted Rand-Index(ARI): 0.597
轮廓系数: 0.399
聚类中心点为: [[ 0.76695718 -0.458312   -0.04362567  0.06973768  0.0616817 ]
 [ 0.73214048 -0.31031962  0.00814615 -0.40860922  0.36895898]
 [ 0.85664217  0.22066209 -0.02394716 -0.20907965 -0.24448361]
 [ 0.6017225   0.33583113  0.15689202  0.26267315  0.20718269]]

使用算法KMeans对数据进行建模操作
模型构建消耗时间:0.035s
KMeans算法效果评估相关系数
均一性/同质性: 0.560
完整性: 0.588
V-measure: 0.574
Adjusted Rand-Index(ARI): 0.571
轮廓系数: 0.393
聚类中心点为: [[ 0.86348422  0.2074284  -0.02752847 -0.20232829 -0.22822292]
 [ 0.73371358 -0.51746847 -0.05401306  0.11662839  0.04835384]
 [ 0.75139166 -0.30350512  0.00840286 -0.38096948  0.35089777]
 [ 0.59149652  0.33016349  0.16872237  0.27187811  0.2143385 ]]


采用'hasing'的方式将文本数据转换为特征矩阵
转换消耗时间:0.651s
样本数量:3387