In [6]:
# 导入所用的库
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans
from time import time
import numpy as np

In [7]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

print("Loading 20 newsgroups dataset for categories:")
print(categories)

dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
3387 documents
4 categories


In [8]:
labels = dataset.target
true_k = np.unique(labels).shape[0]
print(labels)
print(true_k)

[0 1 1 ... 2 1 1]
4


In [18]:
print(type(dataset.data))
print(dataset.data[0])

<class 'list'>
From: healta@saturn.wwc.edu (Tammy R Healy)
Subject: Re: who are we to judge, Bobby?
Lines: 38
Organization: Walla Walla College
Lines: 38

In article <1993Apr14.213356.22176@ultb.isc.rit.edu> snm6394@ultb.isc.rit.edu (S.N. Mozumder ) writes:
>From: snm6394@ultb.isc.rit.edu (S.N. Mozumder )
>Subject: Re: who are we to judge, Bobby?
>Date: Wed, 14 Apr 1993 21:33:56 GMT
>In article <healta.56.734556346@saturn.wwc.edu> healta@saturn.wwc.edu (TAMMY R HEALY) writes:
>>Bobby,
>>
>>I would like to take the liberty to quote from a Christian writer named 
>>Ellen G. White.  I hope that what she said will help you to edit your 
>>remarks in this group in the future.
>>
>>"Do not set yourself as a standard.  Do not make your opinions, your views 
>>of duty, your interpretations of scripture, a criterion for others and in 
>>your heart condemn them if they do not come up to your ideal."
>>                         Thoughts Fromthe Mount of Blessing p. 124
>>
>>I hope quoting this doe

In [10]:
print("Extracting features ......")
t0 = time()

Extracting features ......


In [11]:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                                 min_df=2, stop_words='english')
X = vectorizer.fit_transform(dataset.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

done in 1.708865s
n_samples: 3387, n_features: 10000


In [12]:
print(X)

  (0, 2825)	0.04520567479618447
  (0, 1426)	0.07575320795576096
  (0, 8349)	0.04998796203184946
  (0, 1109)	0.08455246817382248
  (0, 4733)	0.08455246817382248
  (0, 2658)	0.08226616850753836
  (0, 2243)	0.08031575260409271
  (0, 6957)	0.05477744646671208
  (0, 9681)	0.06044054061483994
  (0, 3507)	0.08953739821545949
  (0, 2320)	0.05831236733334342
  (0, 9184)	0.04404807348832485
  (0, 3372)	0.054255653220676714
  (0, 2984)	0.02845277773309223
  (0, 6178)	0.06099495510090287
  (0, 2190)	0.09080540698179555
  (0, 8433)	0.07079512761524998
  (0, 6971)	0.06071486007349951
  (0, 6179)	0.06836951754105432
  (0, 2815)	0.08953739821545949
  (0, 7705)	0.080934655933792
  (0, 1084)	0.05326971909969163
  (0, 1769)	0.0587833553635613
  (0, 9874)	0.033418771247006686
  (0, 7915)	0.048748109937171095
  :	:
  (3386, 6719)	0.04636574358198852
  (3386, 207)	0.073055490407179
  (3386, 8926)	0.058413220065501995
  (3386, 6135)	0.05509573409226583
  (3386, 5416)	0.12261715332194234
  (3386, 4674)	0.1508

In [13]:
print("Performing dimensionality reduction using SVD......")
t0 = time()
svd = TruncatedSVD(n_components = 1200)
X = svd.fit_transform(X)
print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

Performing dimensionality reduction using LSA......
done in 14.098891s
Explained variance of the SVD step: 81%


In [14]:
print(X.shape)

(3387, 1200)


In [15]:
print("Do clustering......")
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))

Do clustering......
Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=4, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
done in 1.270s


In [16]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.455
Completeness: 0.539
V-measure: 0.493
Adjusted Rand-Index: 0.394
Silhouette Coefficient: 0.009


In [17]:
print("Top terms per cluster:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: com graphics university posting host nntp image thanks computer know
Cluster 1: henry access toronto digex pat zoo spencer net zoology prb
Cluster 2: god com people sandvik keith jesus don article say morality
Cluster 3: space nasa gov alaska shuttle moon launch jpl just station
