In [2]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import normalized_mutual_info_score

## 读取tweets数据集

In [3]:
tweets_text = []    # body text of every tweet, in file order
clusterId = []      # cluster each tweet actually belongs to, aligned with tweets_text
# The file is read as one JSON object per line; each object must provide
# the keys 'text' and 'cluster'.
with open('Homework5Tweets.txt', encoding='utf-8') as tweets_file:
    for raw_line in tweets_file:
        # Skip blank lines (e.g. an empty line at the end of the file)
        # instead of letting json.loads raise on them.
        if not raw_line.strip():
            continue
        tweet = json.loads(raw_line)
        tweets_text.append(tweet['text'])
        clusterId.append(tweet['cluster'])

## 建立基于Tfidf的向量空间模型

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF vector space model over the tweet texts.
vectorizer = TfidfVectorizer()
# Tokenise, learn the vocabulary and encode the tweets in one pass; this gives
# the same matrix as fit() followed by transform() on the same corpus without
# analysing every tweet twice.
vector = vectorizer.fit_transform(tweets_text)
# Dense version of the sparse TF-IDF matrix, used as input by the clustering cells.
vs_array = vector.toarray()
# Show the encoded vectors
print(vector.shape)
print(type(vector))
print(vs_array)

(2472, 5097)
<class 'scipy.sparse.csr.csr_matrix'>
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## 聚类算法测试
![title](clustering_method.png)

In [6]:
import numpy as np
# vs_array is already a dense ndarray (the result of .toarray()), so np.asarray
# hands it through unchanged instead of duplicating the whole feature matrix
# in memory the way np.array() would.
X = np.asarray(vs_array)
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## K-means

In [7]:
from sklearn.cluster import KMeans
# K-means with 110 clusters; the fixed seed keeps the initialisation repeatable.
kmeans = KMeans(n_clusters=110, random_state=0)
kmeans.fit(X)
# Predicted cluster index per tweet, and how many tweets were labelled.
print(kmeans.labels_)
print(len(kmeans.labels_))

[105  20  98 ...  52  38  11]
2472


### k-means结果评价

In [92]:
# Normalised mutual information between the true cluster ids and the K-means labels.
nmi_kmeans = normalized_mutual_info_score(labels_true=clusterId,
                                          labels_pred=kmeans.labels_)
print('nmi_kmeans = ', nmi_kmeans)

nmi_kmeans =  0.7980301635709245


## AffinityPropagation

In [93]:
from sklearn.cluster import AffinityPropagation
# Affinity propagation with the library defaults; no cluster count is passed in.
clustering_ap = AffinityPropagation()
clustering_ap.fit(X)
print(clustering_ap)
print(clustering_ap.labels_)

AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
          damping=0.5, max_iter=200, preference=None, verbose=False)
[205  91 216 ... 224 145 115]


###  AffinityPropagation结果评价

In [94]:
# Normalised mutual information between the true cluster ids and the affinity-propagation labels.
nmi_ap = normalized_mutual_info_score(labels_true=clusterId,
                                      labels_pred=clustering_ap.labels_)
print('nmi_ap = ', nmi_ap)

nmi_ap =  0.7831643850554082


## Mean-shift

In [87]:
from sklearn.cluster import MeanShift
# Mean-shift with a fixed kernel bandwidth of 2.
# NOTE(review): the recorded run shows only label 0 in the printed preview and an
# NMI of about 0. TfidfVectorizer L2-normalises rows by default, so pairwise
# distances presumably never exceed sqrt(2) < 2 and one window covers all
# tweets -- confirm and consider a smaller bandwidth.
clustering_ms = MeanShift(bandwidth=2)
clustering_ms.fit(X)
print(clustering_ms)
print(clustering_ms.labels_)

MeanShift(bandwidth=2, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)
[0 0 0 ... 0 0 0]


### Mean-shift结果评价

In [95]:
# Normalised mutual information between the true cluster ids and the mean-shift labels.
nmi_ms = normalized_mutual_info_score(labels_true=clusterId,
                                      labels_pred=clustering_ms.labels_)
print('nmi_ms = ', nmi_ms)

nmi_ms =  -1.6132928326584306e-06


## Spectral clustering

In [96]:
from sklearn.cluster import SpectralClustering
# Spectral clustering into 110 clusters, with labels assigned by discretisation
# and a fixed seed.
clustering_sc = SpectralClustering(
    n_clusters=110,
    assign_labels="discretize",
    random_state=0,
)
clustering_sc.fit(X)
print(clustering_sc)
print(clustering_sc.labels_)

SpectralClustering(affinity='rbf', assign_labels='discretize', coef0=1,
          degree=3, eigen_solver=None, eigen_tol=0.0, gamma=1.0,
          kernel_params=None, n_clusters=110, n_init=10, n_jobs=1,
          n_neighbors=10, random_state=0)
[21  9  0 ... 81 53 54]


### spectral clustering结果评价

In [97]:
# Normalised mutual information between the true cluster ids and the spectral-clustering labels.
nmi_sc = normalized_mutual_info_score(labels_true=clusterId,
                                      labels_pred=clustering_sc.labels_)
print('nmi_sc = ', nmi_sc)

nmi_sc =  0.7800448149457381


## ward hierarchical clustering
### 合并于AgglomerativeClustering中，默认linkage='ward'

In [98]:
from sklearn.cluster import AgglomerativeClustering
# Ward-linkage agglomerative clustering into 110 clusters.
# Only the arguments that matter are passed: the euclidean metric,
# compute_full_tree='auto', connectivity=None and memory=None are the library
# defaults, and spelling out `affinity=` fails on scikit-learn >= 1.4, where
# that parameter was replaced by `metric`.
clustering_wh = AgglomerativeClustering(n_clusters=110, linkage='ward').fit(X)
print(clustering_wh)
print(clustering_wh.labels_)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=110,
            pooling_func=<function mean at 0x000001FB5269A9D8>)
[ 30  46   2 ...  62  44 102]


### ward hierarchical clustering结果评价

In [99]:
# Normalised mutual information between the true cluster ids and the Ward-linkage labels.
nmi_wh = normalized_mutual_info_score(labels_true=clusterId,
                                      labels_pred=clustering_wh.labels_)
print('nmi_wh = ', nmi_wh)

nmi_wh =  0.77587403569932


## AgglomerativeClustering

In [100]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering into 110 clusters with complete linkage (agg1) and
# average linkage (agg2).
# The euclidean metric, compute_full_tree='auto', connectivity=None and
# memory=None are the library defaults, so they are not repeated; passing
# `affinity=` explicitly fails on scikit-learn >= 1.4, where that parameter
# was replaced by `metric`.
clustering_agg1 = AgglomerativeClustering(n_clusters=110, linkage='complete').fit(X)
clustering_agg2 = AgglomerativeClustering(n_clusters=110, linkage='average').fit(X)

### AgglomerativeClustering结果评价

In [101]:
# Normalised mutual information for the complete-linkage (agg1) and
# average-linkage (agg2) labelings against the true cluster ids.
nmi_agg_complete = normalized_mutual_info_score(labels_true=clusterId,
                                                labels_pred=clustering_agg1.labels_)
nmi_agg_average = normalized_mutual_info_score(labels_true=clusterId,
                                               labels_pred=clustering_agg2.labels_)
print('nmi_agg_complete = ', nmi_agg_complete)
print('nmi_agg_average = ', nmi_agg_average)

nmi_agg_complete =  0.7595908816134626
nmi_agg_average =  0.9004539868135747


## DBSCAN

In [108]:
from sklearn.cluster import DBSCAN
# DBSCAN with a neighbourhood radius of 0.02 and at least 5 samples per core point.
# NOTE(review): the recorded run prints only the noise label -1 in its preview and
# scores a low NMI; with L2-normalised TF-IDF rows an eps of 0.02 presumably
# links only near-identical tweets -- confirm whether a larger eps was intended.
clustering_DBSCAN = DBSCAN(eps=0.02, min_samples=5)
clustering_DBSCAN.fit(X)
print(clustering_DBSCAN)
print(clustering_DBSCAN.labels_)

DBSCAN(algorithm='auto', eps=0.02, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=1, p=None)
[-1 -1 -1 ... -1 -1 -1]


### DBSCAN结果评价

In [109]:
# Normalised mutual information between the true cluster ids and the DBSCAN labels.
nmi_dbscan = normalized_mutual_info_score(labels_true=clusterId,
                                          labels_pred=clustering_DBSCAN.labels_)
print('nmi_dbscan = ', nmi_dbscan)

nmi_dbscan =  0.0914960470508469


## Gaussian mixture

In [112]:
from sklearn.mixture import GaussianMixture
# Fit a 5-component, full-covariance Gaussian mixture and label every tweet with
# its most likely component. The initialisation is random, so the seed is pinned
# (0, the same value the K-means and spectral cells use) to make the labels and
# the NMI computed from them repeatable across runs.
# NOTE(review): 5 components here versus 110 clusters in the other cells --
# confirm this is intentional.
model = GaussianMixture(n_components=5, covariance_type='full', random_state=0)
model.fit(X)
label_pred = model.predict(X)

print(label_pred)

[0 0 0 ... 0 0 0]


### Gaussian mixture结果评价

In [113]:
# Normalised mutual information between the true cluster ids and the Gaussian-mixture labels.
nmi_gm = normalized_mutual_info_score(labels_true=clusterId,
                                      labels_pred=label_pred)
print('nmi_gm = ', nmi_gm)

nmi_gm =  0.4506933941721244
