In [1]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
def jieba_tokenize(text):
    return jieba.lcut(text)

In [3]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, lowercase=False)
"""
tokenizer:指定分词函数
lowercase:在分词之前将所有的文本转换成小写，由于是中文处理，所以不需要这一步，
设置该参数为False
"""
text_list = ["我喜欢自然语言处理", "明天去跑步", "从事机器学习与自然语言处理工作",
            "我得到了工作", "跑步与打篮球"]
# 需要进行聚类的语料库
tfidf_matrix = tfidf_vectorizer.fit_transform(text_list)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\MSWQPC\AppData\Local\Temp\jieba.cache
Loading model cost 1.872 seconds.
Prefix dict has been built successfully.


In [5]:
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.5819515 ,
        0.4695148 , 0.        , 0.        , 0.        , 0.4695148 ,
        0.        , 0.        , 0.        , 0.4695148 , 0.        ],
       [0.        , 0.        , 0.        , 0.61418897, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.61418897, 0.        , 0.        , 0.49552379],
       [0.34082108, 0.        , 0.42243894, 0.        , 0.        ,
        0.34082108, 0.42243894, 0.34082108, 0.        , 0.        ,
        0.        , 0.        , 0.42243894, 0.34082108, 0.        ],
       [0.        , 0.55032913, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.44400208, 0.55032913, 0.44400208,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.53177225, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.659118  , 0.        , 0.        , 

In [7]:
num_clusters = 2
km_cluster = KMeans(n_clusters=num_clusters, max_iter=300,
                   init="k-means++", n_jobs=-1)
###参数说明###
"""
n_clusters：指定需要的簇数
max_iter：对于单次初始值计算的最大迭代次数
init：指定初始值选择的算法，默认为“k-means++”
n_jobs：进程个数，-1表示使用所有的cpu资源
"""
# 返回每个文本所被分配到的簇的索引
result = km_cluster.fit_predict(tfidf_matrix)
print(result)

[0 1 0 0 1]


### 保存模型

In [10]:
from sklearn.externals import joblib
joblib.dump(tfidf_vectorizer, "tfidf_fit_result.pkl")
joblib.dump(km_cluster, "km_cluster_fit_result.pkl")

['km_cluster_fit_result.pkl']

In [11]:
# 载入模型
tfidf_vectorizer = joblib.load("tfidf_fit_result.pkl")
km_cluster = joblib.load("km_cluster_fit_result.pkl")

In [12]:
km_cluster

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)