In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import jieba
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans

In [2]:
# Load the corpus, one text per line, with surrounding whitespace removed.
# Context managers make sure both file handles are closed after reading.
with open("./index1.txt", encoding="utf-8") as corpus_file:
    contents = [each.strip() for each in corpus_file]

# Load the stop-word list (one entry per line) into a set for O(1) membership tests.
with open("./stopwords.txt", encoding="utf-8") as stopword_file:
    stopwords = {each.strip() for each in stopword_file}

# A lone space can never come from the stripped file lines, so it is added by hand
# (presumably to filter space tokens produced by the segmenter - confirm).
stopwords.add(" ")

In [3]:
def tokenize(text_list, stop):
    """Segment each text with jieba and filter the resulting tokens.

    A token is discarded when it is contained in ``stop`` or when it starts
    with one or more digits (``re.match`` anchors at the beginning of the
    token). Texts that end up with no tokens at all are dropped entirely, so
    the two returned lists always have the same length and order.

    Args:
        text_list: iterable of raw strings to segment.
        stop: container of tokens to discard (membership is tested with ``in``).

    Returns:
        tuple: ``(texts, tokenized_texts)`` where ``texts`` holds the retained
        original strings and ``tokenized_texts`` the matching token lists.
    """
    kept_texts, kept_tokens = [], []
    for raw in text_list:
        words = []
        for word in jieba.cut(raw):
            # Skip stop words first; the regex is only evaluated for the rest.
            if word in stop or re.match(r'\d+', word):
                continue
            words.append(word)
        if not words:
            # Nothing survived filtering: leave this text out of both lists.
            continue
        kept_texts.append(raw)
        kept_tokens.append(words)
    return kept_texts, kept_tokens

# Segment every input line; lines with no surviving tokens are dropped by
# tokenize(), so `texts` stays aligned index-for-index with `tokenized_texts`.
texts, tokenized_texts = tokenize(contents, stopwords)

# The vectorizer takes strings, so re-join each token list with single spaces.
inputs = [" ".join(each) for each in tokenized_texts]
# Vocabulary is capped at 1000 terms via max_features.
# NOTE(review): with the default token_pattern, scikit-learn re-tokenizes the
# joined string and keeps only tokens of 2+ word characters, so single-character
# words are discarded - confirm this is intended.
vectorizer = TfidfVectorizer(max_features=1000)
# Fit the vocabulary/IDF weights and build the document-term matrix in one pass.
text_vec = vectorizer.fit_transform(inputs)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\050443\AppData\Local\Temp\jieba.cache
Loading model cost 0.486 seconds.
Prefix dict has been built successfully.


In [9]:
def get_best_n_topics(n_candidates, data, random_state=10):
    """Fit one KMeans model per candidate cluster count and collect the SSE.

    Despite the name, no "best" value is selected here: the function returns
    the raw inertia curve so the caller can inspect it (e.g. an elbow plot).

    Args:
        n_candidates: iterable of cluster counts to try, in the order in which
            the results should be returned.
        data: feature matrix passed to ``KMeans.fit`` as ``X``.
        random_state: seed forwarded to every KMeans instance so the curve is
            reproducible between runs. The default matches the seed used by
            ``fit_kmeans``; pass ``None`` for unseeded initialisation.

    Returns:
        list: ``inertia_`` of each fitted model, aligned with ``n_candidates``.
    """
    sse = []
    for n in n_candidates:
        print(f"Kmeans with {n} centers")
        # Seeded so that repeated runs give the same inertia for the same n.
        kmeans = KMeans(n_clusters=n, n_init=5, random_state=random_state)
        kmeans.fit(X=data)
        sse.append(kmeans.inertia_)
    return sse

def fit_kmeans(n, data):
    """Fit a seeded KMeans model and return it with its transform output.

    Args:
        n: number of clusters.
        data: feature matrix passed to ``KMeans.fit_transform`` as ``X``.

    Returns:
        tuple: ``(model, transformed)`` - the fitted estimator and whatever
        ``fit_transform`` returned for ``data``.
    """
    # Fixed seed and restart count keep the clustering reproducible.
    settings = {"n_clusters": n, "n_init": 5, "random_state": 10}
    model = KMeans(**settings)
    transformed = model.fit_transform(X=data)
    return model, transformed

In [5]:
# n = 7
# sse = get_best_n_topics(list(range(1, 11)), text_vec)
# plt.plot(sse)
# plt.show()

In [10]:
# Fit the final model with 7 clusters; `pred` is the fit_transform output for text_vec.
# NOTE(review): 7 is presumably the value chosen from the SSE curve - confirm.
kmeans, pred = fit_kmeans(7, text_vec)

In [11]:
# Per row, take the column index holding the smallest value of `pred`
# (expected to be the KMeans.fit_transform output, i.e. per-cluster distances).
pred_cls = pred.argmin(axis=-1)

In [12]:
def get_key_words(text_vec, vectorizer):
    """Return the highest-weighted vocabulary term of every row.

    Args:
        text_vec: 2-D document-term matrix (scipy sparse matrix or dense
            array) whose columns are indexed by ``vectorizer.vocabulary_``.
        vectorizer: fitted vectorizer exposing ``vocabulary_``, a mapping of
            term -> column index.

    Returns:
        list[str]: one keyword per row, in row order. Rows without any
        positive weight (no term of the text is in the vocabulary) yield
        ``""`` instead of the term that happens to sit in column 0.
    """
    # vocabulary_ maps term -> column index; invert it for lookup by column.
    idx_to_word = {idx: word for word, idx in vectorizer.vocabulary_.items()}

    if np.shape(text_vec)[0] == 0:
        return []

    # Plain sequences have no .max(); arrays and sparse matrices are used as-is.
    matrix = text_vec if hasattr(text_vec, "max") else np.asarray(text_vec)

    # reshape(-1) instead of squeeze(): squeeze would collapse a single-row
    # input to a 0-d array, which cannot be iterated.
    key_index = np.asarray(np.argmax(matrix, axis=1)).reshape(-1)

    # Row maxima tell us whether argmax found a real term or just defaulted to
    # column 0 on an all-zero row. Sparse inputs return a sparse column here.
    row_max = matrix.max(axis=1)
    if hasattr(row_max, "toarray"):
        row_max = row_max.toarray()
    row_max = np.asarray(row_max).reshape(-1)

    return [
        idx_to_word[col] if weight > 0 else ""
        for col, weight in zip(key_index, row_max)
    ]
# One keyword per row of text_vec, in the same order as `texts`.
key_words = get_key_words(text_vec, vectorizer)

In [13]:
# One row per retained text: the text ("标题"), its keyword from get_key_words
# ("关键词") and its cluster id ("类别").
res = pd.DataFrame({"标题": texts, "关键词":key_words, "类别": pred_cls})
# index=False keeps the positional index out of the exported file.
res.to_csv("分类结果.csv", index=False)

In [14]:
# Show the result table as the cell output (long frames are truncated in the repr).
res

Unnamed: 0,标题,关键词,类别
0,姆巴佩爆抽2脚连刷俩球 世界杯进球超C罗追平梅西,姆巴,0
1,打破魔咒！梅西打入世界杯淘汰赛首球 总进球超C罗,首球,0
2,不敢相信!日本4个点球罚丢仨 南野三笘薰全软脚,点球,0
3,加时专业户!克罗地亚近8场大赛7场加时 还总能赢球,加时,0
4,天花板!日本队连续2届无缘8强 4次冲击全部败北,日本队,0
...,...,...,...
21252,日媒：森保一将连任日本男足主教练 带队征战下届世界杯,带队,0
21253,4年后美加墨世界杯扩军 中国男足看到曙光？,中国男足,0
21254,回眸2022之乒乓：老将弥坚新人蜕变 国乒诠释长盛不衰,老将,0
21255,CBA：2分险胜四川 北京首钢主帅解立彬用人调整获赞,调整,0
