# **基于博客标题文本信息的K-means 聚类**

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import jieba
import jieba.analyse
import re
import emoji

In [2]:
# jieba word segmentation, accurate mode
def get_jiebaword(path='happy.txt'):
    """Read titles from *path* and segment each line with jieba.

    Every line is first reduced to the characters in U+4E00-U+9FA5, so
    punctuation, whitespace (including the trailing newline), digits,
    Latin text and emoji are all discarded before segmentation.

    Args:
        path: Text file with one title per line. It is decoded as UTF-8
            with undecodable bytes ignored; adjust the encoding if the
            file was saved differently (e.g. GBK).

    Returns:
        A list with one string per input line, holding that line's tokens
        joined by '/'. An empty list is returned when *path* is missing.
    """
    try:
        with open(path, "r", encoding='utf-8', errors='ignore') as fr:
            lines = fr.readlines()
    except FileNotFoundError:
        print("no file like this")
        # Bail out here: falling through would hit an unbound `lines`.
        return []
    # Compiled once; matches every character that is NOT a Chinese ideograph.
    non_chinese = re.compile(u"([^\u4e00-\u9fa5])")
    jiebaword = []
    for line in lines:
        # After this substitution no newline or whitespace can remain, so no
        # additional stripping is required.
        line = non_chinese.sub('', line)
        # cut_all=False selects jieba's accurate mode.
        seg_list = jieba.cut(line, cut_all=False)
        jiebaword.append("/".join(seg_list))
    return jiebaword


In [3]:
# Segment every title; `a` holds one '/'-joined token string per input line.
a=get_jiebaword()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CYBERO~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.493 seconds.
Prefix dict has been built successfully.


In [7]:
# Load the stop-word list
def get_stopword(path='C:/Users/Cyberolic/Desktop/stopwords.txt'):
    """Return the stop words stored one per line in *path*.

    Args:
        path: UTF-8 text file containing one stop word per line. The
            default is the location the notebook was originally run with.

    Returns:
        A list of the file's lines with their newline removed (blank lines
        yield empty strings). An empty list is returned when *path* is
        missing.
    """
    try:
        with open(path, "r", encoding='utf-8') as fr:
            return [line.strip('\n') for line in fr]
    except FileNotFoundError:
        print("no file like this")
        # Return an empty list instead of crashing on an unbound variable.
        return []


In [8]:
# Optional step: extend the stop-word list with extra symbols and characters.
b = get_stopword()
b.extend(['｜', '～', '『', '”', '』', '“', '❀', '℃', '❥', '°', '一', 'の'])

In [9]:
# Remove stop words
def clean_stopword(jiebaword, stopword, path='CleanWords.txt'):
    """Write the segmented titles to *path* with stop words removed.

    Each input string produces exactly one output line, so line numbers in
    the output stay aligned with positions in *jiebaword*. Every kept word
    is followed by a tab and every line ends with a newline.

    Args:
        jiebaword: Iterable of strings whose tokens are separated by '/'.
        stopword: Iterable of words to drop.
        path: Output file, written as UTF-8. It is overwritten on each
            call; appending would duplicate documents on a re-run and
            break the line-number alignment described above.
    """
    # A set gives constant-time membership tests inside the loop.
    stop_set = set(stopword)
    with open(path, 'w', encoding='utf-8') as fw:
        for words in jiebaword:
            kept = [word for word in words.split('/') if word not in stop_set]
            fw.write(''.join(word + '\t' for word in kept) + '\n')


In [13]:
# Drop the stop words in `b` from the segmented titles in `a`; the result is
# written to CleanWords.txt.
clean_stopword(a,b)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [14]:
# Build the TF-IDF document matrix
def get_tfidf(path='CleanWords.txt'):
    """Vectorize the lines of *path* with TF-IDF.

    Each line of the file is treated as one document and fed to a
    ``TfidfVectorizer`` with default settings.

    Args:
        path: UTF-8 text file with one document per line.

    Returns:
        A dense 2-D NumPy array converted from the vectorizer's sparse
        output. An empty array of shape (0, 0) is returned when *path* is
        missing.
    """
    try:
        with open(path, "r", encoding='utf-8') as fr:
            lines = fr.readlines()
    except FileNotFoundError:
        print("no file like this")
        # Nothing to vectorize; avoid falling through to an unbound `lines`.
        return np.empty((0, 0))
    transformer = TfidfVectorizer()
    tfidf = transformer.fit_transform(lines)
    # Convert the sparse matrix to a dense array.
    return tfidf.toarray()


In [15]:
# Build the TF-IDF matrix from CleanWords.txt and show it with its shape.
tfidf_arr=get_tfidf()
print(tfidf_arr)
print(tfidf_arr.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(1568, 3799)


In [43]:
from nltk.cluster import KMeansClusterer, cosine_distance

In [44]:
# K-means clustering
def get_cluster(tfidf_arr, k):
    """Cluster the TF-IDF rows into *k* groups and save the assignment.

    The result is written to ``ClusterText.txt`` as one
    ``<row index>\\t<cluster id>`` line per row of *tfidf_arr*. The file is
    overwritten on every call so that assignments from an earlier run are
    never mixed with the current ones.

    Args:
        tfidf_arr: Iterable of feature vectors (the rows of the TF-IDF
            matrix).
        k: Number of clusters.
    """
    # Split into k clusters, using cosine distance between vectors.
    kmeans = KMeansClusterer(num_means=k, distance=cosine_distance)
    kmeans.cluster(tfidf_arr)
    # Look up the cluster id of every row.
    labels = [kmeans.classify(vector) for vector in tfidf_arr]
    with open('ClusterText.txt', 'w', encoding='utf-8') as fw:
        for index, label in enumerate(labels):
            fw.write(str(index) + '\t' + str(label) + '\n')


In [103]:
# Cluster the TF-IDF rows into 4 groups; assignments go to ClusterText.txt.
get_cluster(tfidf_arr,4)

In [68]:
# Split the cleaned documents into one file per cluster
def cluster_text(cluster_path='ClusterText.txt', words_path='CleanWords.txt',
                 out_prefix='cluster'):
    """Write each document to the file of the cluster it was assigned to.

    Args:
        cluster_path: UTF-8 file of ``<line index>\\t<cluster id>`` rows.
        words_path: UTF-8 file with one document per line; the line number
            (starting at 0) is matched against the index column of
            *cluster_path*.
        out_prefix: Output files are named ``out_prefix + cluster id +
            '.txt'``. Each file written is overwritten, not appended to.

    Documents without an assignment are skipped, as are assignment rows
    with fewer than two tab-separated fields. If an index occurs more than
    once, the last row wins. When either input file is missing a message
    is printed and nothing is written.
    """
    try:
        with open(cluster_path, "r", encoding='utf-8') as fr:
            assignment_rows = fr.readlines()
        with open(words_path, "r", encoding='utf-8') as fr:
            documents = fr.readlines()
    except FileNotFoundError:
        print("no file like this")
        return

    # Map line index (kept as text) -> cluster id for constant-time lookup,
    # instead of rescanning a fixed number of assignment rows per document.
    cluster_of = {}
    for row in assignment_rows:
        fields = row.strip('\n').split('\t')
        if len(fields) >= 2:
            cluster_of[fields[0]] = fields[1]

    # Collect the documents of each cluster in their original order.
    grouped = {}
    for index, line in enumerate(documents):
        label = cluster_of.get(str(index))
        if label is not None:
            grouped.setdefault(label, []).append(line)

    # One open/close per cluster file.
    for label, members in grouped.items():
        with open(out_prefix + label + '.txt', 'w', encoding='utf-8') as fw:
            fw.writelines(members)


In [104]:
# Distribute the lines of CleanWords.txt into cluster<id>.txt files according
# to ClusterText.txt.
cluster_text()

In [72]:
from collections import Counter

In [73]:
# Report the topic word(s) of each cluster
def get_title(cluster, top_n=1, prefix='cluster'):
    """Print the most frequent words of every cluster file.

    For each i in ``range(cluster)`` the file ``prefix + str(i) + '.txt'``
    is read, its tab-separated words are counted, and the *top_n* most
    common ones are printed under a "主题<i+1>" heading. Words of length
    one or less are not counted.

    Args:
        cluster: Number of cluster files to inspect, numbered from 0.
        top_n: How many of the highest-frequency words to print per
            cluster.
        prefix: Common prefix of the cluster file names.

    A missing cluster file is reported and skipped; it never reuses the
    lines read for a previous cluster.
    """
    for i in range(cluster):
        try:
            with open(prefix + str(i) + '.txt', "r", encoding='utf-8') as fr:
                lines = fr.readlines()
        except FileNotFoundError:
            print("no file like this")
            continue
        c = Counter(
            word
            for line in lines
            for word in line.strip('\n').split('\t')
            if len(word) > 1 and word != '\r\n'
        )

        print('主题' + str(i+1) + '\n词频统计结果：')
        # Print the top_n highest-frequency words with their counts.
        for (k, v) in c.most_common(top_n):
            print(k, ':', v, '\n')


In [105]:
# Print the most frequent word of each of the 4 clusters.
get_title(4)

主题1
词频统计结果：
攻略 : 82 

主题2
词频统计结果：
三亚 : 129 

主题3
词频统计结果：
自驾 : 113 

主题4
词频统计结果：
周末 : 20 

