# 分词

### 从文件中读取词数据，并将词数据生成集合返回，可用来读取停用词集合等

In [52]:
def make_word_set(words_file_path):
    '''
    Read a word list from a file and return the distinct words as a set.

    The file is decoded as UTF-8 and read line by line, one word per line.
    Surrounding whitespace is stripped from every line and blank lines are
    skipped. Can be used to load a stop-word list, for example.

    :words_file_path path of the file that stores the words
    :return set of unique, non-empty words as text (unicode) strings
    '''
    # io.open decodes while reading and behaves the same on Python 2 and 3,
    # whereas calling .decode() on each line only works on Python 2 byte
    # strings and raises AttributeError on Python 3.
    import io

    words_set = set()
    with io.open(words_file_path, 'r', encoding='utf-8') as fp:
        # Iterate the file lazily instead of loading every line up front.
        for line in fp:
            word = line.strip()
            # A set already ignores duplicates, so only blanks need filtering.
            if word:
                words_set.add(word)
    return words_set

### 切词，并以样本格式保存

In [58]:
import jieba.posseg as posseg
import os

def cut_words(root_path, target_path, stopwords, all_flags):
    '''
    Segment every document under root_path and save the result as samples.

    Every sub-directory of root_path is treated as one class and every file
    inside it as one UTF-8 encoded document of that class. For each document
    one line is written to target_path: the class (directory) name, a tab,
    then each kept word followed by a single space.

    :root_path root directory of the raw text, one sub-directory per class
    :target_path path of the sample file to write (overwritten if it exists)
    :stopwords collection of stop words; words found in it are dropped
    :all_flags part-of-speech flags whose words are kept
    '''
    # io.open gives the same text-mode behaviour on Python 2 and 3, so all
    # processing below happens on decoded text and encoding is done once on
    # write.
    import io

    with io.open(target_path, 'w', encoding='utf-8') as fw:
        # Each entry of the root directory names one class of documents.
        for class_name in os.listdir(root_path):
            class_dir_path = os.path.join(root_path, class_name)
            # Skip stray files in the root; only directories hold documents.
            if not os.path.isdir(class_dir_path):
                continue
            # On Python 2 os.listdir may return byte strings, but the text
            # mode output file only accepts decoded text.
            label = class_name
            if isinstance(label, bytes):
                label = label.decode('utf-8')
            for document_name in os.listdir(class_dir_path):
                document_path = os.path.join(class_dir_path, document_name)
                with io.open(document_path, 'r', encoding='utf-8') as fr:
                    # Newlines and tabs are the record and field separators
                    # of the sample file, so flatten them inside the text.
                    document_text = fr.read().strip().replace('\n', ' ').replace('\t', ' ')
                kept_words = []
                for word, flag in posseg.cut(document_text):
                    word = word.strip()
                    # Use the all_flags argument (not a global) to filter by
                    # part of speech; empty tokens carry no information.
                    if not word or flag not in all_flags:
                        continue
                    # Compare the decoded word so that non-ASCII stop words
                    # actually match a text stop-word set. The encoded form
                    # is also checked so that callers passing UTF-8 byte
                    # strings keep working.
                    if word in stopwords or word.encode('utf-8') in stopwords:
                        continue
                    kept_words.append(word)
                # Keep the established line format: a space after each word.
                content = ''.join(word + ' ' for word in kept_words)
                fw.write(label + '\t' + content + '\n')

In [63]:
%%time
# Root directory of the raw text corpus
root_path = '/home/beanyon/Desktop/preprogress/thucnews/'
# Alternative corpus root, currently disabled
# root_path = '/home/beanyon/Desktop/naive_bayes_classifier/Database/SogouC/Sample.mini'
# Destination path of the generated sample file
target_path = '/home/beanyon/Desktop/preprogress/corpus.txt'
# Path of the stop-word list file
stopwords_path = '/home/beanyon/Desktop/preprogress/stopwords.txt'
# Part-of-speech flags whose words are kept during segmentation
allow_flags = ('v', 'n', 'a', 'ag', 'g', 'vg', 'ng', 'nr', 'ns', 'nt', 'nz', 'i')
# Load the stop-word set
stopwords = make_word_set(stopwords_path)
# Segment the corpus and write the sample file
cut_words(root_path, target_path, stopwords, allow_flags)