## 先进行中文分词

In [5]:
import jieba
import os
from utils import files_processing

from warnings import filterwarnings
filterwarnings('ignore')

In [15]:
with open('百度停用词表.txt', 'r') as f:
    stopwords = f.read().split('\n')
print(stopwords[:5])

['--', '?', '“', '”', '》']


In [20]:
# 源文件所在目录
source_folder = './source'
segment_folder = './segment'

# 字词分割，对整个文件内容进行字词分割
def segment_lines(file_list,segment_out_dir,stopwords=[]):
    for i,file in enumerate(file_list):
        segment_out_name=os.path.join(segment_out_dir,'segment_{}.txt'.format(i))
        with open(file, 'rb') as f:
            document = f.read()
            document_cut = jieba.cut(document)
            sentence_segment=[]
            for word in document_cut:
                if word not in stopwords:
                    sentence_segment.append(word)
            result = ' '.join(sentence_segment)
            result = result.encode('utf-8')
            with open(segment_out_name, 'wb') as f2:
                f2.write(result)
                print(f"分词完成，保存为文件{'segment_{}.txt'.format(i)}.")

# 对source中的txt文件进行分词，输出到segment目录中
file_list=files_processing.get_files_list(source_folder, postfix='*.txt')

# 不使用停用词
segment_lines(file_list, segment_folder)

# 使用停用词（使用停用词后结果好像更差）
# segment_lines(file_list, segment_folder, stopwords=stopwords)

分词完成，保存为文件segment_0.txt.


## Word Embedding与相似度计算

### 模型训练

In [28]:
from gensim.models import word2vec
import multiprocessing

# 如果目录中有多个文件，可以使用PathLineSentences
segment_folder = './segment'
sentences = word2vec.PathLineSentences(segment_folder)

# 设置模型参数，进行训练
model = word2vec.Word2Vec(sentences, size=100, window=3, min_count=1)

### 得到与曹操最相近的10个词以及对应相似度

In [35]:
print([t[0] for t in model.wv.most_similar(positive='曹操', topn=10)])

['夫人', '关公', '孔明', '众将', '先主', '此事', '孙权', '后主', '韩福', '糜夫人']


### 曹操+刘备-张飞=?

In [36]:
model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞'], topn=1)

[('某', 0.9953306317329407)]

从上述结果来看，模型效果不太好。

### 参数调整

#### 参数size：词向量的维度

In [43]:
for s in range(25, 201, 25):
    print(f"size = {s}:")
    model = word2vec.Word2Vec(sentences, size=s, window=3, min_count=1, seed=0)
    print([t[0] for t in model.wv.most_similar(positive='曹操', topn=10)])
    print(model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞'], topn=3))
    print()

size = 25:
['孔明', '关公', '门吏', '先主', '大哭', '孙权', '又', '瑁', '众将', '司马懿']
[('丞相', 0.9954825043678284), ('朕', 0.9948809742927551), ('某', 0.9944384098052979)]

size = 50:
['先主', '关公', '进', '孔明', '孙权', '已', '司马懿', '老人', '吴押狱', '门吏']
[('谏', 0.9943273663520813), ('朕', 0.9935682415962219), ('大叫', 0.992993950843811)]

size = 75:
['关公', '孔明', '先主', '夫人', '孙权', '超', '众将', '泣', '司马懿', '孔明问']
[('主母', 0.9921486377716064), ('某', 0.9909951090812683), ('土人', 0.9903910160064697)]

size = 100:
['众将', '关公', '孔明', '司马懿', '惇', '请', '渊', '夫人', '大惊', '庞统']
[('臣', 0.9968320727348328), ('主公', 0.9955992698669434), ('丞相', 0.994544267654419)]

size = 125:
['关公', '孔明', '先主', '孙权', '禁', '回报', '逊谢', '实情', '生平', '云长']
[('泣', 0.9940541982650757), ('今', 0.9935337901115417), ('臣', 0.9933633804321289)]

size = 150:
['孙权', '后患', '关公', '辞回', '周瑜', '下令', '回报', '已', '夫人', '又']
[('杨松', 0.996979296207428), ('诸葛丞相', 0.9958659410476685), ('欲行', 0.994485080242157)]

size = 175:
['孔明', '孙权', '大惊', '老人', '他', '二人', '重用', '枭雄', '关公', 

#### window：Maximum distance between the current and predicted word within a sentence.

In [44]:
for w in range(1, 7):
    print(f"window = {w}:")
    model = word2vec.Word2Vec(sentences, size=125, window=w, min_count=1, seed=0)
    print([t[0] for t in model.wv.most_similar(positive='曹操', topn=10)])
    print(model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞'], topn=3))
    print()

window = 1:
['孔明', '张飞', '周瑜', '赵云', '孙权', '姜维', '吕布', '关公', '马超', '袁绍']
[('袁绍', 0.9919048547744751), ('周瑜', 0.9916937351226807), ('将军', 0.9872189164161682)]

window = 2:
['司马懿', '孔明', '周瑜', '孙权', '张飞', '关公', '夫人', '他', '众官', '大惊']
[('臣', 0.9932695627212524), ('今日', 0.9928516149520874), ('扬鞭', 0.9925713539123535)]

window = 3:
['众官', '孔明', '以破', '大惊', '张飞', '孙权', '超大', '相探', '王植', '二人']
[('先生', 0.9947894811630249), ('丞相', 0.994509220123291), ('之言', 0.9938519597053528)]

window = 4:
['孔明', '岱', '重用', '众将', '荆州', '邀', '关公', '先主', '孙权', '擅离']
[('祎', 0.9960540533065796), ('朕', 0.99573814868927), ('攸', 0.9955790042877197)]

window = 5:
['孔明问', '已', '那里', '孔明', '就此', '关公', '众', '下手', '众将', '策怒']
[('此', 0.997637927532196), ('大笑', 0.9973338842391968), ('如此', 0.996761679649353)]

window = 6:
['那里', '已', '誓同生死', '关公', '玄德再', '细奏', '超', '传令', '计策', '起身']
[('问', 0.9963052272796631), ('公言', 0.995137631893158), ('仰天', 0.9950897097587585)]



#### min_count：Ignores all words with total frequency lower than this.

In [46]:
for mc in range(1, 7):
    print(f"min_count = {mc}:")
    model = word2vec.Word2Vec(sentences, size=125, window=1, min_count=mc, seed=0)
    print([t[0] for t in model.wv.most_similar(positive='曹操', topn=10)])
    print(model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞'], topn=3))
    print()

min_count = 1:
['孔明', '张飞', '赵云', '孙权', '周瑜', '姜维', '关公', '吕布', '孟获', '马超']
[('袁绍', 0.992116391658783), ('周瑜', 0.9916278123855591), ('陈宫', 0.9881535172462463)]

min_count = 2:
['孔明', '张飞', '赵云', '玄德', '姜维', '吕布', '周瑜', '魏延', '关公', '孙权']
[('将军', 0.9678764343261719), ('袁绍', 0.9662936329841614), ('臣', 0.9654808640480042)]

min_count = 3:
['孔明', '张飞', '赵云', '马超', '孙权', '孟获', '关公', '周瑜', '吕布', '姜维']
[('此间', 0.9758447408676147), ('此', 0.9707177877426147), ('丞相', 0.9685412645339966)]

min_count = 4:
['孔明', '张飞', '赵云', '吕布', '周瑜', '关公', '孟获', '姜维', '马超', '魏延']
[('汝二人', 0.9631171822547913), ('司马懿', 0.9602318406105042), ('某愿', 0.955371618270874)]

min_count = 5:
['周瑜', '张飞', '吕布', '孔明', '赵云', '孙权', '马超', '关公', '孟获', '袁绍']
[('卿', 0.9624482989311218), ('此', 0.9620213508605957), ('但', 0.9618009328842163)]

min_count = 6:
['吕布', '周瑜', '赵云', '马超', '孙权', '张飞', '孔明', '袁绍', '孟获', '司马懿']
[('卿', 0.9499633312225342), ('起谢', 0.9448398351669312), ('此', 0.9405841827392578)]



#### 最后选择参数组为：size=125, window=1, min_count=2

In [47]:
model = word2vec.Word2Vec(sentences, size=125, window=1, min_count=2, seed=0)
print([t[0] for t in model.wv.most_similar(positive='曹操', topn=10)])
print(model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞'], topn=3))

['孔明', '张飞', '吕布', '玄德', '姜维', '赵云', '周瑜', '魏延', '关公', '孙权']
[('将军', 0.9688045978546143), ('陈宫', 0.9681386351585388), ('袁绍', 0.9658312797546387)]
