# Word2Vec文本处理

In [199]:
from gensim.models import word2vec  #自然语言处理
import logging  #格式美化
import pandas as pd 
import numpy as np
import jieba
import re, string
from zhon.hanzi import punctuation #去除中文中的所有标点符号
# Route INFO-level (and above) log records to the console with a timestamp prefix,
# so gensim's training progress is visible in the cell output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s:%(message)s',
    level=logging.INFO,
)

In [200]:
# Toy corpus: two plain English sentences used to try out Word2Vec
raw_sentences = [
    'the quick brown fox jumps over the lazy dogs',
    'yoyoyo you go home now to sleep',
]

In [201]:
# Tokenise every raw sentence on whitespace -> list of token lists
sentences = list(map(str.split, raw_sentences))
print(sentences)

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dogs'], ['yoyoyo', 'you', 'go', 'home', 'now', 'to', 'sleep']]


In [202]:
# Train a Word2Vec model on the toy corpus.
#   min_count=1: words occurring fewer than `min_count` times are discarded,
#                so a value of 1 keeps every word (the training log reports
#                "retains 15 unique words ... drops 0").
#   size:        dimensionality of the learned word vectors (NOT a number of
#                network layers); defaults to 100. Renamed `vector_size` in
#                gensim >= 4.0.
#   workers:     number of worker threads used for training; only takes
#                effect when gensim's Cython extensions are installed.
model = word2vec.Word2Vec(sentences, min_count=1)

2021-01-25 00:40:30,903 : INFO:collecting all words and their counts
2021-01-25 00:40:30,906 : INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-01-25 00:40:30,908 : INFO:collected 15 word types from a corpus of 16 raw words and 2 sentences
2021-01-25 00:40:30,911 : INFO:Loading a fresh vocabulary
2021-01-25 00:40:30,914 : INFO:effective_min_count=1 retains 15 unique words (100% of original 15, drops 0)
2021-01-25 00:40:30,919 : INFO:effective_min_count=1 leaves 16 word corpus (100% of original 16, drops 0)
2021-01-25 00:40:30,922 : INFO:deleting the raw counts dictionary of 15 items
2021-01-25 00:40:30,926 : INFO:sample=0.001 downsamples 15 most-common words
2021-01-25 00:40:30,928 : INFO:downsampling leaves estimated 2 word corpus (13.7% of prior 16)
2021-01-25 00:40:30,930 : INFO:estimated required memory for 15 words and 100 dimensions: 19500 bytes
2021-01-25 00:40:30,931 : INFO:resetting layer weights
2021-01-25 00:40:30,944 : INFO:training model with 3 w

In [203]:
# Cosine similarity between the vectors of two words.
# Query through `model.wv` (the KeyedVectors object): calling `similarity`
# directly on the Word2Vec model is deprecated in gensim 3.x and removed in 4.0,
# while `model.wv.similarity` works on both.
model.wv.similarity('dogs','you')

0.07596941

维基百科数据下载
https://dumps.wikimedia.org/zhwiki/

In [204]:
# Use opencc-python to convert between Traditional and Simplified Chinese.
# The bare string below lists the available conversion modes; as the only
# expression in this cell it is echoed back as the cell output.
'''
t2s - 繁体转简体
s2t - 简体转繁体
mix2t - 混合体转繁体
mix2s - 混合体转简体
'''
# Example usage (left disabled; opencc is not imported in this notebook):
#import opencc
#cc = opencc.OpenCC('t2s')
#print(cc.convert(u'Open Chinese Convert（OpenCC）「開放中文轉換」'))

'\nt2s - 繁体转简体\ns2t - 简体转繁体\nmix2t - 混合体转繁体\nmix2s - 混合体转简体\n'

In [205]:
# NOTE(review): `df` is not referenced by any later cell visible here; the same
# file is re-read as plain UTF-8 text in the next cell. Parsing a novel's text
# as CSV looks unintended — confirm whether this cell is still needed.
df = pd.read_csv(r'./平凡的世界.txt')

In [206]:
# Load the whole novel into memory as one UTF-8 string.
# The `with` block guarantees the file handle is closed even if read() raises
# (e.g. on a decoding error), which the manual open/read/close sequence did not.
with open('./平凡的世界.txt', encoding='utf8') as f:
    data = f.read()

In [207]:
# Delete every run of the Chinese punctuation marks listed in zhon.hanzi.punctuation
data_clean = re.sub(
    pattern=r'[%s]+' % punctuation,
    repl='',
    string=data,
)

In [208]:
# Drop ASCII spaces, newlines and "---" separators, in that order
data_clean_ = (
    data_clean
    .replace(" ", "")
    .replace("\n", "")
    .replace("---", "")
)


In [210]:
# Segment the cleaned text with jieba (cut_all=False) and glue the
# resulting tokens back together with single spaces.
text = jieba.cut(sentence=data_clean_, cut_all=False)
new_test = str.join(" ", text)

In [218]:
# Persist the space-separated tokens to disk.
# The file must be closed (and therefore flushed) before the next cell reads it
# back through LineSentence; without the `with` block the tail of the text could
# still be sitting in the write buffer and the handle was never released.
with open('./test.txt','w',encoding='utf-8') as testdate:
    chars_written = testdate.write(new_test)
# Show the number of characters written as the cell output
chars_written

1140534

In [219]:
# Wrap the saved token file in a LineSentence iterable so that
# Word2Vec can consume the corpus from disk.
sentences_ = word2vec.LineSentence(source='./test.txt')

In [221]:
# Fit a Word2Vec model on the novel's tokens, keeping every word (min_count=1)
model_ = word2vec.Word2Vec(
    sentences=sentences_,
    min_count=1,
)

2021-01-25 00:53:25,147 : INFO:collecting all words and their counts
2021-01-25 00:53:25,242 : INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-01-25 00:53:25,432 : INFO:collected 35431 word types from a corpus of 428890 raw words and 43 sentences
2021-01-25 00:53:25,433 : INFO:Loading a fresh vocabulary
2021-01-25 00:53:25,536 : INFO:effective_min_count=1 retains 35431 unique words (100% of original 35431, drops 0)
2021-01-25 00:53:25,537 : INFO:effective_min_count=1 leaves 428890 word corpus (100% of original 428890, drops 0)
2021-01-25 00:53:25,734 : INFO:deleting the raw counts dictionary of 35431 items
2021-01-25 00:53:25,736 : INFO:sample=0.001 downsamples 39 most-common words
2021-01-25 00:53:25,739 : INFO:downsampling leaves estimated 352335 word corpus (82.2% of prior 428890)
2021-01-25 00:53:25,890 : INFO:estimated required memory for 35431 words and 100 dimensions: 46060300 bytes
2021-01-25 00:53:25,891 : INFO:resetting layer weights
2021-01-25 00:

In [227]:
# Query the trained model: for each probe word, print its nearest neighbours
# (word, cosine similarity) in the embedding space.
# Lookups go through `model_.wv` (KeyedVectors): `Word2Vec.most_similar` is
# deprecated in gensim 3.x and removed in 4.0, while `model_.wv.most_similar`
# works on both.
testwords = ['美好','平凡','女人']
for words in testwords:
    res = model_.wv.most_similar(words)
    print(words)
    print(res)

美好
[('生命', 0.9991242289543152), ('精神', 0.9986003041267395), ('艰难', 0.9983970522880554), ('现实', 0.998322069644928), ('复杂', 0.9980694055557251), ('情感', 0.9979764819145203), ('欢乐', 0.9977883100509644), ('社会', 0.9974801540374756), ('矛盾', 0.9974163770675659), ('所', 0.9973421096801758)]
平凡
[('所有', 0.9970272779464722), ('经历', 0.9962289333343506), ('感情', 0.9948828816413879), ('生命', 0.9933726191520691), ('现实', 0.9933608770370483), ('梦幻', 0.9932478070259094), ('却是', 0.9929790496826172), ('普通人', 0.9923515319824219), ('男生', 0.9923454523086548), ('每', 0.9919865131378174)]
女人
[('这是', 0.9984006881713867), ('还有', 0.9983314275741577), ('教师', 0.997976541519165), ('维持', 0.9978894591331482), ('高朗', 0.9978026151657104), ('满足', 0.997765302658081), ('命运', 0.997738242149353), ('人来', 0.9976729154586792), ('活人', 0.9974938631057739), ('东西', 0.997492790222168)]
