# 训练词向量
## 任务介绍
* 原始数据预处理，生成标准的train、test数据
* 生成词典vocab.txt
* word2vec模型训练

## 目录介绍
* datasets目录：存放csv格式的train、test原始数据，以及生成的分词文件、词典和词向量文件

In [49]:
import numpy as np
import pandas as pd
import jieba
from jieba import posseg
from collections import defaultdict
import pickle
import os
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors

### 1、数据预处理

In [8]:
# Tokenization
def segment(sentence,cut=jieba.lcut,cut_type="word",pos=False):
    """
    Split a sentence into tokens, optionally together with part-of-speech tags.

    :param sentence: text to split
    :param cut: word-level tokenizer; only used when cut_type == 'word' and pos is False
    :param cut_type: 'word' for word-level segmentation; any other value
        splits the sentence into single characters via list(sentence)
    :param pos: when True, also return the POS flags produced by jieba.posseg
    :return: list of tokens, or a (tokens, pos_flags) tuple when pos is True
    """
    word_level = cut_type == "word"
    if not pos:
        # The original character branch referenced an undefined name
        # (`sentenceS`) and raised NameError; it must split `sentence`.
        return cut(sentence) if word_level else list(sentence)
    if word_level:
        # posseg yields (word, flag) pairs for the whole sentence
        word_pos_seq = posseg.lcut(sentence)
        word_seq = [w for w, _ in word_pos_seq]
        pos_seq = [p for _, p in word_pos_seq]
        return word_seq, pos_seq
    # Character level: tag every character on its own and keep the flag of
    # the first pair posseg returns for it.
    word_seq = list(sentence)
    pos_seq = [posseg.lcut(w)[0].flag for w in word_seq]
    return word_seq, pos_seq

# Tokens that are dropped from every segmented sentence
REMOVE_WORDS = ['|', '[', ']', '语音', '图片', ' ']

# Remove stop words
def remove_words(words_list):
    """Return a new list with the items of words_list that are not in REMOVE_WORDS."""
    kept = []
    for token in words_list:
        if token in REMOVE_WORDS:
            continue
        kept.append(token)
    return kept

# Tokenize and remove stop words
def preprocess_sentence(sentence):
    """Segment sentence at word level, drop the stop words and join the rest with single spaces."""
    tokens = remove_words(segment(sentence, cut_type="word"))
    return " ".join(tokens)

# Data preprocessing
def parse_data(train_path,test_path):
    """
    Segment the train and test CSV files and write the results under datasets/.

    :param train_path: CSV read with the Question, Dialogue and Report columns
    :param test_path: CSV read with the Question and Dialogue columns
    """
    def handle(path,train=True):
        frame = pd.read_csv(path, encoding="utf-8")
        if train:
            # rows without a label (Report) are dropped
            frame.dropna(subset=['Report'], how='any', inplace=True)
        # remaining NaN cells become empty strings
        frame.fillna('', inplace=True)
        # question and dialogue are concatenated, then segmented
        merged = frame.Question.str.cat(frame.Dialogue)
        seg_x = merged.apply(preprocess_sentence)
        if not train:
            print('test_x is ', len(seg_x))
            seg_x.to_csv("datasets/test_seg_x.txt", index=None, header=False)
            return
        # the label column is segmented the same way
        seg_y = frame.Report.apply(preprocess_sentence)
        print('train_x is ', len(seg_x))
        print('train_y is ', len(seg_y))
        seg_x.to_csv("datasets/train_seg_x.txt", index=None, header=False)
        seg_y.to_csv("datasets/train_seg_y.txt", index=None, header=False)

    handle(train_path)
    handle(test_path, train=False)
    

In [10]:
# Raw train/test CSV locations, then run the preprocessing step
train_path, test_path = (
    "datasets/AutoMaster_TrainSet.csv",
    "datasets/AutoMaster_TestSet.csv",
)
parse_data(train_path, test_path)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\dell\AppData\Local\Temp\jieba.cache
Loading model cost 0.706 seconds.
Prefix dict has been built succesfully.


train_x is  82873
train_y is  82873
test_x is  20000


### 2、生成词典

In [35]:
# Save the vocabulary
def save_vocab(vocab,path):
    """Write every (first, second) pair of vocab to path as one tab-separated line."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines("%s\t%s\n" % (word, index) for word, index in vocab)
            
# Read data
def read_data(path_1, path_2, path_3):
    """
    Collect the whitespace-separated tokens and the stripped lines of three text files.

    The files are read in the order given. The total token count is printed.

    :param path_1: first UTF-8 text file
    :param path_2: second UTF-8 text file
    :param path_3: third UTF-8 text file
    :return: (words, sentences) - flat list of tokens, list of stripped lines
    """
    words = []
    sentences = []
    for path in (path_1, path_2, path_3):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # split() with no argument is used for every file: split(' ')
                # leaves the trailing "\n" glued to the last token and yields
                # a "\n" token for blank lines, which skews the word count.
                words.extend(line.split())
                sentences.append(line.strip())
    print(len(words))
    return words, sentences

# Build the vocabulary
def build_vocab(items, sort=True, min_count=0, lower=False):
    """
    Build a vocabulary and its reverse mapping from a list of strings.

    :param items: list [item1, item2, ... ]; with sort=True each item is
        split on single spaces into tokens
    :param sort: True - count tokens and order them by descending frequency
        (ties keep first-seen order); False - keep the items in input order
        without splitting or de-duplicating
    :param min_count: with sort=True, tokens seen fewer than min_count times
        are dropped; 0 disables the filter
    :param lower: lower-case every token (sort=True) or item (sort=False)
    :return: (vocab, reverse_vocab) where vocab is a list of (token, index)
        pairs and reverse_vocab the matching list of (index, token) pairs
    """
    result = []
    if sort:
        # count token frequencies
        counts = defaultdict(int)
        for item in items:
            for token in item.split(" "):
                token = token.strip()
                if not token:
                    continue
                if lower:
                    # lower-case the token itself; the original lower-cased
                    # the whole item and counted that string instead
                    token = token.lower()
                counts[token] += 1
        # order by descending frequency (stable for equal counts)
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        for token, freq in ranked:
            if min_count and min_count > freq:
                continue
            result.append(token)
    else:
        # keep the order of items
        for item in items:
            result.append(item.lower() if lower else item)
    vocab = [(v, i) for i, v in enumerate(result)]
    reverse_vocab = [(i, v) for v, i in vocab]

    return vocab, reverse_vocab

# Save the sentence corpus
def save_sentences(sentences,path):
    """Write each sentence, stripped of surrounding whitespace, on its own line and report the path."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines("%s\n" % sentence.strip() for sentence in sentences)
    print("save sentences path:%s" % path)

def main_():
    """Read the three segmented files, then write the vocabulary and the sentence corpus under datasets/."""
    seg_files = (
        'datasets/train_seg_x.txt',
        'datasets/train_seg_y.txt',
        'datasets/test_seg_x.txt',
    )
    # read_data returns the flat token list first, then the stripped lines
    words, sentences = read_data(*seg_files)
    vocab, reverse_vocab = build_vocab(words)
    save_vocab(vocab, "datasets/vocab.txt")
    save_sentences(sentences, "datasets/sentences.txt")

In [36]:
# Build datasets/vocab.txt and datasets/sentences.txt from the segmented files
main_()

20446700
save sentences path:datasets/sentences.txt


### 3、词向量训练

In [50]:
# Save as pickle
def dump_pkl(vocab,p_path,overwrite=True):
    """
    Pickle vocab to p_path using the highest protocol.

    Nothing is written when p_path is empty, or when the file already
    exists and overwrite is False.
    """
    if not p_path:
        return
    if os.path.exists(p_path) and not overwrite:
        return
    with open(p_path, "wb") as out:
        pickle.dump(vocab, out, protocol=pickle.HIGHEST_PROTOCOL)
    print("save pkl:%s" % p_path)

# Load a pickle
def load_pkl(path):
    """Return the object unpickled from the file at path."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as src:
        return pickle.load(src)

def build_w2v(out_path, sentence_path,w2v_bin_path="w2v.bin"):
    '''
    Train (or load) word2vec vectors and pickle them as a {word: vector} dict.

    :param out_path: destination of the pickled dict (e.g. word2vec.txt;
        the content is a pickle, not text)
    :param sentence_path: sentences.txt, one space-separated sentence per line
    :param w2v_bin_path: binary word2vec file; loaded when it already exists,
        otherwise written after training (skipped when empty)
    '''
    # NOTE(review): size=, iter= and model.vocab are gensim 3.x names; gensim 4
    # renamed them (vector_size, epochs, key_to_index) - confirm the installed version.
    if w2v_bin_path and os.path.exists(w2v_bin_path):
        model=KeyedVectors.load_word2vec_format(w2v_bin_path,binary=True)
    else:
        sentences=LineSentence(sentence_path)
        w2v=Word2Vec(sentences=sentences,size=256,window=5,iter=10,sg=1)
        if w2v_bin_path:
            # only persist the vectors when a path was actually given
            w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
        model=w2v.wv
    # Use `model` here: `w2v` only exists on the training branch, so the
    # original `w2v.wv.similarity` raised NameError when the vectors were loaded.
    sim = model.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    word_dict={word: model[word] for word in model.vocab}
    dump_pkl(word_dict,out_path)
    

In [None]:
# Train (or load) the vectors and pickle them to datasets/word2vec.txt
build_w2v(
    out_path="datasets/word2vec.txt",
    sentence_path="datasets/sentences.txt",
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
