# Embedding Word2Vec

**[摘要] word2vec 是 NLP 中将文字转换为向量的重要方法。本文采用 gensim 加载训练好的 vec 词向量，并将其作为 Embedding 层的初始化权重（该 vec 向量由维基百科语料训练得到，此处不作过多介绍）。这样便能达到 embedding 操作的目的，同时加载到 Embedding 层的参数还可以选择继续训练。**

[参考源码地址==========](https://github.com/yongzhuo/Keras-TextClassification)


word2vec 是 NLP 向量化中非常普遍的形式。由于计算资源等因素，采用别人训练好的向量既可以达到向量化的目的，又不用再花太多时间训练，可以说是一举两得！如果领域性不是非常强，只需做一些微调即可很好地使用；如果领域性很强，则需要考虑是否自己重新训练。

这里记录下今天（2019-11-27）探索Word2Vec的实现代码，以便于以后有需要的时候能快速实现。

In [2]:
from keras.layers import Add, Embedding
from gensim.models import KeyedVectors
from keras.models import Input, Model

import numpy as np
import codecs
import os
import re

# Pre-trained character-level word2vec vectors built from Wikipedia.
# NOTE(review): the original note gave the size as "300W"; presumably the
# vectors are 300-dimensional -- TODO confirm against the .vec file.
path_embedding_w2v_wiki = (
    "/Users/zhouwencheng/Desktop/Grass/data/model"
    "/ImportModel/Word2Vec/w2v_model_wiki_char.vec"
)
path_embedding_w2v_wiki

'/Users/zhouwencheng/Desktop/Grass/data/model/ImportModel/Word2Vec/w2v_model_wiki_char.vec'

In [5]:
class WordEmbedding(object):
    """Keras embedding layer initialised from pre-trained word2vec vectors.

    On construction the vectors stored at ``path_vec`` are loaded with gensim,
    a token <-> index vocabulary is built (the four special tokens first,
    followed by every entry of the vector file) and a small Keras model made
    of an ``Input`` and an ``Embedding`` layer is created.

    Attributes:
        len_max: fixed sequence length used for padding/truncation.
        embed_size: embedding dimension; replaced by the loaded vectors'
            ``vector_size`` in :meth:`build`.
        vocab_size: vocabulary size; replaced by the real size in
            :meth:`build`.
        trainable: whether the embedding weights may be updated in training.
        path_vec: path of the word2vec-format vector file.
        token2idx / idx2token: vocabulary mappings filled in :meth:`build`.
        input / output / model: Keras input tensor, embedding output tensor
            and the model connecting them.
    """

    def __init__(self,
                 len_max=50,  # maximum text length; 25-50 is recommended
                 embed_size=300,  # embedding size (overwritten in build())
                 vocab_size=30000,  # placeholder, overwritten in build()
                 trainable=True,  # whether the embedding weights are trained
                 path_vec=path_embedding_w2v_wiki,
                ):
        self.len_max = len_max
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.trainable = trainable
        self.path_vec = path_vec

        self.input = None
        self.output = None
        self.model = None
        self.token2idx = {}
        self.idx2token = {}

        # Special tokens; they always occupy the first rows of the matrix.
        self.ot_dict = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[BOS]': 2,
            '[EOS]': 3, }
        self.deal_corpus()
        self.build()

    def deal_corpus(self):
        """Hook called before :meth:`build`; currently does nothing."""
        pass

    def build(self, **kwargs):
        """Load the vectors, build the vocabulary and create the Keras model.

        :param kwargs: forwarded unchanged to
            ``KeyedVectors.load_word2vec_format``.
        """
        print("load word2vec start!")
        self.key_vector = KeyedVectors.load_word2vec_format(self.path_vec, **kwargs)
        print("load word2vec end!")
        self.embed_size = self.key_vector.vector_size
        self.token2idx = self.ot_dict.copy()

        # Rows for the special tokens: [PAD] is all zeros, the remaining
        # three ([UNK], [BOS], [EOS]) are random.
        embedding_matrix = [np.zeros(self.embed_size)]
        for _ in range(3):
            embedding_matrix.append(np.random.uniform(-0.5, 0.5, self.embed_size))

        # gensim >= 4 exposes the vocabulary as ``index_to_key``; older
        # releases used ``index2entity``. Support both.
        vocab = getattr(self.key_vector, 'index_to_key', None)
        if vocab is None:
            vocab = self.key_vector.index2entity

        for word in vocab:
            # A token that is already registered (e.g. the file itself
            # contains "[PAD]") must not get a second row, otherwise the
            # indices and the matrix rows drift apart.
            if word in self.token2idx:
                continue
            self.token2idx[word] = len(self.token2idx)
            embedding_matrix.append(self.key_vector[word])

        self.idx2token = {index: token for token, index in self.token2idx.items()}

        self.vocab_size = len(self.token2idx)
        embedding_matrix = np.array(embedding_matrix)
        self.input = Input(shape=(self.len_max, ), dtype='int32')
        self.output = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embed_size,
            input_length=self.len_max,
            weights=[embedding_matrix],
            trainable=self.trainable
        )(self.input)
        self.model = Model(inputs=self.input, outputs=self.output)

    def sentence2idx(self, text):
        """Convert a sentence into a list of exactly ``len_max`` indices.

        The text is filtered with :meth:`extract_chinese`, upper-cased and
        split into characters. Unknown characters map to ``[UNK]``; the
        result is truncated or right-padded with ``[PAD]`` to ``len_max``.

        :param text: input sentence (converted with ``str``).
        :return: list of int indices of length ``len_max``.
        """
        chars = self.extract_chinese(str(text)).upper()[:self.len_max]
        unk_idx = self.token2idx['[UNK]']
        pad_idx = self.token2idx['[PAD]']
        text_index = [self.token2idx.get(char, unk_idx) for char in chars]
        text_index.extend([pad_idx] * (self.len_max - len(text_index)))
        return text_index

    def idx2sentence(self, idx):
        """Convert a list of indices back into a string.

        Indices missing from the vocabulary are rendered as ``[UNK]``.

        :param idx: list of int indices.
        :return: str, the concatenated tokens.
        """
        assert isinstance(idx, list), "idx must be a list of indices"
        # idx2token is keyed by integer index, so the fallback must be the
        # token text itself rather than a lookup of the '[UNK]' string.
        return "".join(self.idx2token.get(index, '[UNK]') for index in idx)

    def extract_chinese(self, text):
        """Keep only Chinese characters, ASCII letters, digits and ``@._``.

        :param text: str, input sentence.
        :return: str, ``text`` with every other character removed.
        """
        return ''.join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@._])", text))

In [6]:
# Demo: encode two sample sentences and run them through the embedding model.
texts = [
    "今天天气不错",
    "明天天气也不错",
]
eb = WordEmbedding()

# One row of token indices per sentence.
x = np.array([eb.sentence2idx(sentence) for sentence in texts])
print(x.shape)

model = eb.model
p = model.predict(x)
print(p.shape)
print(p)
print(p.shape)

load word2vec start!
load word2vec end!





(2, 50)
(2, 50, 300)
[[[-1.2650331   3.1101494  -2.2554128  ...  1.818751    3.429298
   -2.7108421 ]
  [ 0.21647607 -3.4711666  -1.4919875  ...  4.7559776   0.2984004
    0.40304002]
  [ 0.21647607 -3.4711666  -1.4919875  ...  4.7559776   0.2984004
    0.40304002]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.20198439  0.06784999 -1.4983975  ...  0.9137133   2.7521787
   -0.21572655]
  [ 0.21647607 -3.4711666  -1.4919875  ...  4.7559776   0.2984004
    0.40304002]
  [ 0.21647607 -3.4711666  -1.4919875  ...  4.7559776   0.2984004
    0.40304002]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.        