# Embedding Random

**摘要:** 随机Embedding其实就是对Embedding层的参数进行随机初始化的过程。本次主要对随机Embedding的整个过程进行记录，以便以后有需要的时候可以很快地复现。

[参考源码地址--------](https://github.com/yongzhuo/Keras-TextClassification)

采用随机生成参数的形式进行Embedding操作，可在训练中更新参数以达到训练的目的。word2vec对Embedding的初始化同样是采用随机生成的方式，然后在训练中不断更新参数，以达到训练的目的。

随机Embedding其实就是对Embedding层的参数进行随机初始化的过程。

**注意:本次实例是字符级别的Embedding**

In [2]:
from keras.layers import Embedding 
from keras.models import Input, Model

import numpy as np  
import re

# Character dictionary (vocabulary) file.
path_embedding_term_char = (
    "/Users/zhouwencheng/Desktop/Grass/data/model"
    "/ImportModel/Word2Vec/term_char.txt"
)

Using TensorFlow backend.


In [7]:
class RandomEmbedding(object):
    """Character-level embedding whose weights are randomly initialised.

    On construction the character vocabulary is read from ``path_char``
    (one token per line) and a Keras model consisting of an ``Input``
    followed by an ``Embedding`` layer is built.  ``sentence2idx`` turns a
    piece of text into a fixed-length list of token ids suitable for that
    model, and ``idx2sentence`` maps ids back to text.
    """

    # Keeps CJK ideographs in U+4E00..U+9FA5, ASCII letters and digits, and
    # the characters '@', '.', '_'.  Compiled once instead of on every call.
    _KEEP_PATTERN = re.compile(u"([\u4e00-\u9fa5A-Za-z0-9@._])")

    def __init__(self,
                 len_max=50,  # maximum text length, 25-50 recommended
                 embed_size=300,  # embedding dimension
                 vocab_size=30000,  # placeholder; overwritten in build()
                 trainable=True,  # whether the embedding weights are trained
                 path_char=None,
                 ):
        """Load the vocabulary and build the embedding model.

        :param len_max: int, number of ids every encoded sentence has
        :param embed_size: int, output dimension of the Embedding layer
        :param vocab_size: int, replaced by the real vocabulary size in build()
        :param trainable: bool, passed to the Embedding layer
        :param path_char: str or None, vocabulary file; None selects the
            module-level ``path_embedding_term_char``
        """
        self.len_max = len_max
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.trainable = trainable
        # The default is resolved here rather than in the signature: the
        # signature used to reference an undefined name, which raised
        # NameError as soon as the class statement was executed.
        self.path_char = (path_embedding_term_char if path_char is None
                          else path_char)

        self.input = None
        self.output = None
        self.model = None
        self.token2idx = {}
        self.idx2token = {}

        # Special symbols; they occupy the first ids of the vocabulary.
        self.ot_dict = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[BOS]': 2,
            '[EOS]': 3, }
        self.deal_corpus()
        self.build()

    def deal_corpus(self):
        """Read the vocabulary file and fill ``token2idx`` / ``idx2token``.

        Each line, stripped of surrounding whitespace, is one token.  Tokens
        are numbered in order of first appearance, continuing after the ids
        of the special symbols; repeated tokens keep their first id.
        """
        token2idx = self.ot_dict.copy()
        next_id = 3
        with open(file=self.path_char, mode='r', encoding='utf-8') as fd:
            for line in fd:
                token = line.strip()
                if token not in token2idx:
                    next_id += 1
                    token2idx[token] = next_id
        self.token2idx = token2idx
        self.idx2token = {index: token for token, index in token2idx.items()}

    def build(self, **kwargs):
        """Create the Input -> Embedding model and store it on ``self.model``.

        ``vocab_size`` is reset to the number of entries in ``token2idx``
        before the layer is created.
        """
        self.vocab_size = len(self.token2idx)
        self.input = Input(shape=(self.len_max, ), dtype='int32')
        self.output = Embedding(input_dim=self.vocab_size,
                                output_dim=self.embed_size,
                                input_length=self.len_max,
                                trainable=self.trainable,
                                )(self.input)
        self.model = Model(inputs=self.input, outputs=self.output)

    def sentence2idx(self, text):
        """Encode ``text`` as a list of exactly ``len_max`` token ids.

        The text is filtered with ``extract_chinese`` and upper-cased, then
        split into characters.  Characters missing from the vocabulary map
        to '[UNK]'; short inputs are right-padded with '[PAD]' and long
        inputs are truncated.

        :param text: any object; converted with ``str`` first
        :return: list of int, length ``len_max``
        """
        chars = self.extract_chinese(str(text)).upper()[:self.len_max]
        unk_id = self.token2idx['[UNK]']
        text_index = [self.token2idx.get(char, unk_id) for char in chars]
        pad_count = self.len_max - len(text_index)
        text_index.extend([self.token2idx['[PAD]']] * pad_count)
        return text_index

    def idx2sentence(self, idx):
        """Decode a list of token ids into a string.

        Ids that are not in the vocabulary are rendered as '[UNK]'.  (The
        previous fallback looked up the string '[UNK]' in the id-keyed
        ``idx2token`` dict and therefore raised KeyError.)

        :param idx: list of int
        :return: str, the concatenated tokens
        """
        assert isinstance(idx, list)
        return "".join(self.idx2token.get(index, '[UNK]') for index in idx)

    def extract_chinese(self, text):
        """Keep only Chinese characters, ASCII letters, digits and '@', '.', '_'.

        :param text: str, input sentence
        :return: str, the kept characters in their original order
        """
        return ''.join(self._KEEP_PATTERN.findall(text))

In [8]:
# Smoke test: encode two sentences and run them through the embedding model.

texts = [
    "今天天气不错",
    "明天天气也不错",
]
eb = RandomEmbedding()
# One row of token ids per sentence.
x = np.array([eb.sentence2idx(t) for t in texts])
print(x.shape)
print(x)

model = eb.model
p = model.predict(x)
print(p.shape)
print(p)
print(p.shape)

(2, 50)
[[ 527  140  140  455   62 1429    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [ 248  140  140  455  170   62 1429    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
(2, 50, 300)
[[[ 0.04584242  0.03654362 -0.01917813 ... -0.01199863 -0.01021879
    0.03497038]
  [ 0.02573839 -0.00362322  0.02442862 ... -0.01341845  0.0128829
    0.03542539]
  [ 0.02573839 -0.00362322  0.02442862 ... -0.01341845  0.0128829
    0.03542539]
  ...
  [ 0.00539257  0.02650164 -0.003124   ...  0.03868828 -0.03746802
   -0.0148697 ]
  [ 0.00539257  0.02650164 -0.003124   ...  0.03868828 -0.03746802
   -0.0148697 ]
  [ 0.00539257  0.02650164 -0.003124   ..