# 训练和加载Embedding Layer
主要是利用FastText的skip-gram来在大数据集上训练word_vec矩阵，并将训练好的矩阵加载到keras的embedding layer中

In [1]:
import pandas as pd
import numpy as np
import jieba
from keras.layers.embeddings import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


## 读取vec.txt文件，转化为keras的Embedding layer

In [3]:
def get_vec(file_path):
    """Parse a fastText text-format vector file into three lookup tables.

    The input is the ``model.vec`` file written by fastText (renamed to
    ``.txt``): one entry per line, the word followed by its space-separated
    vector components.  A leading ``"<vocab_size> <dim>"`` header line, if
    present, and blank lines are skipped.

    Args:
        file_path: Path of the vector file; it is read as UTF-8.

    Returns:
        A ``(word_to_index, index_to_word, word_to_vec)`` tuple of dicts.
        Indices start at 1 and follow file order, so index 0 is never
        assigned; each vector is a 1-D float ``numpy`` array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_to_index = {}
    index_to_word = {}
    word_to_vec = {}
    count = 1
    # Explicit UTF-8: the vocabulary is Chinese text and must not depend on
    # the platform's default encoding.
    with open(file_path, encoding='utf-8') as file:
        for line_no, line in enumerate(file):
            # Lines may end with a space before the newline; drop both.
            entry = line.rstrip('\n').rstrip(' ')
            if not entry:
                # A blank line is skipped instead of ending the parse early.
                continue
            # Split on a literal space only, exactly as the format is written,
            # rather than on arbitrary whitespace.
            word, *values = entry.split(' ')
            if (line_no == 0 and len(values) == 1
                    and word.isdecimal() and values[0].isdecimal()):
                # "<vocab_size> <dim>" header: not a word, and its single
                # number would otherwise be stored as a 1-element vector.
                continue
            word_to_vec[word] = np.array(values, dtype=float)
            word_to_index[word] = count
            index_to_word[count] = word
            count += 1
    return word_to_index, index_to_word, word_to_vec

In [4]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a non-trainable Keras ``Embedding`` layer from pretrained vectors.

    Args:
        word_to_vec_map: Dict mapping each word to its 1-D vector.  It must
            contain the ``"</s>"`` entry, whose length fixes the embedding
            dimension.
        word_to_index: Dict mapping each word to its row in the embedding
            matrix.  Rows that no word maps to stay all-zero.

    Returns:
        An ``Embedding`` layer (``trainable=False``) whose weights are the
        assembled matrix.

    Raises:
        KeyError: If ``"</s>"`` or any word of ``word_to_index`` is missing
            from ``word_to_vec_map``.
    """
    # Size the matrix from the largest index, not the number of entries: the
    # two differ when indices are not contiguous (e.g. a word listed twice in
    # the vector file), and sizing by count would then overflow the matrix.
    vocab_len = max(word_to_index.values(), default=0) + 1
    emb_dim = word_to_vec_map["</s>"].shape[0]
    emb_matrix = np.zeros((vocab_len, emb_dim))

    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer must be built before set_weights can be called on it.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

## 准备fastText的input文件
该input文件为纯文本文件（下文写出为my_corp.csv）：每行是一条分词后、以空格分隔的评论内容

In [None]:
import re
# Load the custom word list in user_dict.txt into jieba so that the
# segmentation done in later cells uses it.
jieba.load_userdict('user_dict.txt')
def get_content(comment_df):
    """Return the 'content' column as a Series of cleaned strings.

    Every value is converted with ``str`` and then has these substrings
    removed, in this order: '团购点评', newline, '&nbsp', the letter 'n',
    and '#'.  The returned Series has a fresh default index.

    NOTE(review): removing every 'n' also strips it from ordinary words, and
    turns the 'nan' string of a missing value into 'a' -- presumably
    unintended; confirm before changing, since it alters the corpus.
    """
    noise = ('团购点评', '\n', '&nbsp', 'n', '#')

    def scrub(value):
        text = str(value)
        for fragment in noise:
            text = text.replace(fragment, '')
        return text

    return pd.Series([scrub(value) for value in comment_df['content']])

In [None]:
def token_content(comment_series):
    """Segment each comment with jieba and return the space-joined results.

    Items that raise during segmentation are printed and left out, so the
    returned list can be shorter than the input.
    """
    tokenized = []
    for text in comment_series:
        try:
            joined = ' '.join(jieba.lcut(text))
        except Exception:
            # Best effort: show the offending item and move on.
            print(text)
            continue
        tokenized.append(joined)
    return tokenized

In [None]:
import os
# List the entries of the '外卖' folder under the current working directory.
# NOTE(review): [1:] drops the first entry os.listdir returns; that order is
# arbitrary, so this presumably skips a hidden file (e.g. .DS_Store) -- confirm.
file_list = os.listdir(os.getcwd()+'/外卖')[1:]
# Bare expression so the notebook displays the list.
file_list

In [None]:
# Load every raw comment file, clean its 'content' column, segment it, and
# pool the tokenized comments of all files into comment_list.
comment_list = []
for file in file_list:
    # splitext looks at the final extension only, so names containing extra
    # dots (or no dot at all) are classified instead of mis-indexed.
    extension = os.path.splitext(file)[1]
    if file == '评论.csv':
        # This one CSV is read with the default encoding.
        comment = pd.read_csv('外卖/评论.csv')
    elif extension == '.csv':
        # The remaining CSV files are read as GB18030.
        comment = pd.read_csv('外卖/{}'.format(file), encoding='gb18030')
    else:
        # Anything that is not a CSV is handed to the Excel reader.
        comment = pd.read_excel('外卖/{}'.format(file))
    comment_s = get_content(comment)
    # extend() appends in place instead of rebuilding the list every pass.
    comment_list.extend(token_content(comment_s))

In [None]:
# Wrap the pooled, tokenized comments in a Series.
comment_series = pd.Series(comment_list)
# Report how many comments were collected (the message reads
# "Embedding matrix trained on {} comments").
print('Embedding matrix 训练了{}条评论'.format(len(comment_list)))

### 将评论数据写入文件作为fastText的输入

In [None]:
# Write the tokenized comments to my_corp.csv, the fastText training input.
# UTF-8 is set explicitly so the Chinese text does not depend on the
# platform's default encoding.
# NOTE(review): depending on the pandas version, to_csv may also emit a header
# row and quote fields containing commas -- check the file before training.
with open('my_corp.csv', 'w', encoding='utf-8') as file:
    comment_series.to_csv(file, index=False)

## 用生成的vec文件来构建RNN
fastText会生成一个word对应vec的.vec文件，将这个文件改为.txt文件后（如下文的comment_model.txt）读取进来，利用上面定义好的工具函数建立模型

In [5]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import load_model
from sklearn.utils import shuffle

In [6]:
# Parse the fastText vectors in comment_model.txt into the three lookup tables.
word_to_index,index_to_word,word_to_vec = get_vec('comment_model.txt')

In [7]:
# Build the non-trainable Embedding layer from the pretrained vectors; the
# same layer object is added to both models defined below.
my_embedding_layer = pretrained_embedding_layer(word_to_vec, word_to_index)

In [8]:
# Classifier for the 'is_pos' label: pretrained embeddings -> LSTM(128) ->
# a single sigmoid unit, trained with binary cross-entropy.
model = Sequential([
    my_embedding_layer,
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Load the labelled training comments; missing cells are filled with 0.
data = pd.read_csv('training_data/train_1.7W.csv')
data = data.fillna(0)
# Map the -1 label to 0 in 'is_pos'.
data['is_pos'] = data['is_pos'].replace(-1,0)


def _clean_comment(value):
    """Return *value* as a string with the noise substrings removed in order."""
    text = str(value)
    # NOTE(review): 'n' removes every letter n from the text -- presumably
    # unintended; kept as-is so the cleaned data does not change.
    for junk in ('团购点评', '\n', '&nbsp', 'n', '#'):
        text = text.replace(junk, '')
    return text


# Clean the first column (presumably 'Content' -- confirm) in one pass over
# the column instead of assigning cell by cell through .iloc.
_first_column = data.columns[0]
data[_first_column] = data[_first_column].map(_clean_comment)

# Save the cleaned table; UTF-8 is explicit so the Chinese text does not
# depend on the platform's default encoding.
with open('train_1.7W_clean.csv', 'w', encoding='utf-8') as file:
    data.to_csv(file)

In [10]:
def text_to_ints(text,word_to_index):
    """Segment *text* with jieba and map it to an array of vocabulary indices.

    Words that are not keys of ``word_to_index`` are dropped.  The result is
    a 1-D numpy array in the original word order; it is empty when no word
    of the text is in the vocabulary.
    """
    indices = [
        word_to_index[token]
        for token in jieba.lcut(text)
        if token in word_to_index
    ]
    return np.array(indices)

def trunc_seq(seq,seq_len=15):
    """Left-pad or truncate every sequence to exactly ``seq_len`` items.

    Args:
        seq: Iterable with a length, holding integer sequences.
        seq_len: Width of the output rows.

    Returns:
        An int array of shape ``(len(seq), seq_len)``.  Sequences shorter
        than ``seq_len`` are right-aligned with zeros in front; longer ones
        keep only their first ``seq_len`` items; empty ones become all-zero
        rows.
    """
    features = np.zeros((len(seq), seq_len), dtype=int)
    for i, row in enumerate(seq):
        kept = np.asarray(row)[:seq_len]
        # With nothing to copy, a slice starting at -0 would select the whole
        # row and the assignment would fail to broadcast, so leave it zeroed.
        if kept.size:
            features[i, seq_len - kept.size:] = kept
    return features

def get_train_set(comment_df,word_to_index,seq_len=100):
    """Turn a comment DataFrame into padded index sequences and their labels.

    Args:
        comment_df: DataFrame whose first column holds the comment text and
            whose remaining columns hold the labels.
        word_to_index: Dict mapping words to vocabulary indices.
        seq_len: Width each index sequence is padded or truncated to.

    Returns:
        A ``(train_X, train_y, features_len)`` tuple.  ``train_X`` is the
        int array produced by ``trunc_seq``; ``train_y`` holds the label rows
        of the same comments; ``features_len`` lists the number of indices of
        every comment that was converted, including those with none.
        Comments that fail to convert, or that contain no vocabulary word,
        are left out of ``train_X`` and ``train_y``.
    """
    labels = np.array(comment_df.iloc[:,1:])
    features = []
    kept_rows = []      # DataFrame row position of each entry in `features`
    features_len = []
    for row_idx, each in enumerate(comment_df.iloc[:,0]):
        try:
            text_ints = text_to_ints(each,word_to_index)
        except Exception:
            # Best effort: show the comment that could not be converted.
            print(each)
            continue
        features.append(text_ints)
        kept_rows.append(row_idx)
        features_len.append(len(text_ints))

    # default=0 keeps this from raising when no comment could be converted.
    print('max comment length:',max(features_len, default=0))
    non_empty = [k for k, ints in enumerate(features) if ints.shape[0]>0]
    train_X = trunc_seq([features[k] for k in non_empty],seq_len)
    # Look labels up by the original row position: indexing by position in
    # `features` would pair comments with the wrong label after any skip.
    train_y = np.array([labels[kept_rows[k],:] for k in non_empty])
    print('train set prepared!')
    return train_X,train_y,features_len

In [13]:
# Keep the text column and the 'is_pos' label; get_train_set reads the first
# column as text and the remaining columns as labels.
comment_df = data[['Content','is_pos']]

In [16]:
# Build the padded index sequences (X) and labels (y), then shuffle both together.
# NOTE(review): shuffle is called without random_state, so the split below
# differs from run to run.
X,y,features_len = get_train_set(comment_df,word_to_index,seq_len=100)
X,y = shuffle(X,y)

# The first 16045 rows form the training split.
# NOTE(review): 16045 is hard-coded; it has to be revisited whenever the
# number of usable comments changes.
train_X = X[:16045,:]
train_y = y[:16045]

# Everything after row 16045 is held out.
test_X = X[16045:,]
test_y = y[16045:,]

print('train_X shape:{}, train_y shape:{}, test_X shape {},test_y shape {}'.format(train_X.shape,train_y.shape,test_X.shape,test_y.shape))

max comment length: 791
train set prepared!
train_X shape:(16045, 100), train_y shape:(16045, 1), test_X shape (1000, 100),test_y shape (1000, 1)


In [17]:
# Train the is_pos model for 20 epochs in batches of 100.
# NOTE(review): the held-out split is passed as validation data and is also
# the one evaluated below, so the reported accuracy is not on unseen data.
model.fit(train_X, train_y,
          batch_size=100,
          epochs=20,
          validation_data=(test_X, test_y))
# evaluate returns the loss followed by the compiled metric (accuracy).
score, acc = model.evaluate(test_X, test_y,batch_size=100)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 16045 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 0.2034496024250984
Test accuracy: 0.91700000166893


In [18]:
# Save the trained is_pos model to an .h5 file.
model.save('models/is_pos1.7.h5')

In [38]:
# Switch the label column to 'is_env'; comment_df is rebound, replacing the
# is_pos selection made earlier.
comment_df = data[['Content','is_env']]

In [39]:
# Rebuild X and y for the 'is_env' label and shuffle both together.
# NOTE(review): shuffle is called without random_state, so this split differs
# from run to run and from the one used for the is_pos model.
X,y,features_len = get_train_set(comment_df,word_to_index,seq_len=100)
X,y = shuffle(X,y)

# The first 16045 rows form the training split (hard-coded, as for is_pos).
train_X = X[:16045,:]
train_y = y[:16045]

# Everything after row 16045 is held out.
test_X = X[16045:,]
test_y = y[16045:,]

print('train_X shape:{}, train_y shape:{}, test_X shape {},test_y shape {}'.format(train_X.shape,train_y.shape,test_X.shape,test_y.shape))

max comment length: 791
train set prepared!
train_X shape:(16045, 100), train_y shape:(16045, 1), test_X shape (1000, 100),test_y shape (1000, 1)


In [40]:
# Model for the 'is_env' label: the shared pretrained embeddings -> LSTM(128)
# -> a single tanh unit, trained with the log-cosh loss.
# NOTE(review): the variable is called serv_model although it is fitted on
# is_env labels and saved as is_env1.7.h5 -- confirm the name.
# NOTE(review): 'accuracy' is reported for a tanh/log-cosh output; check that
# the metric means what is expected for these labels.
serv_model = Sequential([
    my_embedding_layer,
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='tanh'),
])
serv_model.compile(
    optimizer='adam',
    loss='logcosh',
    metrics=['accuracy'],
)

In [42]:
# Train the is_env model for 5 epochs in batches of 100.
# NOTE(review): the held-out split is passed as validation data and is also
# the one evaluated below, so the reported accuracy is not on unseen data.
serv_model.fit(train_X, train_y,
          batch_size=100,
          epochs=5,
          validation_data=(test_X, test_y))
# evaluate returns the loss followed by the compiled metric (accuracy).
score, acc = serv_model.evaluate(test_X, test_y,batch_size=100)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 16045 samples, validate on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.038074220158159734
Test accuracy: 0.9120000004768372


In [43]:
# Save the trained is_env model to an .h5 file.
serv_model.save('models/is_env1.7.h5')