In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
import pandas as pd
from config import *


from gensim.models import Word2Vec
import multiprocessing
import jieba
from collections import Counter
import json
import numpy as np

## 加载源数据

In [2]:
reviews = []
labels = []
with open(dataSource,'r',encoding='utf-8') as file:
    for line in file:
        temp = line.replace('\n', '').split(',,')
        reviews.append(temp[0])
        labels.append(temp[1])
print('data:',len(reviews),len(labels))
reviews[:2]
labels[:2]

data: 101058 101058


['才用就发现相机打开迟钝，半天反应不过来，有时候还会卡出去，他们又不给解决方案。', '还没穿二天就起毛了']

['0', '0']

In [3]:
reviews = [jieba.lcut(review.replace('\n', '')) for review in reviews]
print('data:',len(reviews),len(labels))
reviews[:2]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zhoubin\AppData\Local\Temp\jieba.cache
Loading model cost 0.903 seconds.
Prefix dict has been built succesfully.


data: 101058 101058


[['才',
  '用',
  '就',
  '发现',
  '相机',
  '打开',
  '迟钝',
  '，',
  '半天',
  '反应',
  '不',
  '过来',
  '，',
  '有时候',
  '还会卡',
  '出去',
  '，',
  '他们',
  '又',
  '不',
  '给',
  '解决方案',
  '。'],
 ['还', '没', '穿', '二天', '就', '起毛', '了']]

## 预训练词向量

In [4]:
# w2v_model = Word2Vec(reviews,size=embeddingSize,
#                      min_count=miniFreq,
#                      window=10,
#                      workers=multiprocessing.cpu_count(),sg=1,
#                      iter=20)

# w2v_model = Word2Vec(reviews,size=embeddingSize,
#                      min_count=miniFreq,
#                      window=10,
#                      workers=10,sg=1,
#                      iter=20)
# w2v_model.save(w2v_model_path)

# model = Word2Vec.load(w2v_model_path)
# model.wv.vocab.keys()

## 加载停用词

In [5]:
def readStopWord(stopWordPath):
    stopWordDict = {}
    with open(stopWordPath, "r", encoding='utf-8') as f:
        stopWords = f.read()
        stopWordList = stopWords.splitlines()
        # 将停用词用列表的形式生成，之后查找停用词时会比较快
        stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))
    return stopWordDict

stopWordDict = readStopWord(stopWordPath)
# stopWordDict

## 构建词典

In [6]:
def getWordEmbedding(words):
    """
    按照我们的数据集中的单词取出预训练好的word2vec中的词向量
    """

    # 中文
    model = Word2Vec.load(w2v_model_path)

    vocab = []
    wordEmbedding = []

    # 添加 "pad" 和 "UNK", 
    vocab.append("pad")
    wordEmbedding.append(np.zeros(embeddingSize))

    vocab.append("UNK")
    wordEmbedding.append(np.random.randn(embeddingSize))

    for word in words:
        try:
            vector = model[word]
            vocab.append(word)
            wordEmbedding.append(vector)

        except:
            print(word + "不存在于词向量中")

    return vocab, np.array(wordEmbedding)


def genVocabulary():
    """
    生成词向量和词汇-索引映射字典，可以用全数据集
    """

    allWords = [word for review in reviews for word in review]

    # 去掉停用词
    subWords = [word for word in allWords if word not in stopWordDict]

    # 统计词频，排序
    wordCount = Counter(subWords)
    sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)

    # 去除低频词
    words = [item[0] for item in sortWordCount if item[1] >= miniFreq]

    # 获取词列表和顺序对应的预训练权重矩阵
    vocab, wordEmbedding = getWordEmbedding(words)
    

    wordToIndex = dict(zip(vocab, list(range(len(vocab)))))
    indexToWord = dict(zip(list(range(len(vocab))), vocab))
    n_symbols = len(wordToIndex) + 1

    # 将词汇-索引映射表保存为json数据，之后做inference时直接加载来处理数据
    with open("./temp/wordJson/wordToIndex.json", "w", encoding="utf-8") as f:
        json.dump(wordToIndex, f)

    with open("./temp/wordJson/indexToWord.json", "w", encoding="utf-8") as f:
        json.dump(indexToWord, f)
    
    return wordToIndex, indexToWord, n_symbols

wordToIndex, indexToWord, n_symbols = genVocabulary()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## 构建数据集

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


def reviewProcess(review, sequenceLength, wordToIndex):
    """
    将数据集中的每条评论里面的词，根据词表，映射为index表示
    每条评论 用index组成的定长数组来表示

    """

    reviewVec = np.zeros((sequenceLength))
    
    sequenceLen = sequenceLength

    # 判断当前的序列是否小于定义的固定序列长度
    if len(review) < sequenceLength:
        sequenceLen = len(review)

    for i in range(sequenceLen):
        if review[i] in wordToIndex:
            reviewVec[i] = wordToIndex[review[i]]
        else:
            reviewVec[i] = wordToIndex["UNK"]

    return reviewVec



def genTrainEvalData(x, y, rate):
    """
    生成训练集和验证集
    """

    reviews = []
    labels = []

    # 遍历所有的文本，将文本中的词转换成index表示
    for i in range(len(x)):
        reviewVec = reviewProcess(x[i], sequenceLength, wordToIndex)
        reviews.append(reviewVec)
        labels.append([y[i]])

    trainIndex = int(len(x) * rate)

#     trainReviews = pad_sequences(reviews[:trainIndex], maxlen=sequenceLength)
    trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
    trainLabels = np.array(labels[:trainIndex], dtype="float32")
    
    trainLabels = to_categorical(trainLabels,num_classes=2)

#     evalReviews = pad_sequences(reviews[trainIndex:], maxlen=sequenceLength)
    evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
    evalLabels = np.array(labels[trainIndex:], dtype="float32")
    
    evalLabels = to_categorical(evalLabels,num_classes=2)

    return trainReviews, trainLabels, evalReviews, evalLabels


trainReviews, trainLabels, evalReviews, evalLabels = genTrainEvalData(reviews, labels, rate=rate)
print("train data shape: {}".format(trainReviews.shape))
print("train label shape: {}".format(trainLabels.shape))
print("eval data shape: {}".format(evalReviews.shape))
trainReviews[:2]
trainLabels[:2]

train data shape: (80846, 200)
train label shape: (80846, 2)
eval data shape: (20212, 200)


array([[   1,    1,    1,   71,  340,   79, 1923,    1,  274,    1,    1,
           1,    1,  260, 6621,    1,    1,    1,    1,    1,    1, 2039,
           1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

array([[1., 0.],
       [1., 0.]], dtype=float32)

## 模型构建

In [8]:
from tensorflow.python.keras import Input, Model
from tensorflow.python.keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout


class TextCNN(object):
    def __init__(self, maxlen, max_features, embedding_dims, class_num=2, last_activation='sigmoid'):
        self.maxlen = maxlen  # 最大序列的长度（句子的长度）
        self.max_features = max_features  # 词表的大小，最多容纳多少个词
        self.embedding_dims = embedding_dims  # 词向量的维度
        self.class_num = class_num  # 类别数
        self.last_activation = last_activation  # 最后的激活函数

    def get_model(self):
        input = Input((self.maxlen,))  # 表示输入是maxlen维的向量
        # input_dim: 词汇表大小  output_dim：词向量的维度  input_length: 输入序列的长度
        embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(input)
        convs = []
        for kernel_size in filterSizes:
            c = Conv1D(numFilters, kernel_size, activation='relu')(embedding)
            c = GlobalMaxPooling1D()(c)
            convs.append(c)
        x = Concatenate()(convs)
        x = Dropout(dropoutKeepProb)(x)

        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=input, outputs=output)
        return model

## 模型训练

In [9]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
import yaml

x_train = trainReviews
y_train = trainLabels
x_eval = evalReviews
y_eval = evalLabels

print('构建模型...')
model = TextCNN(sequenceLength, n_symbols, embeddingSize).get_model()
model.summary()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

# 回调函数
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=10, mode='auto')
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('./model/cnn_model/model_{epoch:02d}-{val_accuracy:.2f}.hdf5',
                                   save_best_only=True, save_weights_only=True)


history = model.fit(x_train, y_train, batch_size=batchSize, epochs=epochs, validation_split=0.3,
                    shuffle=True, callbacks=[reduce_lr,early_stopping,model_checkpoint])
#验证
scores = model.evaluate(x_eval, y_eval)

#保存模型
yaml_string = model.to_yaml()
with open('./model/textCNN.yml', 'w') as outfile:
    outfile.write( yaml.dump(yaml_string, default_flow_style=True))
model.save_weights('./model/textCNN.h5')

print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))

构建模型...
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 150)     6684450     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 198, 30)      13530       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 197, 30)      18030       embedding[0][0]                  
______________________________________________________________________________________

5832

test_loss: 0.200609, accuracy: 0.926850
