In [55]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
import jieba
import codecs
import math
from sklearn.model_selection import train_test_split

In [85]:
stopwords = set()
def load_stopwords():
    f = codecs.open('stopwords.dat', 'r', 'utf-8')
    for line in f:
        line = line.strip()
        if line == '':
            continue
        stopwords.add(line)
load_stopwords()

In [86]:
def load_data(num_words):
    pos_file = 'pos_book_reviews.txt'
    neg_file = 'neg_book_reviews.txt'
    data = []
    label = []
    idf = {}
    f = codecs.open(pos_file, 'r', 'utf-8')
    lines = f.readlines()
    freq_dict = {}
    total_no = 0
    for line in lines:
        line = line.strip()
        seg_list = jieba.cut(line)
        local_words = set()
        total_no += 1
        local_sentence = []
        for seg in seg_list:
            seg = seg.strip()
            if seg in stopwords:
                continue
            if seg == '':
                continue
            freq_dict[seg] = freq_dict.get(seg, 0) + 1
            local_sentence.append(seg)
            if seg in local_words:
                continue
            idf[seg] = idf.get(seg, 0) + 1
            local_words.add(seg)
        data.append(local_sentence)
        label.append(1)
    f.close()
    f = codecs.open(neg_file, 'r', 'utf-8')
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        seg_list = jieba.cut(line)
        total_no += 1
        local_sentence = []
        for seg in seg_list:
            seg = seg.strip()
            if seg.strip() in stopwords:
                continue
            if seg.strip() == '':
                continue
            freq_dict[seg] = freq_dict.get(seg, 0) + 1
            local_sentence.append(seg)
            if seg in local_words:
                continue
            idf[seg] = idf.get(seg, 0) + 1
            local_words.add(seg)
        data.append(local_sentence)
        label.append(0)
    f.close()
    vocab = sorted(freq_dict.items(), key=lambda x:x[1], reverse=True)[:num_words]
    index2word = [x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    filter_data = []
    for sentence in data:
        local_sentence = []
        for word in sentence:
            if word in word2index:
                local_sentence.append(word2index[word])
        filter_data.append(local_sentence)
    X_train, X_test, y_train, y_test = train_test_split(filter_data, label, test_size=0.2, random_state=1)
    return (X_train, y_train), (X_test, y_test), index2word, word2index

In [141]:
top_words = 10000
(X_train, y_train), (X_test, y_test), index2word, word2index = load_data(num_words=top_words)

{'记不住': 4204, '安哥': 2798, '倒胃口': 5851, '有益': 1352, '释放': 3030, '翻到': 8889, '乱七八糟': 1817, '罗天': 4990, '化作': 5852, '会卖': 8539, '不上': 2799, '扩张': 4991, '一书': 1076, '班级': 7910, '利于': 8540, '神雕': 7335, '重合': 9866, '吸取': 4994, '改变': 146, '页码': 1708, 'In': 7607, '脚步': 4460, '性能': 9448, '眼睛': 428, '孩': 6752, '洗练': 7608, '高分': 5433, '幽默': 284, '明明白白': 7610, '胶装': 8551, '这册': 7758, '建国': 5853, '时间': 43, '这书': 194, '咬': 8542, '不合': 3951, '顽皮': 6753, '阿加莎': 3772, '香港': 1459, '我于': 5434, '心上人': 7407, '结下': 6613, '盲从': 9451, '居': 5854, '红酒': 6754, '放弃': 573, '轰轰烈烈': 2800, '物理学': 3800, '舒畅': 8677, '一身': 2709, '熏陶': 5716, '一年': 817, '母乳喂养': 5435, '不可多得': 1386, '星球': 5856, '加点': 5857, '激动': 1931, '偏向': 4589, '幅': 6167, '越长': 7911, '唐山': 1782, '作证': 7745, '大地': 3585, '能当': 7613, '试试看': 1490, '前几日': 6756, '正视': 3952, '灰尘': 5858, '惆怅': 5436, '客户': 1276, '提': 1783, '不愧': 1732, '工作者': 3258, '珍藏': 1293, '半年': 8999, '骗钱': 6757, '排泄': 9452, '两句': 4993, '牵着': 8544, '十分钟': 7202, '言中': 7657, '师': 710, '感召': 9453,

In [142]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [143]:
print(X_train)

[[   0    0    0 ...,  961  478  122]
 [   0    0    0 ...,  522   72  250]
 [   0    0    0 ...,  355 1133   15]
 ..., 
 [   0    0    0 ...,   13 2273  260]
 [   0    0    0 ...,   97 6711 6795]
 [   0    0    0 ...,   17  199 7344]]


In [144]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 376,405
Trainable params: 376,405
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x359fe630>

In [145]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 93.90%


In [146]:
def load_test():
    test_file = 'test.txt'
    f = codecs.open(test_file, 'r', 'utf-8')
    lines = f.readlines()
    data = []
    origin_data = []
    for line in lines:
        line = line.strip()
        seg_list = jieba.cut(line)
        local_sentence = []
        for seg in seg_list:
            seg = seg.strip()
            if seg in stopwords:
                continue
            if seg == '':
                continue
            if seg not in word2index:
                continue
            local_sentence.append(word2index[seg])
        data.append(local_sentence)
        origin_data.append(line)
    f.close()
    
    return data, origin_data

In [147]:
data, origin_data = load_test()

In [148]:
print(origin_data)

['比中学生作文强不了多少。论语是要用阅历来读懂的。易中天做托，可恶。', '应时浮华之作，商业气浓，医学水准有限。翻翻而已，千万别太认真。尽信书不如无书。', '阅读这本书使我对汉字设计发展文脉，有一个系统的认识，虽然作者没有展开讲解，给读者一个清晰的探索思路，犹如我们在汉子迷宫里找到了灯塔。祝愿作者在这一领域提供给我们学生更多的惊喜和汉字设计的学习路径。', '图文并茂，值得一读。', '好书好书好书好书好书好书！']


In [149]:
print(data)

[[1048, 920, 297, 291, 2378, 1359, 1791, 5482], [4529, 2021, 1759, 1906, 3292, 1233, 3579, 1448, 691, 3094, 14, 6354, 0, 9145], [27, 1624, 228, 215, 479, 8, 1566, 504, 59, 444, 2032, 918, 6290, 3002, 24, 115, 8, 835, 651, 314, 1188, 1624, 228, 44], [1770, 22, 511], [61, 61, 61, 61, 61, 61]]


In [150]:
test = sequence.pad_sequences(data, maxlen=max_review_length)

In [151]:
res = model.predict(test)

In [152]:
print(res)

[[ 0.00710161]
 [ 0.01510675]
 [ 0.99970299]
 [ 0.00577888]
 [ 0.59688824]]
