In [1]:
import os
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [3]:
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    print('read', filetype, 'files:', len(file_list))
    all_labels = ([1] * 12500 + [0] * 12500)
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf-8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
    return all_labels, all_texts

In [4]:
y_train, train_text = read_files("train")

read train files: 25000


In [5]:
y_test, test_text = read_files("test")

read test files: 25000


In [6]:
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)

In [7]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [8]:
x_train = sequence.pad_sequences(x_train_seq, maxlen=200)
x_test = sequence.pad_sequences(x_test_seq, maxlen=200)

In [9]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

In [10]:
model = Sequential()

In [11]:
model.add(Embedding(output_dim=32, input_dim=2000, input_length=200))

In [12]:
model.add(Dropout(0.2))

In [13]:
model.add(SimpleRNN(units=16))

In [14]:
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.2))

In [15]:
model.add(Dense(units=1, activation='sigmoid'))

In [16]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 32)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
tarin_history = model.fit(x_train, y_train, batch_size=100, epochs=10, verbose=2, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 21s - loss: 0.5128 - acc: 0.7436 - val_loss: 0.7199 - val_acc: 0.6830
Epoch 2/10
 - 19s - loss: 0.3472 - acc: 0.8564 - val_loss: 0.5553 - val_acc: 0.7314
Epoch 3/10
 - 19s - loss: 0.3082 - acc: 0.8775 - val_loss: 0.4913 - val_acc: 0.7834
Epoch 4/10
 - 19s - loss: 0.2760 - acc: 0.8911 - val_loss: 0.7454 - val_acc: 0.7212
Epoch 5/10
 - 19s - loss: 0.2405 - acc: 0.9036 - val_loss: 0.5819 - val_acc: 0.7864
Epoch 6/10
 - 19s - loss: 0.2002 - acc: 0.9234 - val_loss: 0.6299 - val_acc: 0.7732
Epoch 7/10
 - 20s - loss: 0.1671 - acc: 0.9367 - val_loss: 0.7329 - val_acc: 0.7808
Epoch 8/10
 - 19s - loss: 0.1359 - acc: 0.9505 - val_loss: 0.9566 - val_acc: 0.7348
Epoch 9/10
 - 19s - loss: 0.1177 - acc: 0.9557 - val_loss: 0.9892 - val_acc: 0.7636
Epoch 10/10
 - 20s - loss: 0.1039 - acc: 0.9620 - val_loss: 1.3137 - val_acc: 0.7212


In [19]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.81136

In [None]:
predict = model.predict_classes(x_test)

In [None]:
predict[:10]

In [None]:
predict_classes = predict.reshape(-1)
predict_classes[:10]

In [None]:
SetimentDict = {1:'正面的', 0:'负面的'}
def display_test_Setiment(i):
    print(test_text[i])
    print('label真实值:', SetimentDict[y_test[i]], '预测结果:', SetimentDict[predict_classes[i]])

In [None]:
display_test_Setiment(2)

In [None]:
display_test_Setiment(12503)