In [1]:
import os
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [3]:
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    print('read', filetype, 'files:', len(file_list))
    all_labels = ([1] * 12500 + [0] * 12500)
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf-8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
    return all_labels, all_texts

In [4]:
y_train, train_text = read_files("train")

read train files: 25000


In [5]:
y_test, test_text = read_files("test")

read test files: 25000


In [6]:
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)

In [7]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [8]:
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)

In [9]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

In [10]:
model = Sequential()

In [11]:
model.add(Embedding(output_dim=32, input_dim=2000, input_length=100))
model.add(Dropout(0.2))

In [12]:
model.add(Flatten())

In [13]:
model.add(Dense(units=256, activation='relu'))

In [14]:
model.add(Dropout(0.35))

In [15]:
model.add(Dense(units=1, activation='sigmoid'))

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
tarin_history = model.fit(x_train, y_train, batch_size=100, epochs=10, verbose=2, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 2s - loss: 0.4704 - acc: 0.7631 - val_loss: 0.3899 - val_acc: 0.8278
Epoch 2/10
 - 1s - loss: 0.2615 - acc: 0.8922 - val_loss: 0.5926 - val_acc: 0.7412
Epoch 3/10
 - 1s - loss: 0.1572 - acc: 0.9420 - val_loss: 0.7424 - val_acc: 0.7334
Epoch 4/10
 - 1s - loss: 0.0785 - acc: 0.9741 - val_loss: 0.7487 - val_acc: 0.7764
Epoch 5/10
 - 1s - loss: 0.0457 - acc: 0.9831 - val_loss: 0.8554 - val_acc: 0.7864
Epoch 6/10
 - 1s - loss: 0.0346 - acc: 0.9883 - val_loss: 1.0565 - val_acc: 0.7684
Epoch 7/10
 - 1s - loss: 0.0310 - acc: 0.9896 - val_loss: 1.1964 - val_acc: 0.7548
Epoch 8/10
 - 1s - loss: 0.0246 - acc: 0.9914 - val_loss: 1.7215 - val_acc: 0.6958
Epoch 9/10
 - 1s - loss: 0.0229 - acc: 0.9916 - val_loss: 1.3167 - val_acc: 0.7582
Epoch 10/10
 - 1s - loss: 0.0267 - acc: 0.9900 - val_loss: 1.4399 - val_acc: 0.7394


In [18]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.81368

In [19]:
predict = model.predict_classes(x_test)

In [20]:
predict[:10]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0]], dtype=int32)

In [21]:
predict_classes = predict.reshape(-1)
predict_classes[:10]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0], dtype=int32)

In [22]:
SetimentDict = {1:'正面的', 0:'负面的'}
def display_test_Setiment(i):
    print(test_text[i])
    print('label真实值:', SetimentDict[y_test[i]], '预测结果:', SetimentDict[predict_classes[i]])

In [23]:
display_test_Setiment(2)

I have always loved the ironic symbolism and brilliant cinematography of Coppola's masterpiece. I was lucky enough to meet Martin Sheen outside the Santa Monica Civic Auditorium one night in 1981, as he waited for Charlie and Emilio to leave a concert. He was very humble about the praise I shared with him for this work of art, especially his portrayal of the young Captain. This is, without a doubt, a must see, a complete 10 and an important part of American Film History. "Charlie Don't Surf". Robert Duvall's famous line (the other one) does not need repeating as it has become an oft repeated anthem and his Pattonesque character will long be remembered as a classic American war hawk in the John Wayne tradition. It is a surprise to see how young Laurence Fishburne looks.
label真实值: 正面的 预测结果: 正面的


In [24]:
display_test_Setiment(12503)

I am not quite sure I agree with the director of this version of The Scarlet Pimpernel. I imagined Sir Percy Blakeney a very calm, seemingly lazy aristocrat. This particular Sir Percy Blakeney appears to be teeming with overwhelming energy and volatility. I did not appreciate the Houdini, James Bond, Mission Impossible style escapes that Sir Percy engineered either. In the previous versions, wit was the tool for escape, not technology. Neither were the characters of Marguerite and Chauvelin adequately portrayed. There seemed to be little energy or chemistry in the interaction between the characters.I do not wish to assign any blame, for perhaps the reason for my dislike of this movie might simply be a matter of difference in interpretation. Had the director's interpretation coincided with mine, perhaps I might not have been irritated by what seemed to me bad character portrayals.I much preferred the version from 1982. Anthony Andrews was quite efficient as the imperturbable, calm fop. 