In [1]:
# 下載IMDB資料集
from keras.utils import get_file
dataset = get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)
print("下載位址:", dataset)

Using TensorFlow backend.


Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
下載位址: /root/.keras/datasets/aclImdb.tar.gz


In [15]:
import glob
import textwrap
fn = glob.glob("/root/.keras/datasets/aclImdb/train/neg/*")[1]
with open(fn, "r", encoding="utf-8") as f:
    review = f.read()
print("\n".join(textwrap.wrap(review, width=70)))

This Santa movie starts off strange and I think Santa might be a pedo.
Instead of the usual elf toy makers, this Santa has apparently
kidnapped kids from all across the globe and makes them sing a bit
like characters from "It's a Small World"! I guess there are no child
labor laws on the weird astral plane on which he lives (it's
apparently NOT the North Pole and not on Earth)!! None of these kids
seem very happy and I kept wanting to see commandos break in and
rescue the tykes, though I guess for some of the third world kids,
these working conditions were perhaps an improvement over local
sweatshops. I sure hope that all they do is sing and make toys.<br
/><br />Then, the scene abruptly changes to Hell where lots and lots
of demons dance about like they are in a Busby Berkeley musical. This
fun in put to a stop by Satan who orders one of them, Pitch, to go to
Earth to ruin Christmas!! Personally, I thought this movie already did
that! The Devil and his imps are actually kind of cute--

In [69]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten, Dense, Dropout
from keras.layers import SimpleRNN
model = Sequential()
# 1 padding + 2000個詞 = 2001
model.add(Embedding(2001, 64, input_length=200))
model.add(SimpleRNN(32))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 200, 64)           128064    
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 32)                3104      
_________________________________________________________________
dense_14 (Dense)             (None, 256)               8448      
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 257       
Total params: 139,873
Trainable params: 139,873
Non-trainable params: 0
_________________________________________________________________


In [28]:
import os
import pandas as pd
def get_content(dirname, target):
    fn = glob.glob("{}/*".format(dirname))
    content = []
    targets = [target] * len(fn)
    for fp in fn:
        with open(fp, "r", encoding="utf-8") as f:
            content.append(f.read())
    return (content, targets)

dirname = os.path.dirname(dataset)
neg_path = os.path.join(dirname, "aclImdb", "train", "neg")
pos_path = os.path.join(dirname, "aclImdb", "train", "pos")
neg_content, neg_target = get_content(neg_path, 0)
pos_content, pos_target = get_content(pos_path, 1)
train_df = pd.DataFrame({
    "content":neg_content + pos_content,
    "target":neg_target + pos_target
})
train_df

Unnamed: 0,content,target
0,One of the previous reviewers wrote that there...,0
1,This Santa movie starts off strange and I thin...,0
2,I saw this last week after picking up the DVD ...,0
3,i completely agree with jamrom4.. this was the...,0
4,Imagine that you are asked by your date what m...,0
...,...,...
24995,"It was ""The Night HE Came Home,"" warned the po...",1
24996,"In War, Inc we find the logical extension of t...",1
24997,Forget the jaded comments that come before the...,1
24998,This film to me is a very good film!!<br /><br...,1


In [29]:
neg_path = os.path.join(dirname, "aclImdb", "test", "neg")
pos_path = os.path.join(dirname, "aclImdb", "test", "pos")
neg_content, neg_target = get_content(neg_path, 0)
pos_content, pos_target = get_content(pos_path, 1)
test_df = pd.DataFrame({
    "content":neg_content + pos_content,
    "target":neg_target + pos_target
})
test_df

Unnamed: 0,content,target
0,Worst Movie I Have Ever Seen! 90 Minutes of ex...,0
1,"Help! Once again, Paul Schrader has sabotaged ...",0
2,Being stuck in bed with the flu and feeling to...,0
3,I recently watched this movie because I'm a bi...,0
4,"Don't get me wrong, I love the TV series of Le...",0
...,...,...
24995,I rarely even bother to watch comedic movies o...,1
24996,One of the most entertaining of all silent com...,1
24997,"conventional and superficial ,Claude´s portray...",1
24998,This is one of L&H's shorts most frequently ci...,1


In [0]:
# 文字預處理1:把所有文字化成數字
from keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=2000)
# fit_on_texts只是幫你找出有多少種的字(fit-transform的fit)
tok.fit_on_texts(train_df["content"])
# key:詞 value:等一下要轉換成的數字
# 0這個數字保留, 0是拿來做padding
tok.word_index

In [36]:
# texts_to_sequences:真的依照剛才做出的表幫你轉換完成所有詞(transform)
x_train = tok.texts_to_sequences(train_df["content"])
x_test = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,1684,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,28,4,1,956,1983,1037,12,47,1478.0,5.0,27.0,54.0,652.0,1559.0,15.0,4.0,116.0,62.0,28.0,444.0,9.0,39.0,1794.0,9.0,18.0,47.0,183.0,5.0,27.0,3.0,1730.0,4.0,466.0,1.0,4.0,297.0,5.0,155.0,15.0,69.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,11,17,514,122,676,2,10,101,235.0,27.0,3.0,302.0,4.0,1.0,641.0,1184.0,11.0,44.0,681.0,359.0,36.0,29.0,635.0,1.0,2.0,163.0,95.0,1938.0,3.0,224.0,37.0,102.0,36.0,42.0,3.0,389.0,179.0,10.0,479.0,47.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10,216,11,233,1264,100,53,1,285.0,702.0,10.0,66.0,470.0,5.0,64.0,9.0,15.0,1552.0,1.0,111.0,52.0,1768.0,35.0,58.0,1384.0,13.0,84.0,5.0,132.0,1.0,219.0,10.0,194.0,1.0,482.0,281.0,13.0,52.0,1032.0,11.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10,337,1038,16,11,13,1,683,88.0,524.0,17.0,10.0,25.0,123.0,107.0,592.0,9.0,13.0,391.0,10.0,13.0,21.0,5.0,64.0,9.0,2.0,10.0,293.0,9.0,551.0,41.0,155.0,231.0,80.0,1.0,1347.0,581.0,10.0,337.0,516.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,835,12,22,23,1800,31,126,1302,48.0,17.0,22.0,470.0,5.0,64.0,2.0,22.0,374.0,316.0,3.0,244.0,1768.0,1468.0,41.0,1.0,35.0,8.0,49.0,1801.0,22.0,383.0,316.0,12.0,17.0,9.0,6.0,1.0,808.0,100.0,29.0,2.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,9,13,1,311,26,382,341,1,15.0,305.0,608.0,228.0,186.0,353.0,267.0,8.0,3.0,389.0,295.0,510.0,184.0,1431.0,452.0,485.0,1024.0,5.0,1402.0,177.0,24.0,796.0,2.0,8.0,1.0,1772.0,29.0,38.0,365.0,8.0,244.0,1767.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,8,322,72,166,1,4,1,4,29.0,322.0,72.0,23.0,397.0,8.0,2.0,45.0,22.0,23.0,1078.0,16.0,1.0,4.0,2.0,22.0,23.0,457.0,341.0,5.0,1313.0,1.0,4.0,415.0,296.0,3.0,566.0,129.0,769.0,34.0,656.0,306.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,854,1,793,12,213,156,131,11,6.0,32.0,202.0,18.0,17.0,41.0,81.0,34.0,144.0,709.0,8.0,3.0,304.0,953.0,70.0,914.0,57.0,31.0,1174.0,15.0,256.0,34.0,490.0,5.0,232.0,139.0,2.0,581.0,650.0,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,11,19,5,69,6,3,52,49,19.0,7.0,7.0,10.0,25.0,3.0,1121.0,543.0,2.0,10.0,654.0,5.0,556.0,26.0,13.0,37.0,1513.0,844.0,10.0,437.0,96.0,12.0,47.0,6.0,157.0,785.0,8.0,1.0,617.0,16.0,1513.0,844.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [38]:
# 文字預處理2. 截長補短
from keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train, maxlen=200)
x_test_pad = pad_sequences(x_test, maxlen=200)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,4,1,956,1983,1037,12,47,...,6,1,400,15,2,21,26,301,3,321,30,116,62,15,49,2,5,65,408,244,71,508,95,7,7,11,17,6,15,1,451,4,28,475,608,9,13,844,83,17
1,114,28,436,177,92,301,55,43,147,5,190,456,4,1,1023,550,622,843,23,26,405,95,46,429,4,929,60,65,709,104,709,11,6,3,1278,417,118,1664,335,359,...,2,52,52,52,944,2,3,12,359,6,40,1041,42,3,19,22,141,112,120,5,359,18,163,3,84,19,5,103,16,365,35,22,67,459,30,9,36,377,5,1360
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,216,11,233,1264,100,53,1,285,702,10,66,470,5,64,9,15,1552,1,111,52,1768,35,...,28,4,1,290,5,2,22,67,373,26,124,116,24,434,18,10,59,25,420,3,125,957,36,138,3,49,860,7,7,10,559,1,285,142,5,1,1127,1,169,248
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,45,61,10,66,1,169,1706,588,604,620,36,11,17,10,25,3,544,145,34,1146,9,542,68,572,8,1,228,4,1,17,2,141,29,27,122,1,390,4,1,1220
4,23,1800,31,126,1302,48,17,22,470,5,64,2,22,374,316,3,244,1768,1468,41,1,35,8,49,1801,22,383,316,12,17,9,6,1,808,100,29,2,9,119,950,...,27,193,156,56,6,5,2,9,188,590,512,192,45,22,939,69,524,524,524,1,111,158,94,278,13,391,42,1731,12,1,223,274,13,5,25,3,1525,1,989,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,70,14,3,859,1989,1031,1,1472,197,95,2,11,6,176,328,12,6,1,115,7,7,90,20,3,52,349,1191,1,390,4,186,8,2,1,1490,509,4,394,31,3
24996,1207,253,31,2,31,1,117,1715,320,253,31,72,23,1722,5,498,60,1679,3,1518,12,44,87,123,234,2,1,769,34,162,6,617,1,433,15,87,2,253,31,1817,...,93,5,1,870,1172,7,7,45,22,37,126,209,2,1745,47,6,239,21,192,130,5,398,22,924,1,433,17,20,1,82,505,45,22,37,209,2,11,6,15,22
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,854,1,793,12,213,156,131,11,6,32,202,18,17,41,81,34,144,709,8,3,304,953,70,914,57,31,1174,15,256,34,490,5,232,139,2,581,650
24998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,543,2,10,654,5,556,26,13,37,1513,844,10,437,96,12,47,6,157,785,8,1,617,16,1513,844,2,8,95,10,89,456,48,98,28,132,131,104,105,68,318


In [0]:
model.compile(loss="binary_crossentropy",
              optimizer="adam",
               metrics=["accuracy"])

In [71]:
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
import numpy as np
check_callback = ModelCheckpoint("model.h5", 
                                 save_best_only=True)
stop_callback = EarlyStopping(patience=3,
                              restore_best_weights=True)
model.fit(x_train_pad, 
          np.array(train_df["target"]),
          batch_size=200,
          epochs=100,
          validation_split=0.1,
          verbose=2,
          callbacks=[check_callback, stop_callback])      

Train on 22500 samples, validate on 2500 samples
Epoch 1/100
 - 11s - loss: 0.5814 - acc: 0.6702 - val_loss: 0.5896 - val_acc: 0.7388
Epoch 2/100
 - 9s - loss: 0.3294 - acc: 0.8637 - val_loss: 0.3125 - val_acc: 0.8836
Epoch 3/100
 - 9s - loss: 0.2479 - acc: 0.9035 - val_loss: 0.2686 - val_acc: 0.9028
Epoch 4/100
 - 8s - loss: 0.1717 - acc: 0.9372 - val_loss: 0.4725 - val_acc: 0.8424
Epoch 5/100
 - 9s - loss: 0.0916 - acc: 0.9694 - val_loss: 0.6819 - val_acc: 0.7924
Epoch 6/100
 - 8s - loss: 0.0515 - acc: 0.9826 - val_loss: 0.7543 - val_acc: 0.8088


<keras.callbacks.History at 0x7fb1fddfcd30>

In [72]:
model.evaluate(x_test_pad, test_df["target"])



[0.38409213020324706, 0.85332]