In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:
import os
import glob
import pandas as pd
dn = os.path.split(dataset)[0]

def read_file(fn):
    with open(fn, "r", encoding="utf-8") as f:
        content = f.read()
    return content

def get_data(dn):
    posfn = glob.glob("{}/pos/*".format(dn))
    negfn = glob.glob("{}/neg/*".format(dn))
    postarget = [1] * len(posfn)
    negtarget = [0] * len(negfn)
    df = pd.DataFrame({
        "fname":posfn + negfn,
        "target":postarget + negtarget
    })
    df["content"] = df["fname"].apply(read_file)
    return df

train_df = get_data(os.path.join(dn, "aclImdb", "train"))
train_df

Unnamed: 0,fname,target,content
0,/root/.keras/datasets/aclImdb/train/pos/4971_7...,1,This is an absurdist dark comedy from Belgium....
1,/root/.keras/datasets/aclImdb/train/pos/9502_8...,1,I liked this movie very much. Although this mo...
2,/root/.keras/datasets/aclImdb/train/pos/10650_...,1,"I rented this movie this past weekend, cranked..."
3,/root/.keras/datasets/aclImdb/train/pos/1683_7...,1,I saw this film as it was the second feature o...
4,/root/.keras/datasets/aclImdb/train/pos/5784_8...,1,I've noticed that a lot of people who post on ...
...,...,...,...
24995,/root/.keras/datasets/aclImdb/train/neg/6299_1...,0,This inept adaptation of arguably one of Marti...
24996,/root/.keras/datasets/aclImdb/train/neg/2270_1...,0,The only thing that kept me from vomiting afte...
24997,/root/.keras/datasets/aclImdb/train/neg/3347_3...,0,I am not sure why I like Dolph Lundgren. I gue...
24998,/root/.keras/datasets/aclImdb/train/neg/9728_3...,0,Tobe Hooper has made great movies so I was cer...


In [3]:
test_df = get_data(os.path.join(dn, "aclImdb", "test"))
test_df

Unnamed: 0,fname,target,content
0,/root/.keras/datasets/aclImdb/test/pos/1010_9.txt,1,Hehehe. This was one of the best funny road mo...
1,/root/.keras/datasets/aclImdb/test/pos/10037_7...,1,"This movie is good for entertainment purposes,..."
2,/root/.keras/datasets/aclImdb/test/pos/10135_8...,1,Corniness Warning. As many fellow IMDb users a...
3,/root/.keras/datasets/aclImdb/test/pos/11555_7...,1,This movie starts off somewhat slowly and gets...
4,/root/.keras/datasets/aclImdb/test/pos/3466_10...,1,I got some free tickets via the Times to see t...
...,...,...,...
24995,/root/.keras/datasets/aclImdb/test/neg/5389_4.txt,0,"It was definitely worth viewing, I don't regre..."
24996,/root/.keras/datasets/aclImdb/test/neg/8021_1.txt,0,I can confidently say that this is the worst f...
24997,/root/.keras/datasets/aclImdb/test/neg/88_3.txt,0,"Wow, what an overrated movie this turned out t..."
24998,/root/.keras/datasets/aclImdb/test/neg/2270_1.txt,0,The comparisons between the 1995 version and t...


In [4]:
# tokenize: 數到底有多少種詞
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(train_df["content"])

Using TensorFlow backend.


In [0]:
# 可以檢視tokenizer成果
# (0不用) 拿來做padding
tokenizer.word_index
tokenizer.index_word

In [6]:
# Step1. 把文字轉成數字
x_train_tok = tokenizer.texts_to_sequences(train_df["content"])
x_test_tok = tokenizer.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_tok)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,1684,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,11,6,32,462,209,36,321,946,8.0,325.0,2.0,425.0,129.0,906.0,6.0,20.0,475.0,808.0,14.0,1.0,1609.0,333.0,4.0,3.0,220.0,8.0,3.0,389.0,510.0,6.0,3.0,34.0,364.0,16.0,24.0,182.0,574.0,1314.0,5.0,190.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10,420,11,17,52,73,258,11,17.0,149.0,4.0,191.0,39.0,57.0,570.0,1429.0,91.0,52.0,1217.0,91.0,28.0,4.0,145.0,231.0,49.0,118.0,22.0,121.0,12.0,27.0,40.0,475.0,8.0,1.0,127.0,58.0,511.0,133.0,6.0,16.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10,1605,11,17,11,498,53,1,478.0,1504.0,2.0,185.0,46.0,84.0,478.0,36.0,315.0,11.0,17.0,6.0,3.0,84.0,17.0,1.0,315.0,118.0,192.0,5.0,58.0,18.0,10.0,1918.0,46.0,605.0,1218.0,10.0,158.0,137.0,5.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10,216,11,19,14,9,13,1,330.0,788.0,20.0,3.0,1.0,371.0,1600.0,14.0,538.0,13.0,1092.0,370.0,10.0,63.0,283.0,1014.0,73.0,36.0,11.0,19.0,18.0,162.0,9.0,59.0,303.0,12.0,1153.0,164.0,1794.0,44.0,306.0,11.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,204,1918,12,3,173,4,81,34,1203.0,20.0,1.0,303.0,5.0,780.0,11.0,120.0,60.0,10.0,162.0,166.0,52.0,1763.0,10.0,101.0,42.0,28.0,4.0,1.0,115.0,695.0,284.0,47.0,6.0,42.0,3.0,899.0,42.0,274.0,85.0,42.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,11,1252,4,28,4,1592,994,5,57.0,16.0,82.0,138.0,14.0,15.0,3.0,922.0,39.0,230.0,31.0,14.0,9.0,5.0,1194.0,723.0,9.0,6.0,3.0,1226.0,1069.0,39.0,3.0,817.0,7.0,7.0,523.0,31.0,1018.0,8.0,117.0,3.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,1,61,151,12,826,69,36,100,316.0,11.0,17.0,13.0,1.0,189.0,12.0,131.0,23.0,40.0,153.0,2.0,21.0,1.0,120.0,12.0,630.0,737.0,20.0,1.0,245.0,120.0,11.0,6.0,79.0,1.0,290.0,279.0,12.0,448.0,4.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,10,241,21,249,135,10,37,10,479.0,316.0,87.0,20.0,265.0,163.0,69.0,231.0,12.0,256.0,34.0,492.0,251.0,67.0,4.0,673.0,12.0,6.0,3.0,49.0,544.0,15.0,29.0,4.0,175.0,34.0,580.0,673.0,46.0,4.0,1.0,82.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,44,90,84,99,35,10,13,810,11.0,423.0,27.0,75.0,10.0,158.0,329.0,98.0,855.0,2.0,800.0,5.0,103.0,11.0,1977.0,19.0,30.0,208.0,11.0,90.0,69.0,459.0,515.0,10.0,217.0,1577.0,515.0,90.0,69.0,217.0,1409.0,15.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
# Step2. 截長補短
from keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_tok, maxlen=200)
x_test_pad = pad_sequences(x_test_tok, maxlen=200)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,443,118,26,8,105,6,16,747,3,6,3,819,14,26,205,117,1,312,1,1848,586,60,217,1898,8,24,338,466,1,19,23,37,1213,42,1217,2,70,287,126,55
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4,191,39,57,570,1429,91,52,1217,91,28,4,145,231,49,118,22,121,12,27,40,475,8,1,127,58,511,133,6,16,1,893,170,10,964,11,17,30,690,454
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,84,17,1,315,118,192,5,58,18,10,1918,46,605,1218,10,158,137,5,1,99,2,64,11,18,91,3,626,519,311,89,231,37,167,43,17,10,59,383,9
3,63,283,1014,73,36,11,19,18,162,9,59,303,12,1153,164,1794,44,306,11,55,14,4,538,8,402,2,1404,14,3,50,71,4,1189,1,111,6,162,176,726,5,...,3,1235,49,289,16,1,380,136,529,14,88,23,640,441,11,6,21,3,84,19,18,42,1326,5,1268,5,1,1211,334,2,211,3,73,1927,71,1,125,570,2,486
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1853,32,967,2,3,4,24,365,23,34,1229,5,2,1730,34,6,603,2,1,696,106,466,1,120,46,102,25,314,2,159,659,25,213,8,18,42,207,74,438,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,22,121,614,118,11,6,167,7,7,48,6,3,1016,281,37,720,397,8,11,908,395,90,5,24,5,2,28,67,27,12,66,26,74,11,1240,147,25,1237,1192,795
24996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,4,75,17,228,2,92,211,57,430,10,525,435,126,55,11,17,8,1034,1586,9,5,132,12,10,1,335,367,81,34,66,5,140,146,11,1145,1651,1,83,55
24997,565,1170,33,79,112,844,1,1666,39,95,39,78,230,331,16,1,191,600,7,7,15,46,279,33,1194,33,25,5,513,1,359,2,1,1750,57,148,51,1,1911,190,...,216,1,17,10,276,66,221,9,14,46,240,4,154,15,46,19,393,118,26,6,1,1750,37,276,11,17,13,65,127,4,1,288,9,13,3,5,103,9,18,10
24998,37,33,328,188,760,50,180,8,11,17,18,33,119,7,7,46,373,11,6,46,429,4,18,368,215,42,445,20,12,323,18,1582,29,11,6,4,186,209,912,557,...,129,85,33,97,513,87,54,33,188,1256,87,54,2,10,479,1582,192,5,373,10,47,6,1242,50,912,180,8,11,17,7,7,70,45,22,178,49,917,103,11,28


In [8]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten, Dense, Dropout

INPUT_DIM = 2001
EMBEDDING_DIM = 64
INPUT_LENGTH = 200

model = Sequential()
# 2000(種) + 1(0:padding)
model.add(Embedding(INPUT_DIM, 
                    EMBEDDING_DIM, 
                    input_length=INPUT_LENGTH))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 64)           128064    
_________________________________________________________________
flatten_1 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1638528   
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 1,766,721
Trainable params: 1,766,721
Non-trainable params: 0
_________________________________________________________

In [9]:
model.compile("adam", 
              loss="binary_crossentropy", 
              metrics=["accuracy"])



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
checkpoint = ModelCheckpoint("imdb.h5", save_best_only=True)
earlystop = EarlyStopping(patience=3, restore_best_weights=True)
model.fit(x_train_pad, 
          np.array(train_df["target"]),
          batch_size=200,
          epochs=100,
          validation_split=0.1,
          callbacks=[checkpoint, earlystop],
          verbose=2)




Train on 22500 samples, validate on 2500 samples
Epoch 1/100





 - 10s - loss: 0.5057 - acc: 0.7310 - val_loss: 0.3720 - val_acc: 0.8384
Epoch 2/100
 - 1s - loss: 0.2218 - acc: 0.9124 - val_loss: 0.2849 - val_acc: 0.8864
Epoch 3/100
 - 1s - loss: 0.0873 - acc: 0.9752 - val_loss: 0.4210 - val_acc: 0.8528
Epoch 4/100
 - 1s - loss: 0.0231 - acc: 0.9965 - val_loss: 0.6006 - val_acc: 0.8284
Epoch 5/100
 - 1s - loss: 0.0074 - acc: 0.9994 - val_loss: 0.8565 - val_acc: 0.7852


<keras.callbacks.History at 0x7f149afe22b0>

In [13]:
model.evaluate(x_test_pad, np.array(test_df["target"]))



[0.3307572071647644, 0.85768]