In [1]:
import tarfile
import os
# Unpack the IMDB archive once; nothing is done when the "imdb" folder already exists.
if not os.path.exists("imdb"):
    # The context manager closes the archive even if extraction raises.
    with tarfile.open("aclImdb_v1.tar.gz") as tar:
        tar.extractall("imdb")

In [2]:
import glob
import pandas as pd

base = "imdb/aclImdb/train/"

# Read every review under pos/ (label 1) and then neg/ (label 0).
# glob returns matches in arbitrary, filesystem-dependent order, so the paths
# are sorted to give the same row order on every machine and every run.
contents = []
labels = []
for folder, label in (("pos", 1), ("neg", 0)):
    for path in sorted(glob.glob(base + folder + "/*.txt")):
        with open(path, "r", encoding="utf-8") as f:
            contents.append(f.read())
        labels.append(label)

# NOTE: rows stay grouped by label (all positives first, then all negatives);
# shuffle before any split that takes a contiguous slice of rows.
train = pd.DataFrame({
    "content": contents,
    "sentiment": labels,
})
train

Unnamed: 0,content,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0


In [3]:

base = "imdb/aclImdb/test/"

# Read every review under pos/ (label 1) and then neg/ (label 0).
# glob returns matches in arbitrary, filesystem-dependent order, so the paths
# are sorted to give the same row order on every machine and every run.
contents = []
labels = []
for folder, label in (("pos", 1), ("neg", 0)):
    for path in sorted(glob.glob(base + folder + "/*.txt")):
        with open(path, "r", encoding="utf-8") as f:
            contents.append(f.read())
        labels.append(label)

# Rows are grouped by label: all positive reviews first, then all negative ones.
test = pd.DataFrame({
    "content": contents,
    "sentiment": labels,
})
test

Unnamed: 0,content,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
24995,I occasionally let my kids watch this garbage ...,0
24996,When all we have anymore is pretty much realit...,0
24997,The basic genre is a thriller intercut with an...,0
24998,Four things intrigued me as to this film - fir...,0


In [37]:
from keras.preprocessing.text import Tokenizer

# Step 1: build the vocabulary from every word that appears in the training reviews.
# num_words=2000 limits the later text-to-number conversion to the most frequent words.
train_corpus = train["content"]
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(train_corpus)

In [5]:
# To inspect the number each word is mapped to:
# tok.word_index
# To list the words sorted by how often they occur (most frequent first):
# sorted(tok.word_counts.items(), key=lambda x:x[1], reverse=True)

In [6]:
# Step 2: using the dictionary built above, turn every word into its number.
# !!! Words outside the 2000-word vocabulary are dropped.
x_train_num, x_test_num = (
    tok.texts_to_sequences(split["content"]) for split in (train, test)
)
pd.DataFrame(x_train_num)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,309,6,3,1069,209,9,30,1,169.0,55.0,...,,,,,,,,,,
1,39,14,739,44,74,32,1829,15,150.0,18.0,...,,,,,,,,,,
2,526,117,113,31,1957,115,902,758,10.0,25.0,...,,,,,,,,,,
3,11,6,711,1,88,19,1,249,91.0,9.0,...,,,,,,,,,,
4,11,6,21,1,797,19,9,13,73.0,326.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,946,1,127,4,1,17,10,417,9.0,13.0,...,,,,,,,,,,
24996,11,6,1,240,4,17,12,58,1496.0,10.0,...,,,,,,,,,,
24997,10,216,233,311,30,1,19,1411,2.0,9.0,...,,,,,,,,,,
24998,46,105,12,22,1259,53,15,3,468.0,43.0,...,,,,,,,,,,


In [7]:
# (optional) Invert the word -> number dictionary so a number can be looked up as a word.
reverse_index = dict(zip(tok.word_index.values(), tok.word_index.keys()))
reverse_index[6]

'is'

In [8]:
from keras.preprocessing.sequence import pad_sequences

# Step 3: force every review to exactly 200 tokens.
# The padding/truncating values below are the library defaults, spelled out:
#   truncating="pre": a review longer than 200 tokens keeps only its last 200
#   padding="pre":    a shorter review is filled with 0 at the front
pad_options = dict(maxlen=200, padding="pre", truncating="pre")
x_train_num_pad = pad_sequences(x_train_num, **pad_options)
x_test_num_pad = pad_sequences(x_test_num, **pad_options)
pd.DataFrame(x_train_num_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0,0,0,0,0,0,0,0,...,101,12,309,6,227,48,3,12,9,215
1,1,1983,15,501,206,1,45,26,67,78,...,7,39,276,11,19,77,22,5,335,405
2,0,0,0,0,0,0,0,0,0,0,...,125,254,55,10,64,9,60,6,176,396
3,0,0,0,0,0,0,0,0,0,0,...,3,164,2,561,21,35,73,14,3,482
4,0,0,0,0,0,0,0,0,0,0,...,8,1,17,18,2,196,253,65,528,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,156,18,10,112,194,41,9,769,3,5,...,845,18,9,13,250,5,103,3,411,76
24996,0,0,0,0,0,0,0,0,0,0,...,302,103,3,539,507,41,1568,37,4,1
24997,806,11,17,43,58,1172,900,1227,5,1,...,54,10,40,423,76,80,11,17,96,75
24998,0,46,105,12,22,1259,53,15,3,468,...,18,1138,278,2,131,105,8,260,1195,794


In [44]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten, Dense, Dropout
# Baseline classifier: embed each token, flatten the whole sequence, then a small dense head.
model = Sequential([
    Embedding(2000, 32, input_length=200),  # 200 token ids -> 200 vectors of size 32
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.25),
    Dense(1, activation="sigmoid"),         # single output in (0, 1)
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 32)           64000     
_________________________________________________________________
flatten_5 (Flatten)          (None, 6400)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 256)               1638656   
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 257       
Total params: 1,702,913
Trainable params: 1,702,913
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is reported as well.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [22]:
# Targets: the 0/1 sentiment column of each split.
y_train, y_test = train["sentiment"], test["sentiment"]

In [23]:
# Keras takes the validation set from the LAST 10% of the rows it is given,
# before any shuffling. The training frame lists every positive review first
# and every negative review after, so without this permutation the validation
# set would contain negative reviews only (and the remaining training rows
# would be class-imbalanced). The fixed seed keeps the split repeatable.
import numpy as np
shuffled = np.random.RandomState(0).permutation(len(y_train))
model.fit(x_train_num_pad[shuffled], np.asarray(y_train)[shuffled],
          batch_size=200,
          epochs=1,
          validation_split=0.1,
          verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/1
 - 4s - loss: 0.5033 - acc: 0.7352 - val_loss: 0.3182 - val_acc: 0.8624


<keras.callbacks.History at 0x1eace00c7c8>

In [24]:
# Score the held-out test split; the result is [loss, accuracy] as configured in compile().
model.evaluate(x=x_test_num_pad, y=y_test)



[0.321740517244339, 0.85964]

In [35]:
# Pick one review the model got wrong and look at it.
import numpy as np
# predict_classes() is deprecated and removed from Sequential in later Keras
# releases; for a single sigmoid output it is the same as thresholding
# predict() at 0.5.
pre = (model.predict(x_test_num_pad) > 0.5).astype("int32").reshape(-1)
y_test_np = np.array(y_test).reshape(-1)
noneq = pre != y_test_np          # True where prediction and answer differ
idx = np.nonzero(noneq)[0]        # row positions of the misclassified reviews
first = idx[0]
s = ["負面", "正面"]               # display text for label 0 and label 1
print("原本情緒:", s[y_test_np[first]])
print("預測情緒:", s[pre[first]])
# `first` is a row position, so index positionally rather than by label.
print("原文:", test["content"].iloc[first])


原本情緒: 正面
預測情緒: 負面
原文: Even if you're a fan of Jean Rollin's idiosyncratic body of work, you will be caught off guard by this exceptional foray into science fiction territory. For once, there's not a single diaphanously gowned vampire girl in sight ! True to tradition, the budget proved way too tight to realize the director's vision entirely. Yet this is largely compensated by his obvious love of genre cinema, dedication to his craft and sheer ingenuity. Jean-Claude Couty's atmospheric cinematography makes the most of the foreboding locations and Philippe Bréjean (a/k/a "Gary Sandeur") contributes a startling soundtrack that fortunately doesn't resemble any of the sappy stuff he composed for hardcore.<br /><br />Shot in and around a Paris office block before and after working hours, the film was largely cast with porn regulars Rollin was already quite familiar with from his "Michel Gentil" cash-gathering XXX efforts, most notably French f*ck film royalty Brigitte Lahaie in the demanding

In [36]:
from sklearn.metrics import confusion_matrix
# Confusion matrix: rows are the true labels, columns are the predicted labels.
s = ["負面", "正面"]
row = ["{}(答案)".format(label) for label in s]
col = ["{}(預測)".format(label) for label in s]
pd.DataFrame(confusion_matrix(y_test_np, pre), index=row, columns=col)

Unnamed: 0,負面(預測),正面(預測)
負面(答案),10654,1846
正面(答案),1663,10837


In [79]:
# Pulling out the by-product (the trained embedding) is something most language models do.
from keras.models import Sequential
# Copy the trained weights of the first layer into a stand-alone, one-token embedding model.
w = model.layers[0].get_weights()
embed = Sequential([Embedding(2000, 32, input_length=1)])
embed.layers[0].set_weights(w)
embed.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1, 32)             64000     
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


In [88]:
import random
# Embedding(2000, ...) only has rows 0..1999, and 0 is the padding value, so
# draw from 1..1999. random.randint(1, 2000) is inclusive on both ends and
# could return 2000, which is outside the embedding table.
c = random.randrange(1, 2000)
print("詞:", reverse_index[c])
print(embed.predict([c]))

詞: band
[[[-0.03678256  0.03467375  0.0136328   0.03134776  0.02761905
    0.02330947 -0.04993657 -0.02100598 -0.03225459  0.00487366
    0.01979636 -0.03665241  0.03823897  0.03968673 -0.00142444
   -0.04762316 -0.0476406   0.04715921  0.01021596 -0.04522716
    0.03873352 -0.04564768  0.00658088 -0.03555476  0.00324341
    0.02172791  0.0449718   0.02711388  0.01002748  0.03876592
   -0.03908352  0.04889998]]]


In [94]:
# What if a sequence (time-dependent) model is used instead?
# For sentiment analysis, using an RNN or not actually makes little difference.
from keras.layers import SimpleRNN
model = Sequential([
    Embedding(2000, 32, input_length=200),
    SimpleRNN(16),                          # replaces Flatten: reads the 200 steps in order
    Dense(256, activation="relu"),
    Dropout(0.25),
    Dense(1, activation="sigmoid"),
])
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 200, 32)           64000     
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_12 (Dense)             (None, 256)               4352      
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


In [95]:
# Same training setup as the dense model: binary cross-entropy, Adam, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [97]:
# Keras takes the validation set from the LAST 10% of the rows it is given,
# before any shuffling. The training frame lists every positive review first
# and every negative review after, so without this permutation the validation
# set would contain negative reviews only (and the remaining training rows
# would be class-imbalanced). The fixed seed keeps the split repeatable.
import numpy as np
shuffled = np.random.RandomState(0).permutation(len(y_train))
model.fit(x_train_num_pad[shuffled], np.asarray(y_train)[shuffled],
          batch_size=200,
          epochs=1,
          validation_split=0.1,
          verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/1
 - 3s - loss: 0.3430 - acc: 0.8581 - val_loss: 0.3881 - val_acc: 0.8228


<keras.callbacks.History at 0x1ead57bfd48>

In [98]:
# Score the RNN model on the held-out test split; the result is [loss, accuracy].
model.evaluate(x=x_test_num_pad, y=y_test)



[0.362357794675827, 0.8474]