# Preprocessing

In [3]:
import urllib.request
import os
import tarfile

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
HOUSING_PATH = "data"
filepath = "data/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    if not os.path.isdir(HOUSING_PATH):
        os.makedirs(HOUSING_PATH)
    tgz_path = os.path.join(HOUSING_PATH, "aclImdb_v1.tar.gz")
    result = urllib.request.urlretrieve(url, tgz_path)
    print("downloaded:", result)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(HOUSING_PATH)
    housing_tgz.close()

downloaded: ('data/aclImdb_v1.tar.gz', <http.client.HTTPMessage object at 0x7f91cfee6400>)


NameError: name 'housing_path' is not defined

## Remove HTML tag

In [4]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [5]:
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]

    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    
    print('read', filetype, 'files:', len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    all_texts = []
    
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags("".join(file_input.readlines()))]
    
    return all_labels, all_texts

## Load Data

In [6]:
y_train, train_text = read_files("train")

read train files: 25000


In [7]:
y_test, test_text = read_files("test")

read test files: 25000


In [8]:
train_text[0]

'Zentropa has much in common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn\'t really understand, and whose naivety is all the more striking in contrast with the natives.But I\'d have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.'

## Token

In [14]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [15]:
print(train_text[0])

Zentropa has much in common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn't really understand, and whose naivety is all the more striking in contrast with the natives.But I'd have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.


In [16]:
print(x_train_seq[0])

[43, 72, 7, 1135, 15, 1, 835, 128, 155, 1353, 36, 18, 266, 787, 1, 4, 36, 46, 6, 72, 366, 153, 46, 6, 31, 1350, 294, 33, 210, 571, 15, 3, 251, 26, 148, 62, 387, 2, 619, 6, 28, 1, 49, 7, 15, 1, 17, 470, 24, 5, 131, 11, 1, 835, 128, 43, 3, 49, 69, 763, 6, 3, 223, 7, 10, 1157, 377, 10, 6, 8, 6, 1346, 13, 3, 920, 1719, 2, 227, 8, 95, 58, 1, 958, 10, 16, 6, 1353, 7, 49, 70, 27, 277, 27, 111, 1078, 1, 17, 1767, 2]


In [17]:
print(token.word_index)



In [18]:
# get rid of the text above 100
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)

## Embding

In [19]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

In [20]:
model = Sequential()

In [21]:
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.2))

## Add Flatten

In [22]:
model.add(Flatten())

## Add Hidden Layer

In [23]:
model.add(Dense(units=256,
                activation='relu'))
model.add(Dropout(0.35))

## Add Output Layer

In [24]:
model.add(Dense(units=1,
                activation='sigmoid'))

In [25]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               819456    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


## Training Model

In [26]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [27]:
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 5s - loss: 0.4773 - acc: 0.7570 - val_loss: 0.4757 - val_acc: 0.7818
Epoch 2/10
 - 4s - loss: 0.2683 - acc: 0.8899 - val_loss: 0.4949 - val_acc: 0.7848
Epoch 3/10
 - 4s - loss: 0.1579 - acc: 0.9427 - val_loss: 0.6488 - val_acc: 0.7598
Epoch 4/10
 - 4s - loss: 0.0852 - acc: 0.9697 - val_loss: 0.7490 - val_acc: 0.7778
Epoch 5/10
 - 4s - loss: 0.0479 - acc: 0.9838 - val_loss: 0.8612 - val_acc: 0.7838
Epoch 6/10
 - 4s - loss: 0.0342 - acc: 0.9878 - val_loss: 1.1089 - val_acc: 0.7594
Epoch 7/10
 - 4s - loss: 0.0273 - acc: 0.9905 - val_loss: 1.4843 - val_acc: 0.7202
Epoch 8/10
 - 4s - loss: 0.0296 - acc: 0.9893 - val_loss: 1.3079 - val_acc: 0.7540
Epoch 9/10
 - 4s - loss: 0.0255 - acc: 0.9912 - val_loss: 1.4423 - val_acc: 0.7418
Epoch 10/10
 - 4s - loss: 0.0234 - acc: 0.9911 - val_loss: 1.7880 - val_acc: 0.6924


## Validation

In [28]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.79944

In [29]:
predict = model.predict_classes(x_test)

In [30]:
predict[:10]

array([[1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]], dtype=int32)

In [31]:
predict_classes = predict.reshape(-1)
predict_classes[:10]

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

## Check Result

In [36]:
SentimentDict = {1:"正面的", 0:"負面的"}
def display_test_Sentiment(i):
    print(test_text[i])
    print('label真實 ： ', SentimentDict[y_test[i]],\
            '預測 ： ', SentimentDict[predict_classes[i]])

In [40]:
display_test_Sentiment(12502)

its a totally average film with a few semi-alright action sequences that make the plot seem a little better and remind the viewer of the classic van dam films. parts of the plot don't make sense and seem to be added in to use up time. the end plot is that of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the beginning. the end scene with the flask backs don't make sense as they are added in and seem to have little relevance to the history of van dam's character. not really worth watching again, bit disappointed in the end production, even though it is apparent it was shot on a low budget certain shots and sections in the film are of poor directed quality
label真實 ：  負面的 預測 ：  負面的


# RNN part

In [41]:
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test = sequence.pad_sequences(x_test_seq, maxlen=380)

In [51]:
from keras.layers.recurrent import SimpleRNN
model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=3800,
                    input_length=380))
model.add(Dropout(0.35))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation='sigmoid'))

In [52]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_7 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_7 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 257       
Total params: 126,993
Trainable params: 126,993
Non-trainable params: 0
_________________________________________________________________


In [53]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
scores = model.evaluate(x_test, y_test, verbose = 1)
scores[1]



0.49556

In [47]:
from keras.layers.recurrent import LSTM
model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=3800,
                    input_length=380))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))

In [48]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_5 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_5 (Dense)              (None, 256)               8448      
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 138,625
Trainable params: 138,625
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
scores = model.evaluate(x_test, y_test, verbose = 1)
scores[1]



0.50948