In [1]:
import os
import tarfile
import urllib.request

import tensorflow as tf
import numpy as np

import re
import string
from random import randint

In [2]:
# Fetch and unpack the Large Movie Review (IMDb) dataset into ./data.
# Each step is skipped when its result already exists, so the cell is safe to re-run.
# NOTE: the account name in the URL is "amaas"; the previous "ammas" spelling
# does not match the dataset's public location.
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "data/aclImdb_v1.tar.gz"

os.makedirs("data", exist_ok=True)

if not os.path.isfile(filepath):
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated archive that would pass the isfile() check above.
    partial_path = filepath + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, filepath)

if not os.path.exists("data/aclImdb"):
    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall() trusts member paths inside the archive; only
    # use it on archives from a trusted source.
    with tarfile.open(filepath, "r:gz") as archive:
        archive.extractall("data/")

In [3]:
def remove_tags(text):
    """Return `text` with every `<...>` markup tag removed.

    A tag is a '<', followed by one or more characters that are not '>',
    followed by '>'. Text without such a span is returned unchanged.
    """
    return re.sub(r'<[^>]+>', '', text)

In [6]:
def read_files(filetype, base_dir="data/aclImdb"):
    """Load the reviews of one dataset split together with one-hot labels.

    Parameters
    ----------
    filetype : str
        Name of the split sub-directory under `base_dir` (the notebook uses
        "train" and "test").
    base_dir : str, optional
        Root directory of the extracted dataset.

    Returns
    -------
    (all_labels, all_texts) : tuple of lists
        `all_texts[i]` is the content of one review file with markup tags
        stripped by `remove_tags`; `all_labels[i]` is `[1, 0]` for a file from
        the "pos" folder and `[0, 1]` for a file from the "neg" folder.
        All positive reviews come first, followed by all negative reviews.

    Raises
    ------
    OSError
        If a split folder or a review file cannot be read.
    """
    all_labels = []
    all_texts = []

    for sentiment, label in (("pos", [1, 0]), ("neg", [0, 1])):
        folder = os.path.join(base_dir, filetype, sentiment)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order reproducible across runs and machines.
        for name in sorted(os.listdir(folder)):
            with open(os.path.join(folder, name), encoding='utf8') as file_input:
                all_texts.append(remove_tags(file_input.read()))
            # A fresh list per sample, so labels never alias each other.
            all_labels.append(list(label))

    return all_labels, all_texts

In [24]:
# Read both dataset splits: one-hot labels plus tag-stripped review texts.
train_labels, train_texts = read_files(filetype="train")
test_labels, test_texts = read_files(filetype="test")

In [10]:
import keras

In [25]:
# Build the word index from the training texts only; num_words=4000 caps
# the vocabulary used when texts are converted to sequences.
token = keras.preprocessing.text.Tokenizer(num_words=4000)
token.fit_on_texts(train_texts)

# Convert every review into a sequence of integer word indices.
train_sequences, test_sequences = (
    token.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

# Force every sequence to exactly 400 entries: longer ones are cut at the
# end, shorter ones are padded at the end.
x_train, x_test = (
    keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=400, padding='post', truncating='post'
    )
    for sequences in (train_sequences, test_sequences)
)

# Labels as arrays, matching the array inputs above.
y_train, y_test = np.array(train_labels), np.array(test_labels)

In [12]:
# Network definition: embedding -> flatten -> dense(256, relu) -> dropout(0.3)
# -> dense(2, softmax). input_dim=4000 and input_length=400 mirror the
# vocabulary cap and padded sequence length used during preprocessing.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=4000, output_dim=32, input_length=400),
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(2, activation='softmax'),
])

In [26]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# read_files() returns every positive review before every negative one, and
# Keras carves the validation_split off the END of the arrays before it does
# any shuffling. Without the permutation below the validation set would hold
# only negative reviews and the training portion would be class-imbalanced.
# A fixed seed keeps the train/validation split reproducible.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
history = model.fit(
    x_train[shuffle_idx],
    y_train[shuffle_idx],
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Score the trained model on the held-out test split; with the compile
# settings above, evaluate() yields the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(x=x_test, y=y_test, verbose=1)



In [28]:
# Classify one hand-written review, using the same tokenizer and the same
# padding/truncation settings that were applied to the training data.
review_text = "good nice enough"
input_seq = token.texts_to_sequences([review_text])
pad_input_seq = keras.preprocessing.sequence.pad_sequences(
    input_seq,
    maxlen=400,
    padding='post',
    truncating='post',
)
pred = model.predict(x=pad_input_seq)

In [29]:
# Softmax scores for the review above. Column order follows the one-hot labels
# built in read_files: index 0 = positive ([1,0]), index 1 = negative ([0,1]).
pred

array([[0.96413857, 0.03586143]], dtype=float32)