In [46]:
from nltk.corpus import stopwords
import string 
import re
from os import listdir
from collections import Counter
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

In [19]:
def load_doc(filename):
    """Read a text file and return its whole contents as a single string.

    Args:
        filename: Path of the file to read. It is opened in text mode ('r')
            with the platform's default encoding.

    Returns:
        The complete file contents as a str.

    Raises:
        OSError: If the file cannot be opened or read (for example,
            FileNotFoundError when the path does not exist).
    """
    # The context manager closes the handle even when read() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [20]:
def clean_doc(doc, vocab):
    """Reduce a document to the in-vocabulary words, joined by single spaces.

    The text is split on whitespace, every character found in
    string.punctuation is removed from each piece, and only the pieces that
    are members of `vocab` are kept (the comparison is case-sensitive).

    Args:
        doc: Raw document text.
        vocab: Container of allowed words; only `in` membership is used.

    Returns:
        A str containing the surviving words separated by one space each.
    """
    strip_punctuation = re.compile('[%s]' % re.escape(string.punctuation)).sub
    kept_words = []
    for raw_word in doc.split():
        word = strip_punctuation('', raw_word)
        if word in vocab:
            kept_words.append(word)
    return ' '.join(kept_words)

In [21]:
def process_docs(directory, vocab, is_train):
    """Load and clean the files of one directory that belong to one split.

    Files whose name starts with 'cv9' form the held-out split: they are
    skipped when `is_train` is truthy and are the only files kept when it is
    falsy.

    Args:
        directory: Directory whose entries are listed with os.listdir.
        vocab: Vocabulary forwarded to clean_doc.
        is_train: Truthy to select the training files, falsy for the
            held-out files.

    Returns:
        A list with one cleaned string per selected file, in listdir order.
    """
    documents = []
    for filename in listdir(directory):
        held_out = filename.startswith('cv9')
        # Training wants everything except the held-out files; testing wants
        # only the held-out files.
        wanted = (not held_out) if is_train else held_out
        if wanted:
            raw_text = load_doc(directory + '/' + filename)
            documents.append(clean_doc(raw_text, vocab))
    return documents

In [23]:
def load_clean_dataset(vocab, is_train):
    """Load the cleaned negative and positive reviews for one split.

    Args:
        vocab: Vocabulary forwarded to process_docs.
        is_train: Split selector forwarded to process_docs.

    Returns:
        A tuple (docs, labels): `docs` is the list of negative documents
        followed by the positive ones, and `labels` is a numpy array holding
        0 for each negative document and 1 for each positive document, in the
        same order.
    """
    negative_docs = process_docs('datasets/review_polarity/txt_sentoken/neg', vocab, is_train)
    positive_docs = process_docs('datasets/review_polarity/txt_sentoken/pos', vocab, is_train)
    labels = array([0] * len(negative_docs) + [1] * len(positive_docs))
    return negative_docs + positive_docs, labels

In [29]:
def create_tokenizer(lines):
    """Create a Keras Tokenizer with default settings and fit it on `lines`.

    Args:
        lines: Texts passed to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [35]:
def encode_docs(tokenizer, max_length, docs):
    """Convert documents to integer sequences padded to a common length.

    Args:
        tokenizer: Object providing texts_to_sequences (a fitted Tokenizer).
        max_length: Value passed to pad_sequences as `maxlen`.
        docs: Documents to encode.

    Returns:
        The result of pad_sequences, with padding applied at the end of each
        sequence ('post').
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

In [39]:
def define_model(vocab_size, max_length):
    """Build, compile and summarise the CNN used for sentiment classification.

    The stack is: 100-dimensional Embedding -> Conv1D (32 filters, kernel
    size 8, ReLU) -> MaxPooling1D (pool size 2) -> Flatten -> Dense(10, ReLU)
    -> Dense(1, sigmoid). It is compiled with binary cross-entropy loss, the
    Adam optimizer and the accuracy metric, and its summary is printed.

    Args:
        vocab_size: Input dimension of the Embedding layer.
        max_length: Input sequence length given to the Embedding layer.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = (
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(filters=32, kernel_size=8, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    )
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    # Optional architecture diagram (disabled):
    #     plot_model(model, to_file = 'datasets/model.png', show_shapes = True)
    return model

In [30]:
# Read the vocabulary file and keep its whitespace-separated entries as a set,
# which is what clean_doc uses for its membership test.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

In [32]:
# Load the training split and fit the tokenizer on it.
train_docs, ytrain = load_clean_dataset(vocab, is_train=True)
tokenizer = create_tokenizer(train_docs)

# One more than the number of known words: the Keras Tokenizer never assigns
# index 0 to a word, so the embedding needs a slot for it as well.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 25901


In [33]:
# Length (in whitespace-separated tokens) of the longest training document.
max_length = max(len(doc.split()) for doc in train_docs)
print('Maximum length: %d' % max_length)

Maximum length: 2200


In [41]:
# Encode the training documents as padded integer sequences, then build,
# train and save the CNN.
Xtrain = encode_docs(tokenizer, max_length, train_docs)
model = define_model(vocab_size, max_length)
# verbose = 2 prints one summary line per epoch.
model.fit(Xtrain, ytrain, epochs = 10, verbose = 2)
# NOTE(review): the '.h2' extension looks like a typo for '.h5'; it is left
# unchanged because the evaluation cell loads the model from this same path.
model.save('datasets/model_sentiment_cnn.h2')

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 2200, 100)         2590100   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 2193, 32)          25632     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1096, 32)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 35072)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                350730    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 2,966,473
Trainable params: 2,966,473
Non-trainable params: 0
____________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
 - 13s - loss: 0.6916 - accuracy: 0.5300
Epoch 2/10
 - 14s - loss: 0.5704 - accuracy: 0.7033
Epoch 3/10
 - 14s - loss: 0.1665 - accuracy: 0.9439
Epoch 4/10
 - 14s - loss: 0.0149 - accuracy: 0.9994
Epoch 5/10
 - 14s - loss: 0.0035 - accuracy: 1.0000
Epoch 6/10
 - 14s - loss: 0.0019 - accuracy: 1.0000
Epoch 7/10
 - 14s - loss: 0.0013 - accuracy: 1.0000
Epoch 8/10
 - 14s - loss: 9.1366e-04 - accuracy: 1.0000
Epoch 9/10
 - 14s - loss: 6.8575e-04 - accuracy: 1.0000
Epoch 10/10
 - 14s - loss: 5.3036e-04 - accuracy: 1.0000


In [42]:
#Evaluate Model

In [48]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    """Classify one review as 'POSITIVE' or 'NEGATIVE'.

    The review is cleaned with clean_doc, encoded with encode_docs, and fed
    to `model.predict`; element [0, 0] of the prediction is treated as the
    score of the positive class.

    Args:
        review: Raw review text.
        vocab: Vocabulary forwarded to clean_doc.
        tokenizer: Tokenizer forwarded to encode_docs.
        max_length: Padding length forwarded to encode_docs.
        model: Object whose predict() returns a 2-D array of scores.

    Returns:
        A tuple (score, label). When the positive score rounds to 0 the label
        is 'NEGATIVE' and the score is (1 - positive score); otherwise the
        label is 'POSITIVE' and the score is the positive score itself.
    """
    cleaned = clean_doc(review, vocab)
    encoded = encode_docs(tokenizer, max_length, [cleaned])
    percent_pos = model.predict(encoded, verbose=0)[0, 0]
    if round(percent_pos) != 0:
        return percent_pos, 'POSITIVE'
    return (1 - percent_pos), 'NEGATIVE'

In [54]:
# Reload the vocabulary as a set of whitespace-separated entries.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# Training split first, then the held-out ('cv9*') test split.
train_docs, ytrain = load_clean_dataset(vocab, is_train=True)
test_docs, ytest = load_clean_dataset(vocab, is_train=False)

In [55]:
# train_docs / ytrain were loaded by the preceding cell with the same
# arguments, so the corpus is not read and cleaned from disk a second time
# here; the tokenizer is simply fitted on the already-loaded documents.
tokenizer = create_tokenizer(train_docs)

# One more than the number of known words: the Keras Tokenizer never assigns
# index 0 to a word, so the embedding needs a slot for it as well.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 25901


In [56]:
# Length (in whitespace-separated tokens) of the longest training document.
max_length = max(len(doc.split()) for doc in train_docs)
print('Maximum length: %d' % max_length)

Maximum length: 2200


In [57]:
# Encode both splits with the same tokenizer and padding length
# (training split first, then the test split).
Xtrain, Xtest = (
    encode_docs(tokenizer, max_length, split_docs)
    for split_docs in (train_docs, test_docs)
)

In [58]:
# Reload the trained network from the path it was saved to.
model = load_model('datasets/model_sentiment_cnn.h2')

# evaluate on training dataset
_, acc = model.evaluate(Xtrain, ytrain, verbose=0)
print('Train Accuracy: %f' % (acc * 100))

# evaluate on test dataset
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc * 100))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train Accuracy: 100.000000
Test Accuracy: 86.500001


In [59]:
# Ad-hoc check on a review that should read as positive.
text = 'Everyone will enjoy this film. I love it. recommend!'
percent, sentiment = predict_sentiment(
    text, vocab, tokenizer, max_length, model
)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent * 100))

Review: [Everyone will enjoy this film. I love it. recommend!]
Sentiment: POSITIVE (54.738%)


In [60]:
# Ad-hoc check on a review that should read as negative.
text = 'This is a bad movie. Do not watch it. It sucks.'
percent, sentiment = predict_sentiment(
    text, vocab, tokenizer, max_length, model
)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent * 100))

Review: [This is a bad movie. Do not watch it. It sucks.]
Sentiment: NEGATIVE (53.733%)
