In [112]:
from nltk.corpus import stopwords
import string
import re
from collections import Counter

In [113]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [114]:
from os import listdir

def process_docs(directory):
    for filename in listdir(directory):
        if not filename.endswith(".txt"):
            next
        path = directory + '/' + filename
        doc = load_doc(path)
        print('Loaded %s' % filename)

In [115]:
def clean_doc(text):
    tokens = text.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [116]:
def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)

In [117]:
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [118]:
def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [119]:
def process_docs(directory, vocab, is_train):
    lines = list()
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and filename.startswith(".cv9"):
            continue
        path = directory + '/' + filename
        line = doc_to_line(path, vocab)
        lines.append(line)
    return line

In [120]:
negative_lines = process_docs('datasets/review_polarity/txt_sentoken/neg', vocab)
save_list(negative_lines, 'datasets/negative.txt')
positive_lines = process_docs('datasets/review_polarity/txt_sentoken/pos', vocab)
save_list(negative_lines, 'datasets/positive.txt')

TypeError: process_docs() missing 1 required positional argument: 'is_train'

In [121]:
def load_clean_dataset(vocab, is_train):
    neg = process_docs('datasets/review_polarity/txt_sentoken/neg', vocab, is_train)
    pos = process_docs('datasets/review_polarity/txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = [0 for _ in range(len(neg))]+ [1 for _ in range(len(pos))]
    return docs, labels

In [122]:
vocab_filename = 'datasets/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [123]:
docs, labels = load_clean_dataset(vocab)
print(len(docs), len(labels))

TypeError: load_clean_dataset() missing 1 required positional argument: 'is_train'

In [124]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [125]:
vocab_filename = 'datasets/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

In [126]:
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

In [127]:
tokenizer = create_tokenizer(train_docs)

In [128]:
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')

In [129]:
print(Xtrain.shape, Xtest.shape)

(3062, 27) (5255, 27)


In [130]:
n_words = Xtest.shape[1]

In [131]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense

In [132]:
def define_model(n_words):
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    model.summary()
    return model

In [133]:
model = define_model(n_words)
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Model: "sequential_124"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_246 (Dense)            (None, 50)                1400      
_________________________________________________________________
dense_247 (Dense)            (None, 1)                 51        
Total params: 1,451
Trainable params: 1,451
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
 - 0s - loss: 0.6823 - accuracy: 0.5810
Epoch 2/10
 - 0s - loss: 0.6797 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6786 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6782 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6778 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6776 - accuracy: 0.5846
Epoch 7/10
 - 0s - loss: 0.6773 - accuracy: 0.5846
Epoch 8/10
 - 0s - loss: 0.6771 - accuracy: 0.5846
Epoch 9/10
 - 0s - loss: 0.6769 - accuracy: 0.5859
Epoch 10/10
 - 0s - loss: 0.6768 - accuracy: 0.5859


<keras.callbacks.callbacks.History at 0x1ec18561cf8>

In [134]:
loss, acc = model.evaluate(Xtest, ytest, verbose = 0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 65.366316


In [135]:
def prepare_data(train_docs, test_docs, mode):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode= mode)
    Xtest = tokenizer.texts_to_matrix(test_docs, mode= mode)
    return Xtrain, Xtest

In [136]:
def evaluate_model(Xtrain, ytrain, Xtest, ytest):
    scores = list()
    n_repeats = 30
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
        model = Sequential()
        model.add(Dense(50, input_shape=(n_words,), activation='relu'))
        model.add(Dense(1, activation = 'sigmoid'))
        model.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
        model.fit(Xtrain, ytrain, epochs=10, verbose=2)
        loss, acc = model.evaluate(Xtest, ytest, verbose = 0)
        scores.append(acc)
        print('Test Accuracy: %s' % (i+1), acc)
    return scores
        

In [137]:
vocab_filename = 'datasets/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

In [138]:
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

In [139]:
from pandas import DataFrame
modes = ['binary','count','tfidf','freq']
results = DataFrame()
for mode in modes:
    Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
    results[mode] = evaluate_model(Xtrain, ytrain, Xtest, ytest)
print(results.describe())

Epoch 1/10
 - 0s - loss: 0.6826 - accuracy: 0.5800
Epoch 2/10
 - 0s - loss: 0.6796 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6786 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6780 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6775 - accuracy: 0.5846
Epoch 6/10
 - 0s - loss: 0.6776 - accuracy: 0.5852
Epoch 7/10
 - 0s - loss: 0.6772 - accuracy: 0.5852
Epoch 8/10
 - 0s - loss: 0.6773 - accuracy: 0.5852
Epoch 9/10
 - 0s - loss: 0.6770 - accuracy: 0.5846
Epoch 10/10
 - 0s - loss: 0.6770 - accuracy: 0.5807
Test Accuracy: 1 0.653472900390625
Epoch 1/10
 - 0s - loss: 0.6816 - accuracy: 0.5826
Epoch 2/10
 - 0s - loss: 0.6786 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6781 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6774 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6773 - accuracy: 0.5846
Epoch 6/10
 - 0s - loss: 0.6771 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6768 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6769 - accuracy: 0.5836
Epoch 9/10
 - 0s - loss: 0.6769 - accuracy: 0.

Epoch 1/10
 - 0s - loss: 0.6861 - accuracy: 0.5679
Epoch 2/10
 - 0s - loss: 0.6794 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6784 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6778 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6775 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6771 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6771 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6769 - accuracy: 0.5833
Epoch 9/10
 - 0s - loss: 0.6771 - accuracy: 0.5846
Epoch 10/10
 - 0s - loss: 0.6766 - accuracy: 0.5846
Test Accuracy: 16 0.6441484093666077
Epoch 1/10
 - 0s - loss: 0.6847 - accuracy: 0.5732
Epoch 2/10
 - 0s - loss: 0.6799 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6786 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6779 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6775 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6772 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6770 - accuracy: 0.5843
Epoch 8/10
 - 0s - loss: 0.6769 - accuracy: 0.5849
Epoch 9/10
 - 0s - loss: 0.6768 - accuracy: 

Epoch 1/10
 - 0s - loss: 0.6931 - accuracy: 0.5235
Epoch 2/10
 - 0s - loss: 0.6806 - accuracy: 0.5843
Epoch 3/10
 - 0s - loss: 0.6789 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6780 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6775 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6772 - accuracy: 0.5846
Epoch 7/10
 - 0s - loss: 0.6770 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6769 - accuracy: 0.5836
Epoch 9/10
 - 0s - loss: 0.6765 - accuracy: 0.5816
Epoch 10/10
 - 0s - loss: 0.6766 - accuracy: 0.5826
Test Accuracy: 1 0.6485252380371094
Epoch 1/10
 - 0s - loss: 0.6843 - accuracy: 0.5692
Epoch 2/10
 - 0s - loss: 0.6802 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6791 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6784 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6780 - accuracy: 0.5843
Epoch 6/10
 - 0s - loss: 0.6775 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6774 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6770 - accuracy: 0.5843
Epoch 9/10
 - 0s - loss: 0.6773 - accuracy: 0

Epoch 1/10
 - 0s - loss: 0.6907 - accuracy: 0.5300
Epoch 2/10
 - 0s - loss: 0.6793 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6781 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6776 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6774 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6770 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6770 - accuracy: 0.5823
Epoch 8/10
 - 0s - loss: 0.6767 - accuracy: 0.5849
Epoch 9/10
 - 0s - loss: 0.6768 - accuracy: 0.5843
Epoch 10/10
 - 0s - loss: 0.6769 - accuracy: 0.5839
Test Accuracy: 16 0.655756413936615
Epoch 1/10
 - 0s - loss: 0.6837 - accuracy: 0.5767
Epoch 2/10
 - 0s - loss: 0.6796 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6786 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6778 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6775 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6772 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6771 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6769 - accuracy: 0.5849
Epoch 9/10
 - 0s - loss: 0.6769 - accuracy: 0

Epoch 1/10
 - 0s - loss: 0.6921 - accuracy: 0.5568
Epoch 2/10
 - 0s - loss: 0.6817 - accuracy: 0.5816
Epoch 3/10
 - 0s - loss: 0.6789 - accuracy: 0.5833
Epoch 4/10
 - 0s - loss: 0.6789 - accuracy: 0.5800
Epoch 5/10
 - 0s - loss: 0.6778 - accuracy: 0.5846
Epoch 6/10
 - 0s - loss: 0.6781 - accuracy: 0.5843
Epoch 7/10
 - 0s - loss: 0.6774 - accuracy: 0.5823
Epoch 8/10
 - 0s - loss: 0.6780 - accuracy: 0.5816
Epoch 9/10
 - 0s - loss: 0.6779 - accuracy: 0.5849
Epoch 10/10
 - 0s - loss: 0.6780 - accuracy: 0.5856
Test Accuracy: 1 0.6490960717201233
Epoch 1/10
 - 0s - loss: 0.6879 - accuracy: 0.5787
Epoch 2/10
 - 0s - loss: 0.6807 - accuracy: 0.5856
Epoch 3/10
 - 0s - loss: 0.6794 - accuracy: 0.5836
Epoch 4/10
 - 0s - loss: 0.6788 - accuracy: 0.5833
Epoch 5/10
 - 0s - loss: 0.6781 - accuracy: 0.5820
Epoch 6/10
 - 0s - loss: 0.6780 - accuracy: 0.5833
Epoch 7/10
 - 0s - loss: 0.6782 - accuracy: 0.5839
Epoch 8/10
 - 0s - loss: 0.6785 - accuracy: 0.5862
Epoch 9/10
 - 0s - loss: 0.6777 - accuracy: 0

Epoch 1/10
 - 0s - loss: 0.6997 - accuracy: 0.5431
Epoch 2/10
 - 0s - loss: 0.6822 - accuracy: 0.5735
Epoch 3/10
 - 0s - loss: 0.6796 - accuracy: 0.5797
Epoch 4/10
 - 0s - loss: 0.6790 - accuracy: 0.5807
Epoch 5/10
 - 0s - loss: 0.6781 - accuracy: 0.5846
Epoch 6/10
 - 0s - loss: 0.6784 - accuracy: 0.5833
Epoch 7/10
 - 0s - loss: 0.6779 - accuracy: 0.5872
Epoch 8/10
 - 0s - loss: 0.6782 - accuracy: 0.5784
Epoch 9/10
 - 0s - loss: 0.6778 - accuracy: 0.5849
Epoch 10/10
 - 0s - loss: 0.6775 - accuracy: 0.5826
Test Accuracy: 16 0.639200747013092
Epoch 1/10
 - 0s - loss: 0.7077 - accuracy: 0.5222
Epoch 2/10
 - 0s - loss: 0.6819 - accuracy: 0.5833
Epoch 3/10
 - 0s - loss: 0.6795 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6783 - accuracy: 0.5833
Epoch 5/10
 - 0s - loss: 0.6782 - accuracy: 0.5843
Epoch 6/10
 - 0s - loss: 0.6777 - accuracy: 0.5820
Epoch 7/10
 - 0s - loss: 0.6779 - accuracy: 0.5872
Epoch 8/10
 - 0s - loss: 0.6777 - accuracy: 0.5830
Epoch 9/10
 - 0s - loss: 0.6781 - accuracy: 0

Epoch 1/10
 - 0s - loss: 0.6811 - accuracy: 0.5836
Epoch 2/10
 - 0s - loss: 0.6796 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6787 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6785 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6780 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6774 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6774 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6772 - accuracy: 0.5849
Epoch 9/10
 - 0s - loss: 0.6772 - accuracy: 0.5843
Epoch 10/10
 - 0s - loss: 0.6768 - accuracy: 0.5849
Test Accuracy: 1 0.655756413936615
Epoch 1/10
 - 0s - loss: 0.6856 - accuracy: 0.5607
Epoch 2/10
 - 0s - loss: 0.6795 - accuracy: 0.5833
Epoch 3/10
 - 0s - loss: 0.6782 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6778 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6775 - accuracy: 0.5862
Epoch 6/10
 - 0s - loss: 0.6773 - accuracy: 0.5843
Epoch 7/10
 - 0s - loss: 0.6772 - accuracy: 0.5820
Epoch 8/10
 - 0s - loss: 0.6770 - accuracy: 0.5852
Epoch 9/10
 - 0s - loss: 0.6770 - accuracy: 0.

Epoch 1/10
 - 0s - loss: 0.6824 - accuracy: 0.5790
Epoch 2/10
 - 0s - loss: 0.6788 - accuracy: 0.5833
Epoch 3/10
 - 0s - loss: 0.6783 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6775 - accuracy: 0.5846
Epoch 5/10
 - 0s - loss: 0.6773 - accuracy: 0.5852
Epoch 6/10
 - 0s - loss: 0.6772 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6770 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6768 - accuracy: 0.5849
Epoch 9/10
 - 0s - loss: 0.6767 - accuracy: 0.5823
Epoch 10/10
 - 0s - loss: 0.6767 - accuracy: 0.5843
Test Accuracy: 16 0.655756413936615
Epoch 1/10
 - 0s - loss: 0.6836 - accuracy: 0.5745
Epoch 2/10
 - 0s - loss: 0.6789 - accuracy: 0.5849
Epoch 3/10
 - 0s - loss: 0.6783 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6777 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6776 - accuracy: 0.5852
Epoch 6/10
 - 0s - loss: 0.6774 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6770 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6767 - accuracy: 0.5836
Epoch 9/10
 - 0s - loss: 0.6770 - accuracy: 0

          binary      count      tfidf       freq
count  30.000000  30.000000  30.000000  30.000000
mean    0.652445   0.653479   0.646445   0.653473
std     0.003988   0.003031   0.005957   0.003897
min     0.643387   0.643958   0.638820   0.638820
25%     0.650666   0.653330   0.639201   0.653283
50%     0.653663   0.653663   0.649096   0.654710
75%     0.655756   0.655756   0.649096   0.655756
max     0.656708   0.656518   0.654234   0.656137


In [140]:
def predict_sentiment(review, vocab, tokenizer, model):
    tokens = clean_doc(review)
    tokens = [w for w in tokens if w in vocab]
    line = ' '.join(tokens)
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    yhat = model.predict(encoded, verbose = 0)
    percent_pos = yhat[0,0]
    if round(percent_pos) == 0:
        return (1-percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'

In [141]:
model = define_model(n_words)
model.fit(Xtrain, ytrain, epochs= 10, verbose = 2)

Model: "sequential_245"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_488 (Dense)            (None, 50)                1400      
_________________________________________________________________
dense_489 (Dense)            (None, 1)                 51        
Total params: 1,451
Trainable params: 1,451
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
 - 0s - loss: 0.6857 - accuracy: 0.5591
Epoch 2/10
 - 0s - loss: 0.6789 - accuracy: 0.5846
Epoch 3/10
 - 0s - loss: 0.6783 - accuracy: 0.5849
Epoch 4/10
 - 0s - loss: 0.6778 - accuracy: 0.5849
Epoch 5/10
 - 0s - loss: 0.6774 - accuracy: 0.5849
Epoch 6/10
 - 0s - loss: 0.6772 - accuracy: 0.5849
Epoch 7/10
 - 0s - loss: 0.6771 - accuracy: 0.5849
Epoch 8/10
 - 0s - loss: 0.6772 - accuracy: 0.5839
Epoch 9/10
 - 0s - loss: 0.6769 - accuracy: 0.5849
Epoch 10/10
 - 0s - loss: 0.6769 - accuracy: 0.5846


<keras.callbacks.callbacks.History at 0x1ec1b0a1630>

In [142]:
text = 'Best moview ever! It was great, I recoomend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)

In [143]:
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Best moview ever! It was great, I recoomend it.]
Sentiment: POSITIVE (57.154%)
