In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from string import punctuation
import string
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

In [6]:
#loading doc to memory
def load_doc(filename):
    file = open(filename, 'r') #read only
    text = file.read() # reading all text
    file.close() #close the file
    return text

In [7]:
def clean_doc(doc):
    #split into tokens by white space
    tokens = doc.split()
    #remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    #removing othen non alphabetic tokens
    tokens = [w for w in tokens if w.isalpha()]
    #filter out stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    #filter out short tokens
    tokens = [w for w in tokens if len(w)>1]
    #the list of token has 
    return tokens

# load the document
filename = 'txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [8]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)

In [9]:
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_trian and filename.startswith('cv9'):
            continue
        if not is_trian and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)

In [10]:
#define vocab
vocab = Counter()
#add all docs to vocab
process_docs('txt_sentoken/neg', vocab, True)
process_docs('txt_sentoken/pos', vocab, True)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('bad', 1248), ('could', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [11]:
'''We can step through the vocabulary and remove all words that have a
low occurrence, such as only being used once or twice in all reviews.'''
min_occurance = 2
tokens = [k for k,c in vocab.items() if c>=min_occurance]
print(len(tokens))

25767


In [12]:
#vocab.txt can be saved and used later for review prediction prior to encode them for modelling
# save list to file
def save_list(lines, filename):
    #convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

In [13]:
#load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [14]:
#turn a doc into clean tokens based on words in vocab
def clean_vdoc(doc, vocab):
    token = doc.split()
    #remove punctuatuion from each token
    table = str.maketrans('', '', punctuation)
    token = [w.translate(table) for w in token]
    #filter out token not in vocab
    token = [w for w in token if w in vocab]
    token = ' '.join(token)
    return token

In [15]:
#load all docs in a directory
def process_vdocs(directory, vocab, is_trian):
    documents = list()
    #walk through all the files in the folder
    for filename in listdir(directory):
        if is_trian and filename.startswith('cv9'):
            continue
        if not is_trian and filename.startswith('cv9'):
            continue
        #create the full path of the file to open
        path = directory + '/' + filename
        #load the doc
        doc = load_doc(path)
        #clean doc
        tokens = clean_vdoc(doc, vocab)
        #add to the list
        documents.append(tokens)
    return documents

#load all training reviews
positive_docs = process_vdocs('txt_sentoken/pos', vocab, True)
negative_docs = process_vdocs('txt_sentoken/neg', vocab, True)
train_docs = negative_docs + positive_docs

In [16]:
from keras.preprocessing.text import Tokenizer
#create the tokenizer
tokenizer = Tokenizer()
#fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

In [17]:
#sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)

In [18]:
#pad sequences
import tensorflow as tf
seq = tf.keras.preprocessing.sequence
max_length = max([len(s.split()) for s in train_docs])
Xtrain = seq.pad_sequences(encoded_docs, maxlen = max_length, padding = 'post')

In [19]:
#define training labels
ytrain = np.array([0 for _ in range(900)] + [1 for _ in range(900)])

In [20]:
# load all test reviews
positive_docs = process_vdocs('txt_sentoken/pos', vocab, False)
negative_docs = process_vdocs('txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = seq.pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = np.array([0 for _ in range(900)] + [1 for _ in range(900)])

In [21]:
#We are now ready to define our neural network model. Embedding Layer

In [22]:
#define vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [23]:
'''We use a Convolutional Neural Network (CNN) as they have proven
to be successful at document classification problems.
A conservative CNN configuration is used with 32 filters (parallel fields for processing words)
and a kernel size of 8 with a rectified linear (‘relu’) activation function.
This is followed by a pooling layer that reduces the output of the convolutional layer
by half.

Next, the 2D output from the CNN part of the model is flattened to one long 2D
vector to represent the ‘features’ extracted by the CNN. The back-end of the
model is a standard Multilayer Perceptron layers to interpret the CNN features.
The output layer uses a sigmoid activation function to output a value between
0 and 1 for the negative and positive sentiment in the review.'''

'We use a Convolutional Neural Network (CNN) as they have proven\nto be successful at document classification problems.\nA conservative CNN configuration is used with 32 filters (parallel fields for processing words)\nand a kernel size of 8 with a rectified linear (‘relu’) activation function.\nThis is followed by a pooling layer that reduces the output of the convolutional layer\nby half.\n\nNext, the 2D output from the CNN part of the model is flattened to one long 2D\nvector to represent the ‘features’ extracted by the CNN. The back-end of the\nmodel is a standard Multilayer Perceptron layers to interpret the CNN features.\nThe output layer uses a sigmoid activation function to output a value between\n0 and 1 for the negative and positive sentiment in the review.'

In [63]:
#define model 
import tensorflow
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Conv1D, MaxPooling1D
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length = max_length))
model.add(Conv1D(filters = 32, kernel_size = 8, activation = 'relu'))
model.add(MaxPooling1D(pool_size = 2))
model.add(Flatten())
model.add(Dense(10, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                209610    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 2,812,053
Trainable params: 2,812,053
Non-trainable params: 0
____________________________________________

In [25]:
#complie network
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
#fitting the model
model.fit(Xtrain, ytrain, epochs = 10, verbose = 2)

Epoch 1/10
57/57 - 8s - loss: 0.6903 - accuracy: 0.5150
Epoch 2/10
57/57 - 8s - loss: 0.6172 - accuracy: 0.6611
Epoch 3/10
57/57 - 7s - loss: 0.2480 - accuracy: 0.9161
Epoch 4/10
57/57 - 7s - loss: 0.0257 - accuracy: 0.9956
Epoch 5/10
57/57 - 7s - loss: 0.0043 - accuracy: 1.0000
Epoch 6/10
57/57 - 7s - loss: 0.0021 - accuracy: 1.0000
Epoch 7/10
57/57 - 7s - loss: 0.0014 - accuracy: 1.0000
Epoch 8/10
57/57 - 7s - loss: 0.0011 - accuracy: 1.0000
Epoch 9/10
57/57 - 7s - loss: 8.5853e-04 - accuracy: 1.0000
Epoch 10/10
57/57 - 7s - loss: 7.0608e-04 - accuracy: 1.0000


<tensorflow.python.keras.callbacks.History at 0x21635eb5988>

In [26]:
#accuracy check
loss, acc = model.evaluate(Xtest, ytest, verbose = 0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 100.000000


In [27]:
#Another Word2Vec method of Word embedding, much more popular

In [28]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [39]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [41]:
# turn a doc into clean tokens
def doc_to_clean_lines(doc, vocab):
    clean_lines = list()
    lines = doc.splitlines()
    for line in lines:
        # split into tokens by white space
        tokens = line.split()
        # remove punctuation from each token
        table = str.maketrans('', '', punctuation)
        tokens = [w.translate(table) for w in tokens]
        # filter out tokens not in vocab
        tokens = [w for w in tokens if w in vocab]
        clean_lines.append(tokens)
    return clean_lines

In [42]:
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_trian and filename.startswith('cv9'):
            continue
        if not is_trian and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        doc = load_doc(path)
        doc_lines = doc_to_clean_lines(doc, vocab)
        # add lines to list
        lines += doc_lines
    return lines

In [43]:
# load training data
positive_docs = process_docs('txt_sentoken/pos', vocab, True)
negative_docs = process_docs('txt_sentoken/neg', vocab, True)
sentences = negative_docs + positive_docs
print('Total training sentences: %d' % len(sentences))

Total training sentences: 58109


In [52]:
from gensim.models import Word2Vec
# train word2vec model
model = Word2Vec(sentences, vector_size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.index_to_key)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 25767


In [53]:
# save model in ASCII (word2vec) format
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [54]:
# load embedding as a dict
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename,'r')
    lines = file.readlines()[1:]
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

In [59]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        weight_matrix[i] = embedding.get(word)
    return weight_matrix

In [60]:
from numpy import asarray
# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=False)

In [61]:
# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1313, 128)         64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 656, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 83968)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 83969     
Total params: 2,724,897
Trainable params: 148,097
Non-trainable params: 2,576,800
_________________________________________________________________
None


In [62]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Epoch 1/10
57/57 - 6s - loss: 0.7054 - accuracy: 0.5467
Epoch 2/10
57/57 - 6s - loss: 0.6628 - accuracy: 0.5922
Epoch 3/10
57/57 - 6s - loss: 0.5836 - accuracy: 0.7017
Epoch 4/10
57/57 - 6s - loss: 0.4714 - accuracy: 0.7872
Epoch 5/10
57/57 - 6s - loss: 0.3369 - accuracy: 0.8672
Epoch 6/10
57/57 - 6s - loss: 0.2320 - accuracy: 0.9328
Epoch 7/10
57/57 - 6s - loss: 0.1289 - accuracy: 0.9700
Epoch 8/10
57/57 - 6s - loss: 0.0761 - accuracy: 0.9944
Epoch 9/10
57/57 - 6s - loss: 0.0415 - accuracy: 0.9978
Epoch 10/10
57/57 - 6s - loss: 0.0253 - accuracy: 1.0000
Test Accuracy: 100.000000
