In [1]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: Path of the file to read.

	Returns:
		The full text of the file.
	"""
	# The context manager closes the handle even when read() raises,
	# which the previous open/read/close sequence did not guarantee.
	with open(filename, 'r') as file:
		return file.read()

def clean_doc(doc):
	"""Convert raw document text into a list of cleaned word tokens.

	The text is split on whitespace and punctuation characters are deleted
	from every token. A token is kept only if it is purely alphabetic, is
	not in the NLTK English stop-word list, and is longer than one character.

	Args:
		doc: The document text.

	Returns:
		The surviving tokens, in their original order.
	"""
	raw_tokens = doc.split()
	strip_punct = str.maketrans('', '', punctuation)
	english_stops = set(stopwords.words('english'))
	cleaned = []
	for raw in raw_tokens:
		word = raw.translate(strip_punct)
		# drop anything that is not made of letters only (numbers, empties, ...)
		if not word.isalpha():
			continue
		# drop stop words (the comparison is case-sensitive)
		if word in english_stops:
			continue
		# drop single-character leftovers
		if len(word) <= 1:
			continue
		cleaned.append(word)
	return cleaned

def doc_to_line(filename, vocab):
	"""Load a document and reduce it to one space-separated line of words.

	The file is read with load_doc and tokenised with clean_doc; only the
	tokens that are members of `vocab` are kept, in their original order.

	Args:
		filename: Path of the document to load.
		vocab: Collection of allowed words (membership is tested with `in`).

	Returns:
		The kept tokens joined by single spaces.
	"""
	kept = []
	for token in clean_doc(load_doc(filename)):
		if token in vocab:
			kept.append(token)
	return ' '.join(kept)

def process_docs(directory, vocab, is_trian=True):
	"""Load one data split from a directory as cleaned, vocab-filtered lines.

	Files whose names start with 'cv9' form the held-out test split; every
	other file belongs to the training split.

	Args:
		directory: Folder containing one review per file.
		vocab: Collection of allowed words, passed through to doc_to_line.
		is_trian: When true (the default, which reproduces the original
			two-argument behaviour) return the training split; when false
			return the test split. The spelling matches the later
			redefinition of this function in the notebook.

	Returns:
		A list with one line per selected file, ordered by file name.
	"""
	lines = list()
	# listdir() returns names in arbitrary order; sorting keeps the document
	# order, and everything derived from it, stable from run to run.
	for filename in sorted(listdir(directory)):
		in_test_split = filename.startswith('cv9')
		# skip files that belong to the split that was not requested
		if in_test_split == bool(is_trian):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		lines.append(doc_to_line(path, vocab))
	return lines

# Read the vocabulary file and keep its whitespace-separated entries as a set.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
# Build the training lines for each sentiment class.
positive_lines = process_docs('/Users/wildercrosier/Desktop/review_polarity/txt_sentoken/pos', vocab)
negative_lines = process_docs('/Users/wildercrosier/Desktop/review_polarity/txt_sentoken/neg', vocab)
# Report the number of training documents per class, then the vocabulary size.
print(len(positive_lines), len(negative_lines))
print(len(vocab))

900 900
13893


In [2]:
# Negatives first, then positives. The label vector built for these rows is a
# run of 0s (negative) followed by a run of 1s (positive), and the test
# documents are concatenated as neg_lines + pos_lines, so the training rows
# must use the same order or every training label is inverted.
docs = negative_lines + positive_lines

In [3]:
from keras.preprocessing.text import Tokenizer
# Build a Keras tokenizer and fit it on the training documents only, so the
# word index it learns comes from `docs`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

In [4]:
# Encode every training document as one row of the matrix, using the
# tokenizer's 'freq' mode.
Xtrain = tokenizer.texts_to_matrix(docs, mode='freq')
# The second dimension is one larger than the number of words because index 0
# is reserved by Keras and never assigned to a word:
# https://keras.io/api/preprocessing/text/#tokenizer
print(Xtrain.shape)

(1800, 13894)


In [5]:
def process_docs(directory, vocab, is_trian):
	"""Load one data split from a directory as cleaned, vocab-filtered lines.

	Files whose names start with 'cv9' form the held-out test split; every
	other file belongs to the training split.

	Args:
		directory: Folder containing one review per file.
		vocab: Collection of allowed words, passed through to doc_to_line.
		is_trian: Truthy to return the training split, falsy to return the
			test split.

	Returns:
		A list with one line per selected file, ordered by file name.
	"""
	lines = list()
	# listdir() returns names in arbitrary order; sorting keeps the document
	# order, and everything derived from it, stable from run to run.
	for filename in sorted(listdir(directory)):
		# A file is skipped exactly when its split is not the requested one:
		# 'cv9' files while training, all other files while testing.
		if filename.startswith('cv9') == bool(is_trian):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		lines.append(doc_to_line(path, vocab))
	return lines

# Load the held-out split (third argument False) for each sentiment class.
pos_lines = process_docs('/Users/wildercrosier/Desktop/review_polarity/txt_sentoken/pos', vocab, False)
neg_lines = process_docs('/Users/wildercrosier/Desktop/review_polarity/txt_sentoken/neg', vocab, False)
# Test documents are ordered negatives first, then positives.
test_docs = [*neg_lines, *pos_lines]
# Encode the test documents with the tokenizer that was fitted on the
# training documents.
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
print(Xtest.shape)

(200, 13894)


In [6]:
import numpy as np
# Width of the encoded feature matrix; used as the network's input size.
num_words = Xtest.shape[1]
# Labels: 0 = negative review, 1 = positive review. Each vector is a run of
# zeros followed by a run of ones, so the matching document list must hold the
# negative reviews first (test_docs is built as neg_lines + pos_lines).
# The run lengths are taken from the document lists rather than hard-coded as
# 900/100, so the labels cannot drift out of step with the data.
# NOTE(review): confirm `docs` is concatenated negatives-first as well,
# otherwise the training labels are inverted.
ytrain = np.array([0] * len(negative_lines) + [1] * len(positive_lines))
ytest = np.array([0] * len(neg_lines) + [1] * len(pos_lines))
# Every encoded row needs exactly one label.
assert len(ytrain) == Xtrain.shape[0], "ytrain does not match Xtrain rows"
assert len(ytest) == Xtest.shape[0], "ytest does not match Xtest rows"

In [7]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# One hidden Dense layer of 50 ReLU units over the num_words input features,
# followed by a single sigmoid output unit.
model = Sequential([
	Dense(50, input_shape=(num_words,), activation='relu'),
	Dense(1, activation='sigmoid'),
])
# Compile with binary cross-entropy loss, the Adam optimiser and an accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [8]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def create_baseline():
	"""Build and compile the baseline network handed to KerasClassifier.

	The network has one hidden Dense layer of 10 ReLU units that takes
	`num_words` input features (a module-level name), followed by a single
	sigmoid output unit. It is compiled with binary cross-entropy loss, the
	Adam optimiser and an accuracy metric.

	Returns:
		The compiled Sequential model.
	"""
	layers = [
		Dense(10, input_shape=(num_words,), activation='relu'),
		Dense(1, activation='sigmoid'),
	]
	baseline = Sequential(layers)
	baseline.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return baseline

# Wrap the model builder so scikit-learn can fit and score it fold by fold.
estimator = KerasClassifier(build_fn=create_baseline, epochs=50, batch_size=5, verbose=0)
# A fixed random_state makes the shuffled fold assignment repeatable between
# runs. The network's own weight initialisation is not seeded here, so scores
# can still vary slightly from run to run.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator, Xtrain, ytrain, cv=kfold)
# Mean and standard deviation of the per-fold scores, as percentages.
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))



Baseline: 85.94% (3.60%)
