In [1]:
#### Importing all necessary python Modules ######

from os import listdir
from string import punctuation
from collections import Counter

from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn import metrics 

import joblib

from sklearn.naive_bayes import MultinomialNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '',punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word for word in tokens if len(word) > 1]
	return tokens


filename = 'dataset/pos/cv000_29590.txt'
text = load_doc(filename)

print(text)

tokens = clean_doc(text)

print(tokens)

films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
in other words , don't dismiss this film because of its source . 
if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
getting the hughes brothers to direct this seems almost as 

In [3]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	# load the doc
	doc = load_doc(filename)
	
    # clean doc
	tokens = clean_doc(doc)
	
    # filter by vocab
	tokens = [w for w in tokens if w in vocab]

	return ' '.join(tokens)

In [4]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	# load doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# update counts
	vocab.update(tokens)
    
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	lines = list()
	# walk through all files in the folder
	for filename in listdir(directory):
		# skip any reviews in the test set
		if is_trian and filename.startswith('cv9'):
			continue
		if not is_trian and not filename.startswith('cv9'):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# load and clean the doc
		line = doc_to_line(path, vocab)
		# add to list
		lines.append(line)
        # add doc to vocab
		add_doc_to_vocab(path, vocab)
	return lines

In [5]:

# define vocab
vocab = Counter()

# add all docs to vocab
process_docs('dataset/pos', vocab, True)
process_docs('dataset/neg', vocab, True)

# print the size of the vocab
print(len(vocab))

# print the top words in the vocab
print(vocab.most_common(50))

# keep tokens with a min occurrence
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]

print(len(tokens))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('bad', 1248), ('could', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]
25767


In [6]:

# save list to file
def save_list(lines, filename):
	# convert lines to a single blob of text
	data = '\n'.join(lines)
	# open file
	file = open(filename, 'w')
	# write text
	file.write(data)
	# close file
	file.close()

# save tokens to a vocabulary file
save_list(tokens, 'dataset/vocab.txt')

In [7]:
# load the vocabulary
vocab = load_doc('dataset/vocab.txt')

vocab = vocab.split()

vocab = set(vocab)

# load all training reviews
positive_lines = process_docs('dataset/pos', vocab, True)

negative_lines = process_docs('dataset/neg', vocab, True)

print(positive_lines[0])

words = negative_lines + positive_lines

y = [0 for i in range(0,900)] + [1 for i in range(0,900)]

print(len(positive_lines), len(negative_lines))

trailing success brit humour movie industry likes commitments nearly slapstick fish called wanda full monty one film delivered depth former humour magnitude latter film opens narrated documentary reel showing improving economic living standards sheffield cut harsh reality present sheffield become sort visible increase anything amount flourishing industry gaz carlyle spends time workers club sort place jobless people sit around wait job offers plump friend dave addy well former foreman gerald wilkinson sitting around club months without call duty seemingly passing hard time gaz suddenly transformed desperation cannot afford pounds money exwife suddenly finding facing losing custody son goes concoct enterprising get money desperately requires dave gerald problems top lack employment dave also faced paranoia wife leaving current financial state well appearance gerald cheating wife telling layoff long months leaving home work actually longstanding member workers club people real problems s

In [8]:
###### Bag of Word Model ######

vect = CountVectorizer()

vect.fit(words)

tmp = list(vect.vocabulary_.items())

tmp.sort(reverse = True)

print(tmp[0:20])

bgw = vect.transform(words)

bgw = bgw.toarray()

print('',bgw,sep='\n')

[('zycie', 25766), ('zwigoffs', 25765), ('zwicks', 25764), ('zwick', 25763), ('zundel', 25762), ('zuko', 25761), ('zuehlke', 25760), ('zucker', 25759), ('zs', 25758), ('zorro', 25757), ('zorg', 25756), ('zoot', 25755), ('zooms', 25754), ('zoomins', 25753), ('zooming', 25752), ('zoom', 25751), ('zoologist', 25750), ('zoolander', 25749), ('zoo', 25748), ('zones', 25747)]

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [9]:
# splitting X and y into training and testing sets 

X_train, X_test, y_train, y_test = train_test_split(bgw, y, test_size=0.3, random_state=0) 

mnb = MultinomialNB() 

knc = KNeighborsClassifier()

rf = RandomForestClassifier()

lr = LogisticRegression()


knc.fit(X_train, y_train)

rf.fit(X_train, y_train)

lr.fit(X_train, y_train)

mnb.fit(X_train, y_train) 


y_pred1 = mnb.predict(X_test) 

y_pred2 = knc.predict(X_test)

y_pred3 = rf.predict(X_test)

y_pred4 = lr.predict(X_test) 

In [10]:

# comparing actual response values (y_test) with predicted response values (y_pred) 

print("MultinomialNB model accuracy(in %):", metrics.accuracy_score(y_test, y_pred1)*100)

print("KNeighborsClassifier model accuracy(in %):", metrics.accuracy_score(y_test, y_pred2)*100)

print("RandomForestClassifier model accuracy(in %):", metrics.accuracy_score(y_test, y_pred3)*100)

print("LogisticRegression model accuracy(in %):", metrics.accuracy_score(y_test, y_pred4)*100)

MultinomialNB model accuracy(in %): 80.18518518518518
KNeighborsClassifier model accuracy(in %): 53.888888888888886
RandomForestClassifier model accuracy(in %): 82.22222222222221
LogisticRegression model accuracy(in %): 84.25925925925925


In [11]:
# Save the model as a pickle in a file 

joblib.dump(mnb, 'model/MultinomialNB_bgw.pkl') 

joblib.dump(knc, 'model/KNeighborsClassifier_bgw.pkl') 

joblib.dump(rf, 'model/RandomForestClassifier_bgw.pkl') 

joblib.dump(lr, 'model/LogisticRegression_bgw.pkl')

['model/LogisticRegression_bgw.pkl']

In [12]:
###### term frequency–inverse document frequency (Tf-idf) Model ######

vect = TfidfVectorizer()

vect.fit(words)

tmp = list(vect.vocabulary_.items())

tmp.sort(reverse = True)

print(tmp[0:20])

tfidf = vect.transform(words)

tfidf = tfidf.toarray()

print('',tfidf,sep='\n')

[('zycie', 25766), ('zwigoffs', 25765), ('zwicks', 25764), ('zwick', 25763), ('zundel', 25762), ('zuko', 25761), ('zuehlke', 25760), ('zucker', 25759), ('zs', 25758), ('zorro', 25757), ('zorg', 25756), ('zoot', 25755), ('zooms', 25754), ('zoomins', 25753), ('zooming', 25752), ('zoom', 25751), ('zoologist', 25750), ('zoolander', 25749), ('zoo', 25748), ('zones', 25747)]

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [13]:
# splitting X and y into training and testing sets 

X_train, X_test, y_train, y_test = train_test_split(tfidf, y, test_size=0.3, random_state=0) 

mnb = MultinomialNB() 

knc = KNeighborsClassifier()

rf = RandomForestClassifier()

lr = LogisticRegression()


knc.fit(X_train, y_train)

rf.fit(X_train, y_train)

lr.fit(X_train, y_train)

mnb.fit(X_train, y_train) 


y_pred1 = mnb.predict(X_test) 

y_pred2 = knc.predict(X_test)

y_pred3 = rf.predict(X_test)

y_pred4 = lr.predict(X_test) 

In [14]:
# comparing actual response values (y_test) with predicted response values (y_pred) 

print("MultinomialNB model accuracy(in %):", metrics.accuracy_score(y_test, y_pred1)*100)

print("KNeighborsClassifier model accuracy(in %):", metrics.accuracy_score(y_test, y_pred2)*100)

print("RandomForestClassifier model accuracy(in %):", metrics.accuracy_score(y_test, y_pred3)*100)

print("LogisticRegression model accuracy(in %):", metrics.accuracy_score(y_test, y_pred4)*100)

MultinomialNB model accuracy(in %): 82.5925925925926
KNeighborsClassifier model accuracy(in %): 66.66666666666666
RandomForestClassifier model accuracy(in %): 78.70370370370371
LogisticRegression model accuracy(in %): 83.51851851851852


In [15]:
# Save the model as a pickle in a file 

joblib.dump(mnb, 'model/MultinomialNB_tfidf.pkl') 

joblib.dump(knc, 'model/KNeighborsClassifier_tfidf.pkl') 

joblib.dump(rf, 'model/RandomForestClassifier_tfidf.pkl') 

joblib.dump(lr, 'model/LogisticRegression_tfidf.pkl') 

['model/LogisticRegression_tfidf.pkl']