In [0]:
import os
os.chdir("drive/My Drive/sentiment")
os.getcwd()


In [0]:
import nltk
# nltk.download('averaged_perceptron_tagger')
import random
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

In [0]:
class VoteClassifier(ClassifierI):
  def __init__(self, *classifiers):
    self._classifiers = classifiers
  def classify(self, features):
    votes = []
    for classifier in self._classifiers:
      vote = classifier.classify(features)
      votes.append(vote)
      return mode(votes)
  def confidence(self, features):
    votes = []
    for classifier in self._classifiers:
      vote = classifier.classify(features)
      votes.append(vote)
      return votes.count(mode(votes)) / len(votes)    

In [0]:
pos_reviews = open("positive.txt","r",encoding = "latin-1").read()
neg_reviews = open("negative.txt","r",encoding = "latin-1").read()
documents = []
for review in pos_reviews.split("\n"):
  documents.append((review, "pos"))
for review in neg_reviews.split("\n"):
  documents.append((review, "neg"))


In [68]:
word_set = []
pos_words = nltk.word_tokenize(pos_reviews)
pos_p = nltk.pos_tag(pos_words)
neg_words = nltk.word_tokenize(neg_reviews)
pos_n = nltk.pos_tag(neg_words)
for word, tag in pos_p:
  if tag[0] == "J":
    word_set.append(word.lower())
for word, tag in pos_n:
  if tag[0] == "J":
    word_set.append(word.lower())
print(word_set[:20])

['21st', 'new', 'conan', 'greater', 'jean-claud', 'steven', 'elaborate', 'huge', 'expanded', 'effective', 'too-tepid', 'good', 'rare', 'honest', 'great', 'neurotic', 'absolute', 'good', 'snappy', 'clever']


In [69]:
word_freq = nltk.FreqDist(word_set)
word_features = list(word_freq.keys())[:5000]
print(word_features[:5])
print(word_freq["new"])

['21st', 'new', 'conan', 'greater', 'jean-claud']
206


In [0]:
import pickle
save_features = open("word_features.pickle","wb")
pickle.dump(word_features, save_features)
save_features.close()

In [0]:
def get_features(doc):
  word_set = set(nltk.word_tokenize(doc))
  features = {}
  for word in word_features:
    features[word] = (word in word_set)
  return features
featureset = [(get_features(doc), category) for doc, category in documents]
#featureset[0]

In [0]:
random.shuffle(featureset)
#featureset[0]

In [73]:
train = featureset[:10000]
test = featureset[10000:]
classifier = nltk.NaiveBayesClassifier.train(train)
print(f"Accuracy : {nltk.classify.accuracy(classifier, test)*100}")

Accuracy : 72.43975903614458


In [74]:
classifier.show_most_informative_features()

Most Informative Features
              engrossing = True              pos : neg    =     20.4 : 1.0
                mediocre = True              neg : pos    =     15.6 : 1.0
                 generic = True              neg : pos    =     15.0 : 1.0
              refreshing = True              pos : neg    =     14.4 : 1.0
                    flat = True              neg : pos    =     14.2 : 1.0
                  boring = True              neg : pos    =     14.1 : 1.0
               inventive = True              pos : neg    =     13.0 : 1.0
                    dull = True              neg : pos    =     12.4 : 1.0
                    warm = True              pos : neg    =     12.2 : 1.0
               wonderful = True              pos : neg    =     11.8 : 1.0


In [0]:
save_classifier = open("NaiveBayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [76]:
multi = SklearnClassifier(MultinomialNB())
multi.train(train)
print(f"Accuracy : {nltk.classify.accuracy(multi, test)*100}")

Accuracy : 72.28915662650603


In [0]:
save_classifier = open("MultiNB.pickle","wb")
pickle.dump(multi, save_classifier)
save_classifier.close()

In [78]:
bernoulli = SklearnClassifier(BernoulliNB())
bernoulli.train(train)
print(f"Accuracy : {nltk.classify.accuracy(bernoulli, test)*100}")

Accuracy : 72.43975903614458


In [0]:
save_classifier = open("BernoulliNB.pickle","wb")
pickle.dump(bernoulli, save_classifier)
save_classifier.close()

In [80]:
logistic = SklearnClassifier(LogisticRegression())
logistic.train(train)
print(f"Accuracy : {nltk.classify.accuracy(logistic, test)*100}")

Accuracy : 71.53614457831326


In [0]:
save_classifier = open("Logistic.pickle","wb")
pickle.dump(logistic, save_classifier)
save_classifier.close()

In [82]:
sgd = SklearnClassifier(SGDClassifier())
sgd.train(train)
print(f"Accuracy : {nltk.classify.accuracy(sgd, test)*100}")

Accuracy : 69.57831325301204


In [0]:
save_classifier = open("SGD.pickle","wb")
pickle.dump(sgd, save_classifier)
save_classifier.close()

In [84]:
voted_classifier = VoteClassifier(sgd, logistic, bernoulli, multi, classifier)
print(f"Accuracy : {nltk.classify.accuracy(voted_classifier, test)*100}")


Accuracy : 69.57831325301204


In [85]:
print(f"Output {voted_classifier.classify(test[0][0])} Confidence {voted_classifier.confidence(test[0][0])}")

Output pos Confidence 1.0


In [86]:
review = "This movie was amazing. Would recommend to Everyone"
print(f"Class : {voted_classifier.classify(get_features(review))}")
print(f"Confidence : {voted_classifier.confidence(get_features(review))}")

Class : pos
Confidence : 1.0


In [87]:
review = "Todd is a horrible player, the manager was too stupid to bench him"
print(f"Class : {voted_classifier.classify(get_features(review))}")
print(f"Confidence : {voted_classifier.confidence(get_features(review))}")

Class : neg
Confidence : 1.0
