<a href="https://colab.research.google.com/github/EbsHirani/MovieSentimentAnalysis/blob/master/moviesentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
nltk.download("movie_reviews")
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [0]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

In [0]:
class VoteClassifier(ClassifierI):
  def __init__(self, *classifiers):
    self._classifiers = classifiers
  def classify(self, features):
    votes = []
    for classifier in self._classifiers:
      vote = classifier.classify(features)
      votes.append(vote)
      return mode(votes)
  def confidence(self, features):
    votes = []
    for classifier in self._classifiers:
      vote = classifier.classify(features)
      votes.append(vote)
      return votes.count(mode(votes)) / len(votes)    

In [0]:
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)  


In [3]:
word_set = [w.lower() for w in movie_reviews.words()]
word_freq = nltk.FreqDist(word_set)
print(word_freq.most_common(3))

[(',', 77717), ('the', 76529), ('.', 65876)]


In [0]:
word_features = list(word_freq.keys())[:3000]
def get_features(doc):
  word_set = set(doc)
  features = {}
  for word in word_features:
    features[word] = (word in word_set)
  return features
featureset = [(get_features(doc), category) for doc, category in documents]

In [5]:
train = featureset[:1900]
test = featureset[1900:]
classifier = nltk.NaiveBayesClassifier.train(train)
print(f"Accuracy : {nltk.classify.accuracy(classifier, test)*100}")



Accuracy : 84.0


In [6]:
classifier.show_most_informative_features()

Most Informative Features
                 miscast = True              neg : pos    =     14.0 : 1.0
                   sucks = True              neg : pos    =     10.4 : 1.0
                 frances = True              pos : neg    =      9.2 : 1.0
                  turkey = True              neg : pos    =      8.2 : 1.0
           unimaginative = True              neg : pos    =      8.1 : 1.0
                  annual = True              pos : neg    =      7.8 : 1.0
                  regard = True              pos : neg    =      7.2 : 1.0
                 idiotic = True              neg : pos    =      7.1 : 1.0
                  shoddy = True              neg : pos    =      6.8 : 1.0
                  suvari = True              neg : pos    =      6.8 : 1.0


In [7]:
multi = SklearnClassifier(MultinomialNB())
multi.train(train)
print(f"Accuracy : {nltk.classify.accuracy(multi, test)*100}")

Accuracy : 84.0


In [10]:
bernoulli = SklearnClassifier(BernoulliNB())
bernoulli.train(train)
print(f"Accuracy : {nltk.classify.accuracy(bernoulli, test)*100}")

Accuracy : 84.0


In [19]:
logistic = SklearnClassifier(LogisticRegression())
logistic.train(train)
print(f"Accuracy : {nltk.classify.accuracy(logistic, test)*100}")

Accuracy : 85.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
sgd = SklearnClassifier(SGDClassifier())
sgd.train(train)
print(f"Accuracy : {nltk.classify.accuracy(sgd, test)*100}")

Accuracy : 85.0


In [21]:
svc = SklearnClassifier(SVC())
svc.train(train)
print(f"Accuracy : {nltk.classify.accuracy(svc, test)*100}")


Accuracy : 85.0


In [22]:
lsvc = SklearnClassifier(LinearSVC())
lsvc.train(train)
print(f"Accuracy : {nltk.classify.accuracy(lsvc, test)*100}")


Accuracy : 85.0


In [23]:
nsvc = SklearnClassifier(NuSVC())
nsvc.train(train)
print(f"Accuracy : {nltk.classify.accuracy(nsvc, test)*100}")


Accuracy : 87.0


In [29]:
voted_classifier = VoteClassifier(nsvc, lsvc, svc, sgd, logistic, bernoulli, multi, classifier)
print(f"Accuracy : {nltk.classify.accuracy(voted_classifier, test)*100}")


Accuracy : 87.0


In [37]:
print(f"Output {voted_classifier.classify(test[0][0])} Confidence {voted_classifier.confidence(test[0][0])}")

Output neg Confidence 1.0


In [42]:
#" ".join(documents[0][0])
print("Classifying :" + " ".join(documents[0][0]) + f" \nClass : {voted_classifier.classify(get_features(documents[0][0]))} \nConfidence : {voted_classifier.confidence(get_features(documents[0][0]))}")

Classifying :is jimmy stewart the greatest actor of all - time ? it ' s quite possible . his career spanned over 40 years , and he acted in more movies than most actors ever could . yet , when he is talked about in the media , he is generally thought of as an actor who played one type of role : the nice guy . and that ' s really a shame . " the naked spur " features jimmy stewart in a role completely different than what people would expect from him . it ' s a western , which stewart specialized in around this period , and it casts him as a desperate man out to collect a bounty on a man who used to be his friend . before he finds that man , though , he runs into two men who agree to help him , thinking he is a sheriff . when the criminal is eventually caught , the two men discover stewart ' s secret , and decide they want a piece of the action too . the rest of the film is a suspenseful journey in which each man suspects the other constantly . also featured is the woman travelling with 