In [32]:
import nltk
import random
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.classify import ClassifierI
from statistics import mode
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC,NuSVC

In [8]:
class VoteClassifier(ClassifierI):
    def __init__(self,*classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v= c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v= c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes)) 
        conf = choice_votes/len(votes)
        return conf

In [14]:
pos = open('reviews/positive.txt','r').read()
neg = open('reviews/negative.txt','r').read()

In [15]:
documents = []

In [16]:
for r in pos.split('\n'):
    documents.append((r,"pos"))
for r in neg.split('\n'):
    documents.append((r,"neg"))

In [18]:
stop_words = set(stopwords.words('english'))
all_words=[]
pos_words = word_tokenize(pos)
neg_words = word_tokenize(neg) 

for w in pos_words:
    if w not in stop_words:
        all_words.append(w.lower())
for w in neg_words:
    if w not in stop_words:
        all_words.append(w.lower())
    

In [20]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]

In [21]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w]  = (w in words)
    return features

In [22]:
featuresets = [(find_features(rev),category) for (rev,category) in documents]
len(featuresets)

10662

In [23]:
random.shuffle(featuresets)

In [25]:
training_set = featuresets[:9000]
testing_set = featuresets[9000:]

In [26]:
## All Classifiers

In [28]:
##NB
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(classifier,testing_set))*100)
classifier.show_most_informative_features(15)

Accuracy 52.34657039711191
Most Informative Features
                       ? = True              neg : pos    =      2.7 : 1.0
                       4 = True              neg : pos    =      2.2 : 1.0
                       * = True              pos : neg    =      2.2 : 1.0
                       ; = True              neg : pos    =      1.8 : 1.0
                       & = True              pos : neg    =      1.7 : 1.0
                       : = True              neg : pos    =      1.4 : 1.0
                       r = False             neg : pos    =      1.3 : 1.0
                       e = False             pos : neg    =      1.2 : 1.0
                       ) = True              neg : pos    =      1.2 : 1.0
                       h = False             pos : neg    =      1.2 : 1.0
                       ( = True              neg : pos    =      1.2 : 1.0
                       c = False             neg : pos    =      1.2 : 1.0
                       3 = True              ne

In [29]:
## Multinomial NB
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(MNB_classifier,testing_set))*100)

Accuracy 51.44404332129964


In [30]:
## BernoulliNB
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(BNB_classifier,testing_set))*100)

Accuracy 52.647412755716005


In [33]:
## Logistice Regression
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(LogisticRegression_classifier,testing_set))*100)

Accuracy 51.624548736462096


In [34]:
##SGD Classifier
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(SGDClassifier_classifier,testing_set))*100)

Accuracy 51.26353790613718


In [35]:
## svc
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(SVC_classifier,testing_set))*100)

Accuracy 52.76774969915764


In [36]:
## Linear SVC
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(LinearSVC_classifier,testing_set))*100)

Accuracy 51.74488567990373


In [37]:
## NuSVC
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("Accuracy",(nltk.classify.accuracy(NuSVC_classifier,testing_set))*100)

Accuracy 52.9482551143201


In [39]:
## Ensemble models

voted_classifier = VoteClassifier(classifier,MNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier,SVC_classifier, LinearSVC_classifier, NuSVC_classifier)
print("Accuracy",(nltk.classify.accuracy(voted_classifier,testing_set))*100)

Accuracy 52.888086642599276


In [40]:
print("Classification:",voted_classifier.classify(testing_set[0][0]), "Confidence:", voted_classifier.confidence(testing_set[0][0]))

Classification: pos Confidence: 0.8571428571428571


In [41]:
print("Classification:",voted_classifier.classify(testing_set[2][0]), "Confidence:", voted_classifier.confidence(testing_set[2][0]))

Classification: neg Confidence: 0.7142857142857143


In [42]:
print("Classification:",voted_classifier.classify(testing_set[5][0]), "Confidence:", voted_classifier.confidence(testing_set[5][0]))

Classification: pos Confidence: 1.0
