In [1]:
import nltk
import random
# from nltk.corpus import movie_reviews

# to use sklearn api:
from nltk.classify.scikitlearn import SklearnClassifier

# Importing sklearn classifiers:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# Additional Imports:
import pickle
from nltk.classify import ClassifierI
from statistics import mode

from nltk.tokenize import word_tokenize

# To filter all_words:
import string
from nltk.corpus import stopwords

Voting classifier combines the results of several individual classifiers to make a final prediction, improving accuracy through an ensemble approach.

In [16]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers        # private data member
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        print(f'votes: {votes}\tmode(votes): {mode(votes)}')
        return mode(votes)     # sends the majority class
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))   # majority_votes
        conf = choice_votes/len(votes)
        return conf

VoteClassifier takes multiple classifiers and allows them to "vote" on the prediction of a given feature set (e.g., positive or negative sentiment).

The classify method collects votes from each classifier and returns the majority vote (mode).

The confidence method calculates the confidence level based on how many classifiers agreed on the majority vote.

In [3]:
short_pos = open('short_reviews/positive.txt', 'r').read()
short_neg = open('short_reviews/negative.txt', 'r').read()

In [21]:
# print(short_pos)
# print(short_neg)
# print(short_pos.split('\n'))

In [4]:
documents = []

for r in short_pos.split('\n'):
    documents.append((r, 'pos'))
    
for r in short_neg.split('\n'):
    documents.append((r, 'neg'))

In [5]:
all_words = []

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

# for w in short_pos_words:
#     all_words.append(w.lower())
    
# for w in short_neg_words:
#     all_words.append(w.lower())

stop_words = set(stopwords.words('english'))

all_words = []
for w in short_pos_words + short_neg_words:
    if w.isalpha() and w.lower() not in stop_words:  # Removes stopwords as well
        all_words.append(w.lower())
        
# print(all_words[:10])

In [6]:
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

In [7]:
#build a quick function that will find these top 3,000 words in our positive and negative documents,
#marking their presence as either positive(true) or negative(false):

def find_features(review):
    words = set(word_tokenize(review))       # make sure the thing inside the set is a list of words of that review
    features = {}
    for w in word_features:
        features[w] = (w in words)     # w in words will be either True or False

    return features

In [8]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [9]:
featuresets[0]

({'rock': True,
  'destined': True,
  'century': True,
  'new': True,
  'conan': True,
  'going': True,
  'make': True,
  'splash': True,
  'even': True,
  'greater': True,
  'arnold': True,
  'schwarzenegger': True,
  'van': True,
  'damme': True,
  'steven': True,
  'segal': True,
  'gorgeously': False,
  'elaborate': False,
  'continuation': False,
  'lord': False,
  'rings': False,
  'trilogy': False,
  'huge': False,
  'column': False,
  'words': False,
  'adequately': False,
  'describe': False,
  'peter': False,
  'jackson': False,
  'expanded': False,
  'vision': False,
  'j': False,
  'r': False,
  'tolkien': False,
  'effective': False,
  'biopic': False,
  'sometimes': False,
  'like': False,
  'go': False,
  'movies': False,
  'fun': False,
  'wasabi': False,
  'good': False,
  'place': False,
  'start': False,
  'emerges': False,
  'something': False,
  'rare': False,
  'issue': False,
  'movie': False,
  'honest': False,
  'keenly': False,
  'observed': False,
  'feel': F

In [9]:
random.shuffle(featuresets)

In [10]:
len(featuresets)

10664

## Training and tesing set:

In [11]:
training_set = featuresets[:10000]
testing_set = featuresets[10000:]

In [79]:
# training_set[3]

<br><br><br>
## Creating models:

In [15]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print(f"Classifier accuracy: {nltk.classify.accuracy(classifier, testing_set)*100}%")
classifier.show_most_informative_features(15)

Classifier accuracy: 72.89156626506023%
Most Informative Features
              engrossing = True              pos : neg    =     21.6 : 1.0
               wonderful = True              pos : neg    =     20.9 : 1.0
                  stupid = True              neg : pos    =     18.4 : 1.0
               inventive = True              pos : neg    =     15.0 : 1.0
              refreshing = True              pos : neg    =     13.6 : 1.0
                    warm = True              pos : neg    =     13.4 : 1.0
               realistic = True              pos : neg    =     11.6 : 1.0
            refreshingly = True              pos : neg    =     11.6 : 1.0
                    skin = True              pos : neg    =     11.6 : 1.0
                captures = True              pos : neg    =     11.4 : 1.0
                provides = True              pos : neg    =     11.4 : 1.0
                    ages = True              pos : neg    =     10.3 : 1.0
                chilling = True   

In [82]:
filename = '15. pickled models/naivebayes.pickle'

pickle.dump(classifier, open(filename, 'wb'))

<br><br><br>

In [83]:
BNB_Classifier = SklearnClassifier(BernoulliNB())
BNB_Classifier.train(training_set)
print(f"Classifier accuracy: {nltk.classify.accuracy(BNB_Classifier, testing_set)*100}%")

filename = '15. pickled models/BernoulliNB.pickle'
pickle.dump(classifier, open(filename, 'wb'))

Classifier accuracy: 74.69879518072288%


In [84]:
MNB_Classifier = SklearnClassifier(MultinomialNB())
MNB_Classifier.train(training_set)
print(f"Classifier accuracy: {nltk.classify.accuracy(MNB_Classifier, testing_set)*100}%")

filename = '15. pickled models/MultinomialNB.pickle'
pickle.dump(classifier, open(filename, 'wb'))

Classifier accuracy: 74.84939759036145%


In [85]:
LR_Classifier = SklearnClassifier(LogisticRegression())
LR_Classifier.train(training_set)
print(f"Classifier accuracy: {nltk.classify.accuracy(LR_Classifier, testing_set)*100}%")

filename = '15. pickled models/LogisticRegression.pickle'
pickle.dump(classifier, open(filename, 'wb'))

Classifier accuracy: 74.24698795180723%


In [86]:
SGD_Classifier = SklearnClassifier(SGDClassifier())
SGD_Classifier.train(training_set)
print(f"Classifier accuracy: {nltk.classify.accuracy(SGD_Classifier, testing_set)*100}%")

filename = '15. pickled models/SGDClassifier.pickle'
pickle.dump(classifier, open(filename, 'wb'))

Classifier accuracy: 74.3975903614458%


In [None]:
# SVC_Classifier = SklearnClassifier(SVC())
# SVC_Classifier.train(training_set)
# print(f"Classifier accuracy: {nltk.classify.accuracy(SVC_Classifier, testing_set)*100}%")

# filename = '15. pickled models/SVC.pickle'
# pickle.dump(classifier, open(filename, 'wb'))

In [16]:
LinearSVC_Classifier = SklearnClassifier(LinearSVC())
LinearSVC_Classifier.train(training_set)
print(f"Classifier accuracy: {nltk.classify.accuracy(LinearSVC_Classifier, testing_set)*100}%")

filename = '15. pickled models/LinearSVC.pickle'
pickle.dump(classifier, open(filename, 'wb'))

Classifier accuracy: 71.3855421686747%


In [None]:
NuSVC_Classifier = SklearnClassifier(NuSVC())
NuSVC_Classifier.train(training_set)
print(f"Classifier accuracy: {nltk.classify.accuracy(NuSVC_Classifier, testing_set)*100}%")

filename = '15. pickled models/NuSVC.pickle'
pickle.dump(classifier, open(filename, 'wb'))

<br><br>

In [14]:
LinearSVC_classifier = pickle.load(open('15. pickled models/LinearSVC.pickle', 'rb'))
MNB_classifier = pickle.load(open('15. pickled models/MultinomialNB.pickle', 'rb'))
BernoulliNB_classifier = pickle.load(open('15. pickled models/BernoulliNB.pickle', 'rb'))
LogisticRegression_classifier = pickle.load(open('15. pickled models/LogisticRegression.pickle', 'rb'))

In [17]:
voted_classifier = VoteClassifier(
                                  LinearSVC_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

votes: ['neg', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes

votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes

votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes

votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes

votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['neg', 'neg', 'neg', 'neg']	mode(votes): neg
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes): pos
votes: ['pos', 'pos', 'pos', 'pos']	mode(votes

In [19]:
# Testing confidence for a sample from testing set:
print("Classification:", voted_classifier.classify(testing_set[0][0]), 
      "Confidence %:", voted_classifier.confidence(testing_set[0][0]) * 100)

votes: ['neg', 'pos', 'pos', 'pos']	mode(votes): pos
Classification: pos Confidence %: 75.0
