# day 6 of #66daysofdata_NLP
## NLTK: part 5
## Combining Algorithms (classifiers) with NLTK

* ref: 
    - [https://pythonprogramming.net](https://pythonprogramming.net)
    - [https://kaggle.com](https://www.kaggle.com/alvations/sklearn-nltk-voteclassifier)

Combining classifier algorithms is is a common technique, done by creating a sort of voting system, `where each algorithm gets one vote`, and the classification that has the votes votes is the chosen one.


In [1]:
# Step One: Import nltk and download necessary packages
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import numpy as np
import random
from nltk.corpus import movie_reviews
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
# some more classifiers
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [2]:
# create training and testing set (see prev. tutorial)

def movie_reviews_to_features(verbose=0):
    # In each category (we have pos or neg), take all of the file IDs (each review has its own ID), 
    # then store the word_tokenized version (a list of words) for the file ID, 
    # followed by the positive or negative label in one big list. 
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    
    #  random to shuffle our documents. This is because we're going to be training and testing. 
    random.shuffle(documents)
    # sample word_tokenized version of a review
    # where the first element is a list the words, and the 2nd element is the "pos" or "neg" label.
    if verbose ==1:
        print('Firs 5 words of a Sample {} review: \n{}'.format(documents[1][1],documents[1][0][:5]),'\n')

    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    if verbose ==1:
        print('The 3 most common words and their counts: \n{}\n'.format(all_words.most_common(3)))
    # word_features: contains the top 3,000 most common words.
    word_features = list(all_words.keys())[:3000]

    # find these top 3,000 words in our positive and negative documents,
    # marking their presence as either positive or negative
    def find_features(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return features
    featuresets = [(find_features(rev), category) for (rev, category) in documents]
    if verbose ==1:
        print("An Example of first 5 words of a sample featureset of a {} review:\n{}, label --> {} \n('True' means the word is in top 3,000 most common words)".
              format(featuresets[0][1],{k: featuresets[0][0][k] for k in list(featuresets[0][0])[:5]}, featuresets[0][1]))
    return featuresets

featuresets = movie_reviews_to_features(verbose=1)
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

Firs 5 words of a Sample pos review: 
['assume', 'nothing', '.', 'the', 'phrase'] 

The 3 most common words and their counts: 
[(',', 77717), ('the', 76529), ('.', 65876)]

An Example of first 5 words of a sample featureset of a neg review:
{'plot': True, ':': True, 'two': False, 'teen': False, 'couples': False}, label --> neg 
('True' means the word is in top 3,000 most common words)


## Sklearn + NLTK VoteClassifier

In [3]:
#  'mode' will be our method for choosing the most popular vote
from statistics import mode 

#our classifier class:
# we want our new classifier to act like a typical NLTK classifier, 
# we can just be sure to inherit from the NLTK classifier class. 
class VoteClassifier:
    
    def __init__(self, *classifiers_objs):        
        #we're assigning the list of classifiers that are passed to our class to self._classifiers.
        self.classifiers_objs = classifiers_objs
        
    def train(self, training_set):
        self._classifiers = {}
        for clf_obj in self.classifiers_objs:
            # NaiveBayesClassifier is part of nltk
            if hasattr(clf_obj, '__name__') and clf_obj.__name__ == 'NaiveBayesClassifier':
                clf_name = 'NaiveBayesClassifier'
                print('Training', clf_name +'\t'+ str(clf_obj))
                clf_obj = nltk.NaiveBayesClassifier.train(training_set)
            else:
                clf_name = str(clf_obj).split('(')[1]
                print('Training', clf_name +'\t'+ str(clf_obj))
                clf_obj.train(training_set)
            self._classifiers[clf_name] = clf_obj

    def evaluate(self, testing_set):
        documents, labels = zip(*testing_set)
        predictions = self.classify_documents(documents)
        correct = [y == y_hat for y, y_hat in zip(labels, predictions)]
        if correct:
            return sum(correct) / len(correct)
        else:
            return 0,0

    def classify_documents(self, documents):
        return [self.classify_many(doc) for doc in documents]

    def classify_many(self, features):
        votes = []
        for clf_name, clf  in self._classifiers.items():
            v = clf.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for clf_name, clf  in self._classifiers.items():
            v = clf.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [4]:
# initiate the VoteClassifier
voted_classifier = VoteClassifier(nltk.NaiveBayesClassifier,
                                  SklearnClassifier(MultinomialNB()), 
                                  SklearnClassifier(BernoulliNB()), 
                                  SklearnClassifier(LogisticRegression()),
                                  SklearnClassifier(SGDClassifier()),
                                  SklearnClassifier(LinearSVC()),
                                  SklearnClassifier(NuSVC())
                                 )

# train the VoteClassifier
voted_classifier.train(training_set)

Training NaiveBayesClassifier	<class 'nltk.classify.naivebayes.NaiveBayesClassifier'>
Training MultinomialNB	<SklearnClassifier(MultinomialNB())>
Training BernoulliNB	<SklearnClassifier(BernoulliNB())>
Training LogisticRegression	<SklearnClassifier(LogisticRegression())>
Training SGDClassifier	<SklearnClassifier(SGDClassifier())>
Training LinearSVC	<SklearnClassifier(LinearSVC())>
Training NuSVC	<SklearnClassifier(NuSVC())>


In [7]:
print('Accuracy:\n-------------------------')

for clf_name, clf in voted_classifier._classifiers.items():
    print(clf_name, '\t', nltk.classify.accuracy(clf, testing_set)*100)
print('-------------------------')
print('VotedClassifier', '\t', voted_classifier.evaluate(testing_set)*100)

Accuracy:
-------------------------
NaiveBayesClassifier 	 78.0
MultinomialNB 	 83.0
BernoulliNB 	 78.0
LogisticRegression 	 80.0
SGDClassifier 	 77.0
LinearSVC 	 80.0
NuSVC 	 83.0
-------------------------
VotedClassifier 	 85.0


In [6]:
#print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
print('Classifying some sample documents')
print("Classification:", voted_classifier.classify_many(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify_many(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify_many(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify_many(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify_many(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify_many(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

Classifying some sample documents
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 71.42857142857143
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
