# day 6 of #66daysofdata_NLP
## NLTK: part 5
## Creating a module for Sentiment Analysis with NLTK

* ref: 
    - [https://pythonprogramming.net](https://pythonprogramming.net)
    - [https://kaggle.com](https://www.kaggle.com/alvations/sklearn-nltk-voteclassifier)

Warning!  This process will take a while.. You may want to just go run some errands. It took me about `30-40 minutes` to run it in full, and I am running an i7 3930k


In [1]:
# Step One: Import nltk and download necessary packages
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from scipy import stats as s

from nltk.tokenize import word_tokenize

In [2]:
class pathes():
    path_to_pickle_doc           = 'pickled_algos/documents.pickle'
    path_to_pickle_word_features = 'pickled_algos/word_features5k.pickle'
p_path = pathes

# load the new movie review data
with open("../datasets/positive.txt","r", encoding='utf8', errors='ignore') as pos:
    short_pos = pos.readlines()
with open("../datasets/negative.txt","r", encoding='utf8', errors='ignore') as neg:
    short_neg = neg.readlines()

In [3]:
# create training and testing set (see prev. tutorial)
def find_features(document, word_features):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features
    
def movie_reviews_to_features(p_path, verbose=0):
# move this up here
    all_words = []
    documents = []

    #  j is adject, r is adverb, and v is verb
    #allowed_word_types = ["J","R","V"]
    allowed_word_types = ["J"]

    def create_document(all_words, documents, reviews, label):
        for p in reviews:
            documents.append( (p, label) )
            words = word_tokenize(p)
            pos = nltk.pos_tag(words)
            for w in pos:
                if w[1][0] in allowed_word_types:
                    all_words.append(w[0].lower())
        return all_words, documents
    all_words, documents = create_document(all_words, documents, short_pos, 'pos')
    all_words, documents = create_document(all_words, documents, short_neg, 'neg') 
    
    if verbose ==1:
        rev_label = documents[0][1].upper()
        rev       = documents[0][0]
        print("A sample '{}' review : \n{}\n".format(rev_label, rev))
    
    # save documents
    with open(p_path.path_to_pickle_doc, 'wb') as f:
        pickle.dump(documents, f)   


    # most common words and their counts
    all_words = nltk.FreqDist(all_words)
    if verbose ==1:
        print('The 3 most common words and their counts: \n{}\n'.format(all_words.most_common(3)))
    # word_features: contains the top 5,000 most common words.
    word_features = list(all_words.keys())[:5000]
    
    # save the word_features
    with open(p_path.path_to_pickle_word_features, 'wb') as f:
        pickle.dump(word_features, f)

    

    featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]

    random.shuffle(featuresets)
    if verbose ==1:
        print(f"len of the featuresets is: {len(featuresets)}\n")

    if verbose ==1:
        print("An Example of first 5 words of a sample featureset of a {} review:\n{}, label --> {} \n('True' means the word is in top 5,000 most common words)".
              format(featuresets[0][1],{k: featuresets[0][0][k] for k in list(featuresets[0][0])[:5]}, featuresets[0][1]))
    return featuresets



In [4]:
featuresets = movie_reviews_to_features(p_path, verbose=1)

A sample 'POS' review : 
the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 


The 3 most common words and their counts: 
[('good', 369), ('more', 331), ('little', 265)]

len of the featuresets is: 10662

An Example of first 5 words of a sample featureset of a neg review:
{'21st': False, 'new': False, 'conan': False, 'greater': False, 'jean-claud': False}, label --> neg 
('True' means the word is in top 5,000 most common words)


In [5]:
# train and test split
testing_set = featuresets[10000:]
training_set = featuresets[:10000]

## Sklearn + NLTK VoteClassifier

In [6]:
#  'mode' will be our method for choosing the most popular vote
from statistics import mode 

#our classifier class:
# we want our new classifier to act like a typical NLTK classifier, 
# we can just be sure to inherit from the NLTK classifier class. 
class VoteClassifier:
    
    def __init__(self, *classifiers_objs):        
        #we're assigning the list of classifiers that are passed to our class to self._classifiers.
        self.classifiers_objs = classifiers_objs
        
    def train(self, training_set):
        self._classifiers = {}
        for clf_obj in self.classifiers_objs:
            # NaiveBayesClassifier is part of nltk
            if hasattr(clf_obj, '__name__') and clf_obj.__name__ == 'NaiveBayesClassifier':
                clf_name = 'NaiveBayesClassifier'
                print('Training', clf_name +'\t'+ str(clf_obj))
                clf_obj = nltk.NaiveBayesClassifier.train(training_set)
            else:
                clf_name = str(clf_obj).split('(')[1]
                print('Training', clf_name +'\t'+ str(clf_obj))
                clf_obj.train(training_set)
            self._classifiers[clf_name] = clf_obj

    def evaluate(self, testing_set):
        documents, labels = zip(*testing_set)
        predictions = self.classify_documents(documents)
        correct = [y == y_hat for y, y_hat in zip(labels, predictions)]
        if correct:
            return sum(correct) / len(correct)
        else:
            return 0,0

    def classify_documents(self, documents):
        return [self.classify_many(doc) for doc in documents]

    def classify_many(self, features):
        votes = []
        for clf_name, clf  in self._classifiers.items():
            v = clf.classify(features)
            votes.append(v)
        #print(votes, mode(votes))
        return s.mode(votes)[0][0]

    def confidence(self, features):
        votes = []
        for clf_name, clf  in self._classifiers.items():
            v = clf.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [7]:
# initiate the VoteClassifier
voted_classifier = VoteClassifier(nltk.NaiveBayesClassifier,
                                  SklearnClassifier(MultinomialNB()), 
                                  SklearnClassifier(BernoulliNB()), 
                                  SklearnClassifier(LogisticRegression()),
                                  SklearnClassifier(SGDClassifier()),
                                  SklearnClassifier(LinearSVC()),
                                  #SklearnClassifier(NuSVC())
                                 )

# train the VoteClassifier
voted_classifier.train(training_set)

Training NaiveBayesClassifier	<class 'nltk.classify.naivebayes.NaiveBayesClassifier'>
Training MultinomialNB	<SklearnClassifier(MultinomialNB())>
Training BernoulliNB	<SklearnClassifier(BernoulliNB())>
Training LogisticRegression	<SklearnClassifier(LogisticRegression())>
Training SGDClassifier	<SklearnClassifier(SGDClassifier())>
Training LinearSVC	<SklearnClassifier(LinearSVC())>


In [8]:
print('Accuracy:\n-------------------------')

for clf_name, clf in voted_classifier._classifiers.items():
    print(clf_name, '\t', nltk.classify.accuracy(clf, testing_set)*100)
print('-------------------------')
print('VotedClassifier', '\t', voted_classifier.evaluate(testing_set)*100)

Accuracy:
-------------------------
NaiveBayesClassifier 	 74.47129909365559
MultinomialNB 	 73.71601208459214
BernoulliNB 	 74.01812688821752
LogisticRegression 	 71.90332326283988
SGDClassifier 	 71.6012084592145
LinearSVC 	 71.29909365558912
-------------------------
VotedClassifier 	 73.86706948640483


In [31]:
# load the word features
with open(p_path.path_to_pickle_word_features,"rb") as vf:
    word_features = pickle.load(vf)

def sentiment(text):
    feats = find_features(text, word_features)
    return voted_classifier.classify_many(feats), voted_classifier.confidence(feats)*100

In [34]:

sample_review_01 = "This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so good!"
sample_review_02 = "This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"

print("Prediction result for 'sample_review_01' is --> {} ,Confidence --> {}%".format(sentiment(sample_review_01)[0], sentiment(sample_review_01)[1]))
print("Prediction result for 'sample_review_02' is --> {} ,Confidence --> {}%".format(sentiment(sample_review_02)[0], sentiment(sample_review_02)[1]))


Prediction result for 'sample_review_01' is --> pos ,Confidence --> 100.0%
Prediction result for 'sample_review_02' is --> neg ,Confidence --> 100.0%
