# day 5 of #66daysofdata_NLP
## NLTK: part 4
## 01. Scikit-Learn Sklearn with NLTK

* ref: 
    - [https://pythonprogramming.net](https://pythonprogramming.net)
    - [https://nanonets.com](https://nanonets.com/blog/named-entity-recognition-with-nltk-and-spacy/#what-is-named-entity-recognition)
    - [https://alvinntnu.github.io](https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/nlp/ml-simple-case.html)

By loading  SklearnClassifier from NLTK you can can use just about any of the sklearn classifiers: 
- a couple more variations of the Naive Bayes algorithm: `MultinomialNB`,`BernoulliNB` 


In [1]:
# Step One: Import nltk and download necessary packages
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import numpy as np
import random
from nltk.corpus import movie_reviews
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
# some more classifiers
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [2]:
# create training and testing set (see prev. tutorial)

def movie_reviews_to_features(verbose=0):
    # In each category (we have pos or neg), take all of the file IDs (each review has its own ID), 
    # then store the word_tokenized version (a list of words) for the file ID, 
    # followed by the positive or negative label in one big list. 
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    
    #  random to shuffle our documents. This is because we're going to be training and testing. 
    random.shuffle(documents)
    # sample word_tokenized version of a review
    # where the first element is a list the words, and the 2nd element is the "pos" or "neg" label.
    if verbose ==1:
        print('Firs 5 words of a Sample {} review: \n{}'.format(documents[1][1],documents[1][0][:5]),'\n')

    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    if verbose ==1:
        print('The 3 most common words and their counts: \n{}\n'.format(all_words.most_common(3)))
    # word_features: contains the top 3,000 most common words.
    word_features = list(all_words.keys())[:3000]

    # find these top 3,000 words in our positive and negative documents,
    # marking their presence as either positive or negative
    def find_features(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return features
    featuresets = [(find_features(rev), category) for (rev, category) in documents]
    if verbose ==1:
        print("An Example of first 5 words of a sample featureset of a {} review:\n{}, label --> {} \n('True' means the word is in top 3,000 most common words)".
              format(featuresets[0][1],{k: featuresets[0][0][k] for k in list(featuresets[0][0])[:5]}, featuresets[0][1]))
    return featuresets


featuresets = movie_reviews_to_features(verbose=1)
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

Firs 5 words of a Sample pos review: 
['it', "'", 's', 'not', 'often'] 

The 3 most common words and their counts: 
[(',', 77717), ('the', 76529), ('.', 65876)]

An Example of first 5 words of a sample featureset of a neg review:
{'plot': True, ':': True, 'two': False, 'teen': False, 'couples': False}, label --> neg 
('True' means the word is in top 3,000 most common words)


In [10]:
def cross_val(model, model_name, train_set, n_splits=10):
    import sklearn.model_selection
    kf = sklearn.model_selection.KFold(n_splits=10)
    acc_kf = []  ## accuracy holder

    ## Cross-validation
    for train_index, test_index in kf.split(train_set):
        #print("TRAIN:", train_index, "TEST:", test_index)
        train_fold = train_set[train_index[0]:train_index[len(train_index) - 1]]
        test_fold  = train_set[test_index[0]:test_index[len(test_index) - 1]]
        
        classifier   = model.train(train_fold)
        cur_fold_acc = nltk.classify.util.accuracy(classifier, test_fold)
        
        acc_kf.append(cur_fold_acc*100)
    print('{} accuracy: {} +-% {}'.format(model_name, np.mean(acc_kf),np.std(acc_kf)))
    

In [11]:
cross_val(nltk.NaiveBayesClassifier,'Original Naive Bayes Algo', training_set)

Original Naive Bayes Algo accuracy: 88.04232804232804 +-% 3.1140611575792927


In [6]:
# Next, we can define, and train our classifier like:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(5)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)



Original Naive Bayes Algo accuracy percent: 79.0
Most Informative Features
                   sucks = True              neg : pos    =      9.7 : 1.0
                  justin = True              neg : pos    =      9.6 : 1.0
                  annual = True              pos : neg    =      9.1 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                 idiotic = True              neg : pos    =      8.9 : 1.0
MNB_classifier accuracy percent: 83.0
BernoulliNB_classifier accuracy percent: 79.0
LogisticRegression_classifier accuracy percent: 84.0
SGDClassifier_classifier accuracy percent: 73.0
SVC_classifier accuracy percent: 81.0
LinearSVC_classifier accuracy percent: 83.0
NuSVC_classifier accuracy percent: 83.0
