In [85]:

import random
import pickle
import pathlib

import nltk
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, CategoricalNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

from statistics import mode


## Import Data

In [86]:
# Build a labelled dataset from the movie-review corpus:
# one (word sequence, category) pair per review file
documents = [
    (movie_reviews.words(fileids=fileid), label)
    for label in movie_reviews.categories()
    for fileid in movie_reviews.fileids(label)
]

# Report how many reviews were collected
print('total documents in our dataset',len(documents))

# Shuffle with a fixed seed so the ordering is the same on every run
random.seed(42)
random.shuffle(documents)

# Peek at the first five (words, label) pairs
documents[:5]

total documents in our dataset 2000


[(['mr', '.', 'bean', ',', 'a', 'bumbling', 'security', ...], 'neg'),
 (['when', 'casting', 'the', 'key', 'part', 'of', 'the', ...], 'pos'),
 (['there', 'is', 'a', 'scene', 'in', 'patch', 'adams', ...], 'neg'),
 (['and', 'i', 'thought', '"', 'stigmata', '"', 'would', ...], 'neg'),
 (['some', 'critics', ',', 'including', 'siskel', '&', ...], 'pos')]

## Vocab

In [87]:
# Every token of the movie-review corpus (duplicates included)
vocab = movie_reviews.words()

# STOP WORDS
STOP_WORDS = stopwords.words('english')
print("STOP WORDS :",STOP_WORDS[:5])

# Drop stop words from the token list. Membership is tested against a set
# (hash lookup) rather than the STOP_WORDS list, which would be rescanned for
# every single corpus token. STOP_WORDS itself stays a list so it can still
# be sliced as above.
stop_word_set = set(STOP_WORDS)
vocab = [word for word in vocab if word not in stop_word_set]
print('length of vocab',len(vocab))

STOP WORDS : ['i', 'me', 'my', 'myself', 'we']
length of vocab 955610


## Frequency distribution

In [88]:
# Count how often each remaining token occurs
freq = nltk.FreqDist(vocab)
# Keep just the words of the 500 most frequent tokens. Passing the count to
# most_common() avoids sorting the whole distribution only to slice it.
top_500 = [word for word, _count in freq.most_common(500)]

## Feature Extraction

In [89]:
%%time
# One (feature dict, sentiment) pair per document. Each feature dict maps
# every word in top_500 to True/False depending on whether the review
# contains that word (a multi-hot encoding over the top-500 vocabulary).
features = []

for review, sentiment in documents:
    # Set membership replaces the former dict lookup wrapped in a bare
    # `except:`, which used an exception as control flow and would also have
    # swallowed unrelated errors (including KeyboardInterrupt).
    review_words = set(review)
    multi_hot_vector = {word: word in review_words for word in top_500}
    features.append((multi_hot_vector, sentiment))


# Sanity check: exactly one feature row per document
assert len(documents) == len(features)

CPU times: total: 2.06 s
Wall time: 2.05 s


## Train Test split

In [90]:
# First 1900 shuffled examples are used for training; the rest are held out
train_set, test_set = features[:1900], features[1900:]

## NLTK Bayes classifier

In [91]:
# Fit NLTK's own Naive Bayes classifier on the training features
bayes = nltk.NaiveBayesClassifier.train(train_set)
# Share of held-out reviews whose predicted label matches the true label
accuracy = nltk.classify.accuracy(classifier=bayes, gold=test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.71


## Scikit Learn Classifiers

* Bayes
    * GaussianNB 
    * MultinomialNB
    * ComplementNB
    * CategoricalNB
    * BernoulliNB
* Linear models
    * LogisticRegression
    * SGDClassifier

## Gaussian Naive Bayes

In [92]:
# Wrap GaussianNB for NLTK and fit it in one step (train() returns the wrapper);
# sparse=False makes the wrapper hand scikit-learn a dense feature matrix
gnb = SklearnClassifier(GaussianNB(), sparse=False).train(train_set)
# Score on the held-out reviews
accuracy = nltk.classify.accuracy(gnb, test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.73


## Multinomial Naive Bayes

In [93]:
# Wrap MultinomialNB for NLTK and fit it in one step (train() returns the wrapper)
mnb = SklearnClassifier(MultinomialNB()).train(train_set)
# Score on the held-out reviews
accuracy = nltk.classify.accuracy(mnb, test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.68


## Complement Naive Bayes

In [94]:
# Wrap ComplementNB for NLTK and fit it in one step (train() returns the wrapper)
cnb = SklearnClassifier(ComplementNB()).train(train_set)
# Score on the held-out reviews
accuracy = nltk.classify.accuracy(cnb, test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.68


## Bernoulli Naive Bayes

In [95]:
# Wrap BernoulliNB for NLTK and fit it in one step (train() returns the wrapper)
bnb = SklearnClassifier(BernoulliNB()).train(train_set)
# Score on the held-out reviews
accuracy = nltk.classify.accuracy(bnb, test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.71


## Categorical Naive Bayes

In [96]:
# Wrap CategoricalNB for NLTK and fit it in one step (train() returns the wrapper);
# sparse=False makes the wrapper hand scikit-learn a dense feature matrix
catnb = SklearnClassifier(CategoricalNB(), sparse=False).train(train_set)
# Score on the held-out reviews
accuracy = nltk.classify.accuracy(catnb, test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.71


## Logistic Regression

In [97]:
# Wrap LogisticRegression (solver capped at 500 iterations) for NLTK and fit it
# in one step (train() returns the wrapper)
lr = SklearnClassifier(LogisticRegression(max_iter=500)).train(train_set)
# Score on the held-out reviews
accuracy = nltk.classify.accuracy(lr, test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.75


## SGD Classifier

In [98]:
# SGD training is stochastic, and random.seed() does not seed scikit-learn.
# Pin random_state so the accuracy below, and this model's votes in the
# ensemble, are reproducible across runs.
sgd = SklearnClassifier(SGDClassifier(random_state=42))
# training
sgd.train(train_set)
# accuracy on the held-out reviews
accuracy = nltk.classify.accuracy(sgd, test_set)
print('Model accuracy : ', accuracy)

Model accuracy :  0.73


## Voting and Confidence

In [99]:
# Collect the predictions of every scikit-learn model on the test reviews.
# model_outputs[k] holds the predicted labels of the k-th model, in test_set order.

# The feature dicts are identical for every model, so extract them once
# instead of rebuilding the list on each loop iteration
test_featuresets = [featureset for featureset, _label in test_set]

model_outputs = []

for model in [gnb, mnb, cnb, bnb, catnb, lr, sgd]:
    outputs = model.classify_many(test_featuresets)
    model_outputs.append(outputs)


In [112]:
# Confidence calculation via voting: for each test review, take the label most
# models agree on and report the share of models that voted for it.
# (The former `votes = []` initialisation was dead code: the loop variable
# rebinds `votes` straight away.)

for i, votes in enumerate(zip(*model_outputs)):
    # Compute the majority label once and reuse it for both printouts
    majority = mode(votes)
    confidence = (votes.count(majority) / len(votes)) * 100
    print('Test set review :',i)
    print('Votes : ',votes)
    print('Majority : ',majority)
    print('True label : ',test_set[i][1])
    print(f'Prediction confidence : {confidence:.1f}%')
    print('..'*40)


Test set review : 0
Votes :  ('neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg')
Majority :  neg
True label :  neg
Prediction confidence : 100.0%
................................................................................
Test set review : 1
Votes :  ('pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos')
Majority :  pos
True label :  neg
Prediction confidence : 100.0%
................................................................................
Test set review : 2
Votes :  ('neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg')
Majority :  neg
True label :  neg
Prediction confidence : 100.0%
................................................................................
Test set review : 3
Votes :  ('pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos')
Majority :  pos
True label :  neg
Prediction confidence : 100.0%
................................................................................
Test set review : 4
Votes :  ('pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg')
Majority :  pos
True 