In [9]:
import pathlib
import pickle
import random
from collections import Counter
from statistics import mode

import nltk
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, CategoricalNB, BernoulliNB
from sklearn.svm import LinearSVC


## Import Data

In [3]:
# Build a labelled dataset from the movie-reviews corpus: one
# (review words, category) pair per corpus file, category by category.
documents = [
    (movie_reviews.words(fileids=file_id), label)
    for label in movie_reviews.categories()
    for file_id in movie_reviews.fileids(label)
]

# Report how many documents were collected.
print('total documents in our dataset', len(documents))

# Shuffle in place; the seed is fixed so the order is repeatable.
random.seed(42)
random.shuffle(documents)

# Peek at the first five (review, label) pairs.
documents[:5]

total documents in our dataset 2000


[(['mr', '.', 'bean', ',', 'a', 'bumbling', 'security', ...], 'neg'),
 (['when', 'casting', 'the', 'key', 'part', 'of', 'the', ...], 'pos'),
 (['there', 'is', 'a', 'scene', 'in', 'patch', 'adams', ...], 'neg'),
 (['and', 'i', 'thought', '"', 'stigmata', '"', 'would', ...], 'neg'),
 (['some', 'critics', ',', 'including', 'siskel', '&', ...], 'pos')]

## Vocabulary

In [5]:
# All words of the corpus, as returned by the corpus reader.
corpus_words = movie_reviews.words()

# STOP WORDS
STOP_WORDS = stopwords.words('english')
print("STOP WORDS :",STOP_WORDS[:5])

# Improve the vocab by dropping stop words. Membership is tested against a
# set instead of the STOP_WORDS list: the filter runs once per corpus token,
# and a list lookup would rescan the whole stop-word list every time.
stop_word_set = set(STOP_WORDS)
vocab = [word for word in corpus_words if word not in stop_word_set]
print('length of vocab',len(vocab))

STOP WORDS : ['i', 'me', 'my', 'myself', 'we']
length of vocab 955610


## Feature Extraction

In [6]:
%%time


# Number of most-frequent vocabulary words used as features.
N_TOP_WORDS = 500

freq = nltk.FreqDist(vocab)
# The N_TOP_WORDS most common words, most frequent first.
top_500 = [word for word, _count in freq.most_common(N_TOP_WORDS)]

# One (multi-hot feature dict, label) pair per document. Each dict maps
# every word in top_500 to True/False depending on whether the review
# contains it.
features = []
for review, sentiment in documents:
    # A set gives cheap membership tests and replaces the previous
    # dict-lookup-plus-bare-except idiom, which could mask unrelated errors.
    words_in_review = set(review)
    multi_hot_vector = {word: word in words_in_review for word in top_500}
    features.append((multi_hot_vector, sentiment))

# Sanity check: exactly one feature row per document.
assert len(documents) == len(features)

CPU times: total: 2.66 s
Wall time: 2.67 s


## Train Test Split

In [7]:
# The first 1900 examples are used for training, the remainder for testing.
split_at = 1900
train_set, test_set = features[:split_at], features[split_at:]

## Training Different Classifiers

In [12]:
# Seed for estimators that shuffle the training data internally, so that a
# fresh Restart & Run All reproduces the same fitted models.
RANDOM_STATE = 42

# NLTK classifier
bayes = nltk.NaiveBayesClassifier.train(train_set)
# sklearn bayes classifiers (GaussianNB and CategoricalNB are fed dense
# arrays via sparse=False)
gnb = SklearnClassifier(GaussianNB(),sparse=False).train(train_set)
mnb = SklearnClassifier(MultinomialNB()).train(train_set)
cnb = SklearnClassifier(ComplementNB()).train(train_set)
bnb = SklearnClassifier(BernoulliNB()).train(train_set)
catnb = SklearnClassifier(CategoricalNB(), sparse=False).train(train_set)
# sklearn linear models
lr = SklearnClassifier(LogisticRegression(max_iter=500)).train(train_set)
# SGD shuffles the samples between epochs; without a seed each run differs.
sgd = SklearnClassifier(SGDClassifier(random_state=RANDOM_STATE)).train(train_set)
# Support vectors (the seed only matters when the dual solver is selected)
svc = SklearnClassifier(LinearSVC(max_iter=5000, random_state=RANDOM_STATE)).train(train_set)

## Voting Classifier

In [19]:
class VotingClassifier(ClassifierI):
    """Majority-vote ensemble over already-trained classifiers.

    Every member must provide ``classify_many``; ``labels()`` additionally
    requires members to provide ``labels``.

    NOTE(review): ``classify`` here takes a whole list of examples and
    returns one prediction list per member, which differs from the
    single-example ``classify`` described by ``ClassifierI``. The inherited
    ``classify_many`` / ``prob_classify`` helpers should therefore not be
    relied on - confirm before using this class through generic NLTK code.
    """

    def __init__(self, classifiers):
        """Store the member classifiers.

        :param classifiers: sequence of trained classifiers.
        """
        self.classifiers = classifiers

    def labels(self):
        """Return the distinct labels known to the members, first-seen order."""
        return list(dict.fromkeys(
            label for model in self.classifiers for label in model.labels()
        ))

    def classify(self, feature_set):
        """Collect every member's predictions for ``feature_set``.

        :param feature_set: sequence of tuples whose first element is the
            feature dict (e.g. ``(features, label)`` pairs); only element 0
            of each tuple is used.
        :return: list with one entry per member classifier, each entry being
            that member's ``classify_many`` output for all examples.
        """
        # Kept as an attribute because the original implementation exposed it.
        self.feature_set = feature_set
        # Extract the feature dicts once instead of once per member.
        feature_dicts = [example[0] for example in feature_set]
        return [model.classify_many(feature_dicts) for model in self.classifiers]

    def vote_and_confidence(self, feature_set):
        """Return the majority label and its vote share for each example.

        :param feature_set: same input as accepted by ``classify``.
        :return: list of ``(label, confidence)`` tuples, where ``confidence``
            is the fraction of members that voted for ``label``. On a tie the
            label that appears first in member order wins.
        """
        model_outputs = self.classify(feature_set)

        scores = []
        # zip(*...) turns per-member prediction lists into per-example votes.
        for votes in zip(*model_outputs):
            # Counter.most_common(1) keeps the first-encountered label on
            # ties, whereas statistics.mode raises on ties before Python 3.8.
            majority, n_votes = Counter(votes).most_common(1)[0]
            scores.append((majority, n_votes / len(votes)))

        # Kept as an attribute because the original implementation exposed it.
        self.confidence_scores = scores
        return scores
        

In [20]:
# Assemble the ensemble members, grouped by model family.
# NOTE(review): the NLTK `bayes` model is not part of this list - confirm
# that leaving it out of the vote is intentional.
naive_bayes_models = [gnb, mnb, cnb, bnb, catnb]
linear_models = [lr, sgd, svc]
classifiers = naive_bayes_models + linear_models

clf = VotingClassifier(classifiers=classifiers)

In [21]:
# Majority label and vote share for every review in the test set.
predictions = clf.vote_and_confidence(feature_set=test_set)

In [31]:
# check the output predictions
def evaluate(predictions, i, labelled_set=None):
    """Print predicted label, true label and vote confidence for one example.

    :param predictions: sequence of ``(label, confidence)`` tuples, with
        ``confidence`` as a fraction (it is printed multiplied by 100).
    :param i: index of the example to report.
    :param labelled_set: sequence of tuples whose element 1 is the true
        label, aligned with ``predictions``. Defaults to the notebook-level
        ``test_set`` so existing two-argument calls behave as before.
    :raises IndexError: if ``i`` is out of range for either sequence.
    """
    if labelled_set is None:
        # Backward-compatible default: fall back to the global held-out set.
        labelled_set = test_set
    true_label = labelled_set[i][1]
    predicted_label = predictions[i][0]
    confidence = predictions[i][1]
    print(f'''
    Model prediction for the test-review index:{i} is {predicted_label}
    \nThe true label is {true_label}\nConfidence:{confidence*100:.1f}
    ''')

In [32]:
# Spot-check the test review at index 0.
evaluate(predictions=predictions, i=0)


    Model prediction for the test-review index:0 is neg
    
The true label is neg
Confidence:100.0
    


In [33]:
# Spot-check the test review at index 10.
evaluate(predictions=predictions, i=10)


    Model prediction for the test-review index:10 is neg
    
The true label is neg
Confidence:100.0
    


In [34]:
# Spot-check the test review at index 90.
evaluate(predictions=predictions, i=90)


    Model prediction for the test-review index:90 is neg
    
The true label is neg
Confidence:100.0
    
