In [8]:
import nltk
import random
import pickle
import pathlib
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews

## Dataset

In [3]:
# Build the dataset from the NLTK movie-review corpus: one (words, label) pair
# per review file, where the label is the corpus category of that file.
documents = [
    (movie_reviews.words(fileids=fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# total documents
print('total documents in our dataset',len(documents))

# Shuffle so both labels are mixed before the positional train/test split.
# A dedicated, seeded generator keeps the order (and therefore the split and
# the reported accuracy) identical across "Restart & Run All" without touching
# the global `random` state.
SEED = 42
random.Random(SEED).shuffle(documents)

# Let's check the first five docs + labels
documents[:5]

total documents in our dataset 2000


[(['it', "'", 's', 'now', 'the', 'anniversary', 'of', ...], 'neg'),
 (['my', 'summer', 'was', 'recently', 'saved', 'by', ...], 'pos'),
 (['just', 'look', 'back', 'two', 'years', 'ago', 'at', ...], 'pos'),
 (['it', "'", 's', 'difficult', 'to', 'expect', 'much', ...], 'neg'),
 (['roberto', 'benigni', 'is', 'a', 'clown', 'in', 'the', ...], 'pos')]

## Vocabulary 

In [4]:
# Every token in the movie-review corpus, in corpus order (repeats included).
all_tokens = movie_reviews.words()
# NOTE: this is the total number of tokens, not the number of distinct words.
print('Length of vocabulary',len(all_tokens))

# stopwords
english_stopwords = stopwords.words('english')
print('STOP WORDS',english_stopwords[:5])

# Improved vocabulary for the movie-review classification problem: drop stopwords.
# Membership is tested against a set; testing every token against the stopword
# *list* is a linear scan per token.
stopword_set = set(english_stopwords)
vocab = [word for word in all_tokens if word not in stopword_set]


# Let's check the distribution of words.
# Punctuation tokens are not stopwords, so they survive the filter above.
distribution = nltk.FreqDist(vocab)
print('Most common words in the dataset',distribution.most_common(10))

# length of vocab
print('Length of vocabulary after removing stop words',len(vocab))

Length of vocabulary 1583820
STOP WORDS ['i', 'me', 'my', 'myself', 'we']
Most common words in the dataset [(',', 77717), ('.', 65876), ("'", 30585), ('"', 17612), ('-', 15595), (')', 11781), ('(', 11664), ('film', 9517), ('one', 5852), ('movie', 5771)]
Length of vocabulary after removing stop words 955610


## Creating Feature vector

In [5]:
### Take the 500 most frequent tokens of the filtered vocabulary as features.
# most_common(500) yields (token, count) pairs; only the tokens are kept.
frequent_500 = [token for token, _count in distribution.most_common(500)]


## Multi hot vectors
# Each review becomes a dict {token: bool} with one entry per token in
# `frequent_500`: True if the token occurs in the review, False otherwise.
feature_vectors = []

for review, sentiment in documents:
    # A set gives constant-time membership checks and avoids relying on a
    # KeyError (previously caught by a bare `except:`) to detect absent words.
    review_words = set(review)
    multi_hot_vector = {word: word in review_words for word in frequent_500}
    feature_vectors.append((multi_hot_vector, sentiment))


# Sanity check: exactly one feature vector per document.
assert len(documents) == len(feature_vectors)

## Train Test Split

In [6]:
# Positional split: the first 1900 feature vectors train the model and the
# remainder is held out for evaluation.
split_at = 1900
train_data, test_data = feature_vectors[:split_at], feature_vectors[split_at:]

## Bayes Classifier

In [7]:
# Fit an NLTK Naive Bayes classifier on the training split.
bayes = nltk.NaiveBayesClassifier.train(train_data)

# Evaluate it on the held-out split.
accuracy = nltk.classify.accuracy(bayes, test_data)
print('Accuracy : ', accuracy)

Accuracy :  0.78


## Save the model

In [9]:
# Directory for pickled models, relative to the notebook's working directory.
path = pathlib.Path('./saved_models/')
# Echo the resolved Path object as the cell output.
path

WindowsPath('saved_models')

In [15]:
# Make sure the target directory exists: open(..., 'wb') does not create
# missing parent directories and raises FileNotFoundError without it.
path.mkdir(parents=True, exist_ok=True)

# Serialize the trained classifier to disk.
with open(path/'bayes_classifier.pickle','wb') as f:
    pickle.dump(bayes, f)


## Load the model

In [17]:
# Read the pickled classifier back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust
# (here, the file written by this notebook).
reloaded_bayes = pickle.loads((path / 'bayes_classifier.pickle').read_bytes())

In [20]:
# Re-score the reloaded model on the same held-out split; the result is
# expected to match the accuracy of the in-memory model.
accuracy = nltk.classify.accuracy(reloaded_bayes, test_data)
print('Accuracy : ', accuracy)

Accuracy :  0.78


In [23]:
# Show the 10 most informative features, i.e. the words whose presence or
# absence the classifier finds most useful.
reloaded_bayes.most_informative_features(n=10)

[('worst', True),
 ('stupid', True),
 ('boring', True),
 ('perfect', True),
 ('supposed', True),
 ('worse', True),
 ('none', True),
 ('others', True),
 ('oscar', True),
 ('performances', True)]

## Let's use the model for classifying a review

In [25]:
# Take the first held-out example: its feature dict and its true label.
review, true_label = test_data[0]

# Ask the reloaded classifier for its label and compare it with the truth.
prediction = reloaded_bayes.classify(review)
print("Model prediction", prediction)
print("True label", true_label)

Model prediction pos
True label pos
