In [3]:
#!python -m nltk.downloader all

In [4]:
import nltk
from nltk.corpus import movie_reviews
import random

documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [6]:
len(documents)

2000

In [7]:
# Define the feature extractor

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [8]:
# Train Naive Bayes classifier
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)


In [9]:
# Test the classifier
print(nltk.classify.accuracy(classifier, test_set))

0.82


In [12]:
# Show the most important features as interpreted by Naive Bayes
classifier.show_most_informative_features(10)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.9 : 1.0
   contains(wonderfully) = True              pos : neg    =      8.5 : 1.0
         contains(mulan) = True              pos : neg    =      8.2 : 1.0
        contains(seagal) = True              neg : pos    =      7.9 : 1.0
         contains(damon) = True              pos : neg    =      6.3 : 1.0
          contains(lame) = True              neg : pos    =      6.0 : 1.0
         contains(flynt) = True              pos : neg    =      5.6 : 1.0
           contains(era) = True              pos : neg    =      5.5 : 1.0
        contains(wasted) = True              neg : pos    =      5.3 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.3 : 1.0
