In [None]:
import nltk

In [None]:
nltk.download("movie_reviews")

In [None]:
from nltk.corpus import movie_reviews

The `fileids` method provided by all the datasets in `nltk.corpus` gives access to a list of all the files available.

In particular, the `movie_reviews` dataset contains 2000 text files, each of which is a review of a movie. They are already split into a `neg` folder for the negative reviews and a `pos` folder for the positive reviews:

In [None]:
len(movie_reviews.fileids())

In [None]:
movie_reviews.fileids()[:5]

In [None]:
movie_reviews.fileids()[-5:]

In [None]:
negative_fileids = movie_reviews.fileids('neg')
positive_fileids = movie_reviews.fileids('pos')

In [None]:
len(negative_fileids), len(positive_fileids)

In [None]:
print(movie_reviews.raw(fileids=positive_fileids[0]))

In [None]:
nltk.download("punkt")

In [None]:
# The following cells read `romeo_words`, which is not assigned anywhere else;
# bind it to the first positive review's tokens so those cells can run.
romeo_words = movie_reviews.words(fileids=positive_fileids[0])
romeo_words

In [None]:
{word:True for word in romeo_words}

In [None]:
type(_)

In [None]:
def build_bag_of_words_features(words):
    """Return a dict that maps every distinct item of ``words`` to ``True``.

    Repeated items collapse into a single key; keys keep first-seen order.
    """
    return dict.fromkeys(words, True)

In [None]:
build_bag_of_words_features(romeo_words)

In [None]:
nltk.download("stopwords")

In [None]:
import string

In [None]:
string.punctuation

In [None]:
# Stored as a set: this collection is only used for membership tests, and those
# run once per corpus word, so O(1) lookups matter.
useless_words = set(nltk.corpus.stopwords.words("english")) | set(string.punctuation)

In [None]:
def build_bag_of_words_features_filtered(words, excluded=None):
    """Return a bag-of-words dict for ``words`` with excluded tokens dropped.

    Args:
        words: Iterable of hashable tokens.
        excluded: Optional iterable of hashable tokens to leave out. When
            omitted, the module-level ``useless_words`` collection is used.

    Returns:
        A dict mapping each kept token to ``1``; repeated tokens collapse
        into a single key.
    """
    # Build the lookup once per call so every membership test is O(1) rather
    # than a linear scan over the exclusion collection.
    skip = frozenset(useless_words if excluded is None else excluded)
    return {word: 1 for word in words if word not in skip}

In [None]:
all_words = movie_reviews.words()
len(all_words)/1e6

In [None]:
filtered_words = [word for word in movie_reviews.words() if not word in useless_words]
type(filtered_words)

In [None]:
len(filtered_words)/1e6

In [None]:
from collections import Counter

word_counter = Counter(filtered_words)

In [None]:
# Ask the Counter for the top 10 directly instead of ranking every word and slicing.
most_common_words = word_counter.most_common(10)

In [None]:
most_common_words

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# sorted() accepts any iterable, so no intermediate list copy is needed.
sorted_word_counts = sorted(word_counter.values(), reverse=True)

plt.loglog(sorted_word_counts)
plt.ylabel("Freq")
plt.xlabel("Word Rank");

Another related plot is the histogram of `sorted_word_counts`, which displays how many words have a count in a specific range.

Of course, the distribution is highly peaked at low counts, i.e. most of the words appear with a low count, so it is better to display it on semilogarithmic axes to inspect the tail of the distribution.

In [None]:
plt.hist(sorted_word_counts, bins=50);

In [None]:
plt.hist(sorted_word_counts, bins=50, log=True);

## Train a Classifier for Sentiment Analysis

Using our `build_bag_of_words_features_filtered` function, we can build the negative and positive features separately.
Basically, for each of the 1000 negative and the 1000 positive reviews, we create one dictionary of its words and associate the label "neg" or "pos" with it.

In [None]:
negative_features = [
    (build_bag_of_words_features_filtered(movie_reviews.words(fileids=[f])), 'neg') \
    for f in negative_fileids
]

In [None]:
print(negative_features[3])

In [None]:
positive_features = [
    (build_bag_of_words_features_filtered(movie_reviews.words(fileids=[f])), 'pos') \
    for f in positive_fileids
]

In [None]:
print(positive_features[6])

In [None]:
from nltk.classify import NaiveBayesClassifier

One of the simplest supervised machine learning classifiers is the Naive Bayes Classifier. It can be trained on 80% of the data to learn which words are generally associated with positive or with negative reviews.

In [None]:
split = 800

In [None]:
sentiment_classifier = NaiveBayesClassifier.train(positive_features[:split]+negative_features[:split])

After training, we can check the accuracy on the training set, i.e. the same data used for training. We expect this to be a very high number because the algorithm has already "seen" those data. Accuracy is the fraction of the data that is classified correctly; we can turn it into a percentage:

In [None]:
nltk.classify.util.accuracy(sentiment_classifier, positive_features[:split]+negative_features[:split])*100

The accuracy above is mostly a check that nothing went badly wrong in the training. The real measure of accuracy is on the remaining 20% of the data that wasn't used in training, the test data:

In [None]:
nltk.classify.util.accuracy(sentiment_classifier, positive_features[split:]+negative_features[split:])*100

Accuracy here is around 70%, which is pretty good for such a simple model if we consider that the estimated accuracy for a person is about 80%.
Finally, we can print the most informative features, i.e. the words that most strongly identify a positive or a negative review:

In [None]:
sentiment_classifier.show_most_informative_features()