In [1]:
import nltk
import random
from nltk.corpus import movie_reviews

In [2]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [3]:
random.shuffle(documents)

In [4]:
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

In [5]:
all_words = nltk.FreqDist(all_words)

In [6]:
len(all_words)

39768

In [17]:
# all_words.items()

In [18]:
word_features = list(all_words.keys())[:3000]
# word_features: This extracts the top 3,000 most frequent words from the frequency distribution and stores them in a list. 
# These top 3,000 words will be used as the features to classify whether a review is positive or negative.

In [19]:
for feat in word_features[:20]:
    print(feat)

plot
:
two
teen
couples
go
to
a
church
party
,
drink
and
then
drive
.
they
get
into
an


In [20]:
#build a quick function that will find these top 3,000 words in our positive and negative documents,
#marking their presence as either positive(true) or negative(false):

def find_features(review):
    words = set(review)
    features = {}
    for w in word_features:
        features[w] = (w in words)     # w in words will be either True or False

    return features

In [21]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [22]:
len(featuresets)

2000

In [23]:
featuresets[:1]

[({'plot': True,
   ':': True,
   'two': False,
   'teen': False,
   'couples': False,
   'go': False,
   'to': True,
   'a': True,
   'church': False,
   'party': False,
   ',': True,
   'drink': False,
   'and': True,
   'then': False,
   'drive': False,
   '.': True,
   'they': True,
   'get': True,
   'into': True,
   'an': True,
   'accident': False,
   'one': True,
   'of': True,
   'the': True,
   'guys': False,
   'dies': False,
   'but': True,
   'his': True,
   'girlfriend': False,
   'continues': False,
   'see': False,
   'him': True,
   'in': True,
   'her': False,
   'life': False,
   'has': False,
   'nightmares': False,
   'what': False,
   "'": True,
   's': True,
   'deal': False,
   '?': True,
   'watch': True,
   'movie': True,
   '"': True,
   'sorta': False,
   'find': False,
   'out': True,
   'critique': False,
   'mind': False,
   '-': True,
   'fuck': False,
   'for': True,
   'generation': False,
   'that': True,
   'touches': False,
   'on': True,
   'very':

<br><br><br>
## Naive Bayes start:

In [24]:
0.8*3000

2400.0

### training set:

In [27]:
training_set = featuresets[:1900]
# training_set[0]

### testing set:

In [28]:
testing_set = featuresets[1900:]

<br><br><br>
### Creating classifier:

In [29]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

<br><br>
### Testing and accuracy:

In [31]:
print(f"Classifier accuracy: {nltk.classify.accuracy(classifier, testing_set)*100}%")

Classifier accuracy: 82.0%


<br><br>
### The most valuable words are when it comes to positive or negative reviews:

In [34]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =      9.4 : 1.0
                bothered = True              neg : pos    =      9.0 : 1.0
                  annual = True              pos : neg    =      9.0 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                 idiotic = True              neg : pos    =      7.3 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
               atrocious = True              neg : pos    =      6.6 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
                  shoddy = True              neg : pos    =      6.4 : 1.0

<br><br>
### Saving the created classifier:

In [35]:
import pickle

In [37]:
filename = "naivebayes.pickle"

# save model
pickle.dump(classifier, open(filename, "wb"))