In [12]:
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.metrics.scores import (precision, recall)
import collections
import string
import random

# sources
# (textbook): https://www.nltk.org/book/ch06.html
# https://stackabuse.com/text-classification-with-python-and-scikit-learn/
# https://www.g2.com/products/nltk/competitors/alternatives

In [14]:
## pre-processing ##
# removes punctuation and all common stopwords in english
stop = set(stopwords.words('english') + list(string.punctuation))
noPunct_all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if w not in stop)


## eye-level-control comparrison of pre-processing outcome ##
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
old_words = list(all_words)[:10]
words = list(noPunct_all_words)[:10]
print(old_words)
print(words)

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in']
['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much']


In [16]:
## 3. Feature selection ##

# Document array
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)


# currently the 3000 most important words, the last words have higher occurance than the first words, comparing to the end for more matches, hence more relevant data.
word_features = list(noPunct_all_words)[:3000]

def get_features(document):
    document_words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in document_words)
    return features

## example useage
# print((get_features(movie_reviews.words('neg/cv000_29416.txt'))))

In [18]:
## Train Classifier ##

featuresets = [(get_features(d), c) for (d,c) in documents]

# "you should evaluate your model on 10% of movereview model, 10% 2000 = 200"
# evaluation-set
test_set = featuresets[:200]

# training-set, every set we can get excluding the test_set
train_set = featuresets[1800:]

## CHANGE CLASSIFER HERE ##
# naive bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Decision tree classifier
#classifier = nltk.DecisionTreeClassifier.train(train_set)
#print(classifier.pseudocode(depth=4)) # shows the beginning of the tree (might be slow)

# MaxentClassifier (dosen't give neg statistics)
# classifier = nltk.MaxentClassifier.train(train_set)


In [20]:

## data-struct for precision and recall ##

# known labels "golden labels" - correct labels from corpus
refsets = collections.defaultdict(set)

# classifier output using test_set - aiming to get the same labels as refset
testsets = collections.defaultdict(set)

# build data-structs & report document pos/neg
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats) 
    testsets[observed].add(i)
    print(f"Document {i+1}: is {observed}")


# use classifier on a single document like so:
# result = classifier.classify(some_document)

Document 1: is neg
Document 2: is neg
Document 3: is neg
Document 4: is pos
Document 5: is neg
Document 6: is pos
Document 7: is neg
Document 8: is neg
Document 9: is pos
Document 10: is pos
Document 11: is pos
Document 12: is pos
Document 13: is neg
Document 14: is neg
Document 15: is pos
Document 16: is neg
Document 17: is neg
Document 18: is pos
Document 19: is neg
Document 20: is neg
Document 21: is pos
Document 22: is pos
Document 23: is pos
Document 24: is pos
Document 25: is pos
Document 26: is pos
Document 27: is neg
Document 28: is neg
Document 29: is pos
Document 30: is pos
Document 31: is neg
Document 32: is neg
Document 33: is neg
Document 34: is neg
Document 35: is neg
Document 36: is pos
Document 37: is neg
Document 38: is neg
Document 39: is neg
Document 40: is pos
Document 41: is pos
Document 42: is neg
Document 43: is pos
Document 44: is neg
Document 45: is neg
Document 46: is pos
Document 47: is neg
Document 48: is neg
Document 49: is pos
Document 50: is pos
Document 

<h1>Example Doc output:</h1><br>
...<br>
Document 36: is pos<br>
Document 37: is neg<br>
Document 38: is pos<br>
Document 39: is neg<br>
Document 40: is pos<br>
Document 41: is neg<br>
Document 42: is neg<br>
Document 43: is pos<br>
Document 44: is pos<br>
...<br>

In [22]:
## Report positive class ##
print(f"Positive class statistics: ")

# report precision
pSum = precision(refsets['pos'], testsets['pos'])
print( f"Precision: {pSum*100:.2f}%")

# report recall
rSum = recall(refsets['pos'], testsets['pos'])
print( f"Recall: {rSum*100:.2f}%")

# report accuracy
print(f"Accuracy: {nltk.classify.accuracy(classifier, test_set)*100:.2f}%")

# report F-score
fScore = (2*(pSum*rSum)/(pSum+rSum))
print( f"F-score: {fScore*100:.2f}%")

Positive class statistics: 
Precision: 79.57%
Recall: 77.08%
Accuracy: 79.50%
F-score: 78.31%


In [23]:
## Report negative class ##
print(f"Negative class statistics: ")

# report precision
pSum = precision(refsets['neg'], testsets['neg'])
print( f"Precision: {pSum*100:.2f}%")

# report recall
rSum = recall(refsets['neg'], testsets['neg'])
print( f"Recall: {rSum*100:.2f}%")

# report accuracy
print(f"Accuracy: {nltk.classify.accuracy(classifier, test_set)*100:.2f}%")

# report F-score
fScore = (2*(pSum*rSum)/(pSum+rSum))
print( f"F-score: {fScore*100:.2f}%")

Negative class statistics: 
Precision: 79.44%
Recall: 81.73%
Accuracy: 79.50%
F-score: 80.57%


<h1>Example statistics: Naive bayes</h1><br>
positive class statistics:<br>
Precision: 80.85%<br>
Recall: 76.00%<br>
F-score: 78.35%<br>
Model Accuracy: 79.00%<br>
<br>
Negative class statistics: <br>
Precision: 77.36%<br>
Recall: 82.00%<br>
Model Accuracy: 79.00%<br>
F-score: 79.61%<br>
<br>
<br>
<h1>Example statistics: Decision Tree Classifier </h1><br>
Positive class statistics: <br>
Precision: 50.96%<br>
Recall: 53.00%<br>
Model Accuracy: 51.00%<br>
F-score: 51.96%<br>
<br>
Negative class statistics:<br>
Precision: 51.04%<br>
Recall: 49.00%<br>
Model Accuracy: 51.00%<br>
F-score: 50.00%<br>

In [24]:
# Extra Fun: highest value features in known set
classifier.show_most_informative_features(20)

Most Informative Features
               memorable = True              pos : neg    =      9.7 : 1.0
                 leaving = True              pos : neg    =      8.5 : 1.0
                   bland = True              neg : pos    =      8.1 : 1.0
                  normal = True              pos : neg    =      7.9 : 1.0
               substance = True              neg : pos    =      7.4 : 1.0
                attempts = True              neg : pos    =      6.2 : 1.0
                   worse = True              neg : pos    =      6.2 : 1.0
                   blame = True              neg : pos    =      6.0 : 1.0
                  jungle = True              neg : pos    =      6.0 : 1.0
                 routine = True              neg : pos    =      6.0 : 1.0
                    sick = True              neg : pos    =      6.0 : 1.0
                 subplot = True              neg : pos    =      6.0 : 1.0
                  allows = True              pos : neg    =      6.0 : 1.0