## Import Libraries

In [52]:
import collections
import nltk.classify.util, nltk.metrics
from nltk import precision, recall
from nltk.classify import NaiveBayesClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn import svm
from sklearn.svm import LinearSVC
from nltk import precision
import string
from tabulate import tabulate         
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

## Load Data

In [34]:
train_path='aclImdb/train'
train_data=CategorizedPlaintextCorpusReader(train_path,r'(pos|neg)/.*\.txt',cat_pattern=r'(pos|neg)/.*\.txt')
test_path='aclImdb/test'
test_data=CategorizedPlaintextCorpusReader(test_path,r'(pos|neg)/.*\.txt',cat_pattern=r'(pos|neg)/.*\.txt')

In [35]:
negative_train_id = train_data.fileids('neg')
positive_train_id = train_data.fileids('pos')
negative_test_id = test_data.fileids('neg')
positive_test_id = test_data.fileids('pos')

negative_train = [(word_feats(train_data.words(fileids=[f])), 'neg') for f in negative_train_id]
positive_train = [(word_feats(train_data.words(fileids=[f])), 'pos') for f in positive_train_id]
negative_test = [(word_feats(test_data.words(fileids=[f])), 'neg') for f in negative_test_id]
positive_test = [(word_feats(test_data.words(fileids=[f])), 'pos') for f in positive_test_id]

## Train Model

In [36]:
train_data = positive_train + negative_train

In [37]:
test_data = positive_test + negative_test

### Naive Bayes Classification

In [38]:
Naive_classifier = NaiveBayesClassifier.train(train_data)
refsets = collections.defaultdict(set)
testsets_Naive = collections.defaultdict(set)

In [47]:
for i, (text, label) in enumerate(test_data):
        refsets[label].add(i)           
        observed_Naive = Naive_classifier.classify(text)
        testsets_Naive[observed_Naive].add(i)

### Accuracy and Precision

In [54]:
accuracy = nltk.classify.util.accuracy(Naive_classifier, test_data)  
print("Accuracy of Naive Classifier Model: %0.2f" % (accuracy*100) + "%")
positive_precision = precision(refsets['pos'], testsets_Naive['pos'])
print("Precision of Positive Review of Naive Classifier Model: %0.2f" % (positive_precision*100) + "%")
positive_recall = recall(refsets['pos'], testsets_Naive['pos'])
negative_precision = precision(refsets['neg'], testsets_Naive['neg'])
print("Precision of Negative Review of Naive Classifier Model: %0.2f" % (negative_precision*100) + "%")
negative_recall = recall(refsets['neg'], testsets_Naive['neg'])

Accuracy of Naive Classifier Model: 82.66%
Precision of Positive Review of Naive Classifier Model: 86.71%
Precision of Negative Review of Naive Classifier Model: 79.41%


In [56]:
Naive_classifier.show_most_informative_features(10)

Most Informative Features
                   Avoid = True              neg : pos    =     97.0 : 1.0
                    Boll = True              neg : pos    =     37.7 : 1.0
                     Uwe = True              neg : pos    =     36.3 : 1.0
                 stinker = True              neg : pos    =     28.1 : 1.0
                   WORST = True              neg : pos    =     27.8 : 1.0
                  Paulie = True              pos : neg    =     24.3 : 1.0
               awfulness = True              neg : pos    =     23.7 : 1.0
             excellently = True              pos : neg    =     22.2 : 1.0
                  Capote = True              pos : neg    =     21.7 : 1.0
             unwatchable = True              neg : pos    =     21.7 : 1.0


### SVM Model


In [68]:
classifier = nltk.classify.SklearnClassifier(LinearSVC(max_iter=100000))
SVM_classifier = classifier.train(train_data)
refsets = collections.defaultdict(set)
SVM_testset = collections.defaultdict(set)

In [72]:
for i, (text, label) in enumerate(test_data):
        refsets[label].add(i)           
        SVM_observe = classifier.classify(text)
        SVM_testset[SVM_observe].add(i)

In [74]:
accuracy = nltk.classify.util.accuracy(classifier, test_data)  
print("Accuracy of SVM Model: %0.2f" % (accuracy*100) + "%")
positive_precision = precision(refsets['pos'], SVM_testset['pos'])
print("Precision of Positive Review of SVM Model: %0.2f" % (positive_precision*100) + "%")
positive_recall = recall(refsets['pos'], SVM_testset['pos'])
negative_precision = precision(refsets['neg'], SVM_testset['neg'])
print(negative_precision)
negative_recall = recall(refsets['neg'], SVM_testset['neg'])

Accuracy of SVM Model: 85.78%
Precision of Positive Review of SVM Model: 50.00%
None


## Decision Tree Model

In [17]:
train_negcutoff = len(train_negfeats)*1/100
train_poscutoff = len(train_posfeats)*1/100
trainfeats_Decision = train_negfeats[:train_negcutoff] + train_posfeats[:train_poscutoff]
DecisionTree_classifier = DecisionTreeClassifier.train(trainfeats_Decision)
refsets = collections.defaultdict(set)
testsets_Decision = collections.defaultdict(set)

TypeError: slice indices must be integers or None or have an __index__ method

In [None]:
for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)           
        observed_Decision = DecisionTree_classifier.classify(feats)
        testsets_Decision[observed_Decision].add(i)

In [None]:
accuracy3 = nltk.classify.util.accuracy(DecisionTree_classifier, testfeats)  
pos_precision3 = nltk.metrics.precision(refsets['pos'], testsets_Decision['pos'])
pos_recall3 = nltk.metrics.recall(refsets['pos'], testsets_Decision['pos'])
neg_precision3 = nltk.metrics.precision(refsets['neg'], testsets_Decision['neg'])
neg_recall3 = nltk.metrics.recall(refsets['neg'], testsets_Decision['neg'])

In [10]:
def evaluate_classifier_Decision(featx):
    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')
    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]
    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    train_negcutoff = len(train_negfeats)*1/100
    train_poscutoff = len(train_posfeats)*1/100
    trainfeats_Decision = train_negfeats[:train_negcutoff] + train_posfeats[:train_poscutoff]
    DecisionTree_classifier = DecisionTreeClassifier.train(trainfeats_Decision)
    refsets = collections.defaultdict(set)
    testsets_Decision = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)           
            observed_Decision = DecisionTree_classifier.classify(feats)
            testsets_Decision[observed_Decision].add(i)

    accuracy3 = nltk.classify.util.accuracy(DecisionTree_classifier, testfeats)  
    pos_precision3 = nltk.metrics.precision(refsets['pos'], testsets_Decision['pos'])
    pos_recall3 = nltk.metrics.recall(refsets['pos'], testsets_Decision['pos'])
    neg_precision3 = nltk.metrics.precision(refsets['neg'], testsets_Decision['neg'])
    neg_recall3 = nltk.metrics.recall(refsets['neg'], testsets_Decision['neg'])

    return(['DecisionTree',accuracy3,pos_precision3,pos_recall3,neg_precision3,neg_recall3])


In [None]:
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    words_nopunc = [word for word in words if word not in string.punctuation]
    bigram_finder = BigramCollocationFinder.from_words(words_nopunc)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words_nopunc, bigrams)])


In [None]:
table2 = []
table2.append(evaluate_classifier_Naive(bigram_word_feats))
table2.append(evaluate_classifier_SVM(bigram_word_feats))
table2.append(evaluate_classifier_Decision(bigram_word_feats))
 
print('Bigram word features:')
print(tabulate(table2, headers=["Classifier","Accuracy","Positive precision", "Positive recall", "Negative precision", "Negative recall"]))