In [None]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import re
import unicodedata
import os
from sklearn.metrics import f1_score
import random

'''
# Use this to download parts of speech tagging model
nltk.download('averaged_perceptron_tagger')
'''

In [2]:
# function to extract the adjectives from the reviews
def get_adjectives(words):
    tagged_words = nltk.pos_tag(words)
    stopwords = nltk.corpus.stopwords.words('english')
    words=[]
    for i in range(len(tagged_words)):
        if(tagged_words[i][0] not in stopwords and tagged_words[i][1] == 'JJ'): # removing stopwords and considering only adjectives
            words.append(tagged_words[i][0])
    return words

# normalize, remove punctuation, lemmatize and tokenize
def basic_clean(text):
    wnl = nltk.stem.WordNetLemmatizer()
    text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    words = get_adjectives(words)
    return [wnl.lemmatize(word) for word in words]

In [3]:
# Reading the reviews from the files
pos_files = os.listdir('aclImdb/train/pos')
neg_files = os.listdir('aclImdb/train/neg')
pos_reviews = []
for f in pos_files:
    pos_reviews.append(basic_clean(open('aclImdb/train/pos/' + f, 'r', encoding="utf8").read()))   
neg_reviews = []
for f in neg_files:
    neg_reviews.append(basic_clean(open('aclImdb/train/neg/' + f, 'r', encoding="utf8").read()))

In [4]:
# Accumulating all possible adjectives into list vocab
vocab = []
for review in pos_reviews:
    for word in review:    
        vocab.append(word)        
for review in neg_reviews:
    for word in review:    
        vocab.append(word)
print(len(vocab))

515683


In [5]:
# creating a frequency distribution of the combined vocabulary of adjectives
distribution = nltk.FreqDist(vocab)

# taking the 2000 most frequent adjectives
most_freq = distribution.most_common(2000)

# creating a list of just the most frequent adjectives from the frequency distribution
most_frequent_adjectives = []
for word in most_freq:
    most_frequent_adjectives.append(word[0])
    
print(len(most_frequent_adjectives))

2000


In [6]:
# Building features for the naive bayes classifier
def get_features(review):
    features = {}
    for w in most_frequent_adjectives:
        features[w] = (w in review)
    return features


'''
Preparing training data.
The training data is a list of tuples. Every tuple has a dictionary as it's first element and either 'pos'
or 'neg' as it's second element.
The dictionary is the features i.e. the dictionary is of the form {'boring': True, 'bad': True, 'fun': False, ...... }
showing which all words from the vocabulary are present in that particular review.
'''
train_set = []
for i in range(len(pos_reviews)):
    train_set.append((get_features(pos_reviews[i]), 'pos'))
for i in range(len(neg_reviews)):
    train_set.append((get_features(neg_reviews[i]), 'neg'))
    
random.shuffle(train_set)

In [7]:
# Fitting a Naive Bayes Classifier on the data
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [8]:
# reading testing data
pos_files_test = os.listdir('aclImdb/test/pos')
neg_files_test = os.listdir('aclImdb/test/neg')
pos_reviews_test = []
for f in pos_files_test:
    pos_reviews_test.append(basic_clean(open('aclImdb/test/pos/' + f, 'r', encoding="utf8").read()))   
neg_reviews_test = []
for f in neg_files_test:
    neg_reviews_test.append(basic_clean(open('aclImdb/test/neg/' + f, 'r', encoding="utf8").read()))
    
# preparing testing data
test_set = []
for i in range(len(pos_reviews_test)):
    test_set.append((get_features(pos_reviews_test[i]), 'pos'))
for i in range(len(neg_reviews_test)):
    test_set.append((get_features(neg_reviews_test[i]), 'neg'))

random.shuffle(test_set)

In [None]:
# calculating accuracy
#print("Classifier accuracy:", (nltk.classify.accuracy(classifier, test_set)) * 100)

In [9]:
classifier.show_most_informative_features(20)

Most Informative Features
                     uwe = True              neg : pos    =     33.7 : 1.0
             unwatchable = True              neg : pos    =     26.7 : 1.0
                 awfulbr = True              neg : pos    =     23.7 : 1.0
              incoherent = True              neg : pos    =     19.7 : 1.0
                  poorly = True              neg : pos    =     19.0 : 1.0
             influential = True              pos : neg    =     14.6 : 1.0
                  flimsy = True              neg : pos    =     13.8 : 1.0
                 unfunny = True              neg : pos    =     13.6 : 1.0
               redeeming = True              neg : pos    =     13.2 : 1.0
             astonishing = True              pos : neg    =     13.0 : 1.0
                     bug = True              pos : neg    =     13.0 : 1.0
                   worst = True              neg : pos    =     12.6 : 1.0
                  seagal = True              neg : pos    =     12.6 : 1.0

In [10]:
# finding predictions on test_set
preds = []
ground_truth = []
for i in range(len(test_set)):
    preds.append(classifier.classify(test_set[i][0]))
    ground_truth.append(test_set[i][1])

In [11]:
# function to calculate the goodness figures
def goodness_figures(ground_truth, preds):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    ACC = 0
    for i in range(len(preds)): 
        if ground_truth[i]==preds[i]=='pos':
           TP += 1
           ACC +=1
        if preds[i]=='pos' and ground_truth[i]=='neg':
           FP += 1
        if ground_truth[i]==preds[i]=='neg':
           TN += 1
           ACC +=1
        if preds[i]=='neg' and ground_truth[i]=='pos':
           FN += 1

    return(TP, FP, TN, FN, ACC/len(preds)*100)

In [12]:
scores = goodness_figures(ground_truth, preds)
print("True Positives: ", scores[0])
print("False Positives: ", scores[1])
print("True Negatives: ", scores[2])
print("False Negatives: ", scores[3])
print("Accuracy: ", scores[4], "%")

True Positives:  9678
False Positives:  2107
True Negatives:  10393
False Negatives:  2822
Accuracy:  80.284 %
