## <center> <h1> CS 401: Natural Language Processing</h1></center>
### <center> <h1> Project 3 </h1></center>
#### <center> <h1> Hannah & Anamay </h1></center>

### Question 1

In [1]:
import nltk
#nltk.download('movie_reviews')
#nltk.download('stopwords')
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize # implicitly calls punkt
import time
import math
import string

In [2]:
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
classes = movie_reviews.categories()

In [3]:
trainingSet = documents[100:900] + documents[1100:1900]
devSet = documents[900:1000] + documents[1900:]
testSet = documents[:100] + documents[1000:1100]

In [4]:
def wordCount(documents,cutoff):
    """
        Helper function that counts the frequencies of 
        unique words (vocabulary)
        Parameters:
         documents: list of all words in data
            cutoff: minimum word frequency to keep word
                    in the vocabulary
        Return value:
            dictionary of word frequencies
    """
    worder = {}
    worder2 = {}
    allwords = []
    stop = stopwords.words("english")
    stop = stop[:-43]  #removes conjunctions with negative connotations from NLTK's stopwords list
    stop.remove('not')
    stop.remove('nor')
    stop.remove('no')
    filters = stop+list(string.punctuation)
    if type(documents) == list:
        for d in documents:
            words = word_tokenize(d[0].lower().strip(),preserve_line=False)
            for word in words:
                if word not in filters: 
                    if word not in worder:
                        worder[word] = 0
                    worder[word] += 1
        for word in worder:
            if worder[word] > cutoff:
                worder2[word] = worder[word]
        return worder2
    else:
        words = word_tokenize(documents.lower().strip(),preserve_line=False)
        for word in words:
            if word not in filters:
                if word not in worder:
                    worder[word] = 0
                worder[word] += 1
                allwords.append(word)
        for word in worder:
            if worder[word] > cutoff:
                worder2[word] = worder[word]
        return (worder2, allwords)

In [5]:
def train(trainingSet,classes,cutoff):
    """
    Training function. 
    Trains the classifier by calculating log prior and likelihood
    Parameters:
        trainingSet: set of training documents
        classes: list of classes 
    Return value:
        logPrior: dict. of prior probabilities
        logLikelihood: dict. of probabilities
        Vocab: dict of words and frequencies
    """
    x = time.time()
    Vocab = wordCount(trainingSet,cutoff)
    class_freq = {}
    big_doc = {}
    log_prior = {}
    loglikelihood = {}
    train_size = len(trainingSet)
    for c in classes:
        class_freq[c] = 0
        big_doc[c] = """"""
        for each in trainingSet:
            if each[1] == c:
                class_freq[c] += 1
                big_doc[c] += each[0]
        log_prior[c] = math.log((class_freq[c]/train_size),2)
        word_counts,allwords = wordCount(big_doc[c],cutoff)
        for w in Vocab:
            if w not in word_counts:
                word_counts[w] = 0
            loglikelihood[w,c] = math.log((word_counts[w]+1)/(len(allwords)+len(Vocab)),2)
    print("Run-time: ",time.time()-x,"seconds")
    return log_prior,loglikelihood,Vocab

In [6]:
logPrior,logLikelihood,V = train(trainingSet,classes,8) # optimal cutoff found. See answer to Q3

Run-time:  34.66048884391785 seconds


In [7]:
logLikelihood

 ('spoilers', 'neg'): -14.260194280541203,
 ('included', 'neg'): -13.466645158008628,
 ('review', 'neg'): -11.079622034899382,
 ("n't", 'neg'): -6.742393684636767,
 ('really', 'neg'): -8.91205630633099,
 ('make', 'neg'): -8.829462974157213,
 ('much', 'neg'): -8.51066001287194,
 ('difference', 'neg'): -13.359729954092117,
 ('deep', 'neg'): -11.412197373986253,
 ('impact', 'neg'): -12.582122375428563,
 ('begins', 'neg'): -11.05856041937155,
 ('official', 'neg'): -13.77476745337096,
 ('summer', 'neg'): -11.01733775664504,
 ('movie', 'neg'): -6.870168669270432,
 ('season', 'neg'): -13.707653257512423,
 ('also', 'neg'): -8.91205630633099,
 ('brings', 'neg'): -12.212888565762846,
 ('back', 'neg'): -9.635703415633408,
 ('memories', 'neg'): -13.582122375428565,
 ('1997', 'neg'): -12.881682657287472,
 ('remember', 'neg'): -11.919157362706134,
 ('dante', 'neg'): -13.997159874707409,
 ("'s", 'neg'): -5.415122462688867,
 ('peak', 'neg'): -14.079622034899382,
 ('came', 'neg'): -11.740820121447623,


In [7]:
def test(testDoc, logPrior, logLikelihood, classes, vocab):
    """
    Test Function.
    Tests the classifier using the logPrior and logLikelihood calculated by the training function
    Parameters:
        testdoc: doc to be classified
        logPrior: dict. of prior probabilities, return value of training func
        logLikelihood: dict. of probabilities, return value of traing func
        classes: list of classes
        vocab: dict of words and frequencies, return value of training func
    Return value:
        tuple of class picked by classifier and true class of the testDoc
    """
    likely = {}
    words = word_tokenize(testDoc[0].lower().strip(),preserve_line=False)
    for c in classes:
        likely[c] = logPrior[c]
        for word in words:
            if word in vocab:
                likely[c] += logLikelihood[word,c]
    if likely[classes[0]] > likely[classes[1]]:
        pick = classes[0]
    else:
        pick = classes[1]
    return (pick,testDoc[1])

### Question 2

In [8]:
def testCorpus(testSet, logPrior, logLikelihood, classes, vocab):
    """
    Test Corpus
    Repeatedly calls the test function on a set of test documents and calculates
    the recall, precision, and accuracy
    Parameters:
        testSet: set of doc 
        logPrior: dict. of prior probabilities, return value of training func
        logLikelihood: dict. of probabilities, return value of traing func
        classes: list of classes
        vocab: dict of words and frequencies, return value of training func
    Return value:
        Recall: recall value
        Precision: precision value
        Accuracy: accuracy value
        matrix: pos/neg values
    """
    matrix = {}
    matrix['TP'] = 0 # True positives
    matrix['FP'] = 0 # False positives
    matrix['FN'] = 0 # False negatives
    matrix['TN'] = 0 # True negatives
    for testDoc in testSet:
        result = test(testDoc,logPrior,logLikelihood,classes,vocab)
        if result[0]==result[1]: # if both are the same:
            if result[0]=="neg": # both negative
                matrix['TN'] += 1
            else:                # both positive
                matrix['TP'] += 1
        else:                    # if different:
            if result[0]=="neg":# system negative, gold positive
                matrix['FN'] += 1
            else:                # system positive, gold negative
                matrix['FP'] += 1
    Recall = matrix['TP']/(matrix['TP']+matrix['FN'])
    Precision = matrix['TP']/(matrix['TP']+matrix['FP'])
    Accuracy = (matrix['TP']+matrix['TN'])/(matrix['TP']+matrix['TN']+matrix['FP']+matrix['FN'])
    return Recall,Precision,Accuracy,matrix

In [9]:
recall,precision,acc,metric = testCorpus(devSet,logPrior,logLikelihood,classes,V)
print("Dev set performance.")
print("Recall: ",recall)
print("Precision: ",precision)
print("Accuracy: ",acc)
print("Final Matrix: ",metric)

Dev set performance.
Recall:  0.81
Precision:  0.7714285714285715
Accuracy:  0.785
Final Matrix:  {'TP': 81, 'FP': 24, 'FN': 19, 'TN': 76}


In [10]:
recall,precision,acc,metric = testCorpus(testSet,logPrior,logLikelihood,classes,V)
print("Test set performance.")
print("Recall: ",recall)
print("Precision: ",precision)
print("Accuracy: ",acc)
print("Final Matrix: ",metric)

Test set performance.
Recall:  0.82
Precision:  0.7454545454545455
Accuracy:  0.77
Final Matrix:  {'TP': 82, 'FP': 28, 'FN': 18, 'TN': 72}


In [None]:
# Grid search for best value of word frequency cutoff.
#stats = []
#for i in range(0,30):
#    logPrior,logLikelihood,V = train(trainingSet,classes,i)
#    recall,precision,acc,metric = testCorpus(testSet,logPrior,logLikelihood,classes,V)
#    print("Iteration number: ",i)
#    print("Recall: ",recall)
#    print("Precision: ",precision)
#    print("Accuracy: ",acc)
#    print("Final Matrix: ",metric)
#    stats.append((recall,precision,acc,metric))

### Question 3

   In order to train our naive bayes classifier we start by computing the frequencies of words in the set of training documents. We also added an additional parameter `cutoff` that removes words from the vocabulary that occur less than a certain value (8)in the training data. Then using those frequencies, our function computes the log prior probabilites and the log likelihoods for each word in the document for each class. The test function analyzes a document and determines the most probable class for it. For our baseline, with the cutoff set to 0 and with no word pre-processing, we got 77% recall, precision and accuracy. 

   In order to increase the performance of our classifier we decided to remove the redundant high-frequency words that were possibly skewing the loglikelihood (in the denominator). One such group of words are stop words which are commonly used words that do not indicate sentiment. We used nltk's built in set of stop words, though we removed the 'not' conjunctions as those could potentially indicate negative sentiment. We also removed punctuations as they do not indicate sentiment either. However, taking out the stop words had virtually no effect on the performance. Cutoffs, on the other hand, had an interesting effect on the performance; recall tended to go up as cutoff values were increased while precision went down (accuracy stayed about the same). A higher recall implies lower false negatives and higher false positives (lower precision). This recall-precision trade-off differs across the task at hand. For the movie reviews classification task, we decided that classifying positive reviews as negative was worse than the other way around so we chose to optimize recall over precision (minimizing false negatives). After performing a grid search for different cutoff values, we found out that cutoff=8 (i.e. including only words that occur more than 8 times) maximized recall while maintaining a reasonable precision and accuracy.
   
   In order to check our results, we also built a Naive Bayes classifier using python's sklearn (see below). The classifier gave a maximum accuracy of 71% which is lower compared to our classifier's 77%.

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
from nltk.corpus import movie_reviews
vect = CountVectorizer(min_df = 10, stop_words = 'english')
classes = movie_reviews.categories()   # ['pos', 'neg']
text = []
targets = []
for category in classes:
    for fileid in movie_reviews.fileids(category):
        text.append(movie_reviews.raw(fileid))
        targets.append(category)
text_train = text[:800] + text[1000:1800]
targets_train = targets[:800] + targets[1000:1800]
vect.fit(text_train)  
text_test = text[800:1000] + text[1800:]
targets_test = targets[800:1000] + targets[1800:]
X_train = vect.transform(text_train)
X_test = vect.transform(text_test)
gnb = GaussianNB()
y_pred = gnb.fit(X_train.toarray(),targets_train)
print(y_pred.score(X_test.toarray(),targets_test))

0.705
