In [162]:
import sys
import time
import math
import copy
import random
import string
import glob
import numpy as np
import nltk.sentiment.util
from tqdm import tqdm
from os import listdir
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.metrics import *



# nltk.download('stopwords')

In [163]:
def loadReviewNames(posDir, negDir):
    positive = glob.glob(posDir)
    negative = glob.glob(negDir)
#     print(positive)
#     print(negative)
    return positive, negative

In [164]:
#loads dataset in a way that will work with scikit
def getPosNegReviews(directory):
    positive, negative = loadReviewNames(directory + '/pos/*',directory + '/neg/*')
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    
    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    return trainPos,trainNeg,testPos,testNeg

In [165]:
def loadDir(name):
    # Loads the files in the folder and returns a list of lists of words from the text in each file
    if Stemming:
        porter_stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    data = []
    count = 0
    for f in tqdm(listdir(name)):
        fullname = name+f
        text = []
        with open(fullname, 'rb') as f:
            for line in f:
                if Lowercase:
                    line = line.decode(errors='ignore').lower()
                    text += tokenizer.tokenize(line)
                else:
                    text += tokenizer.tokenize(line.decode(errors='ignore'))
        if Stemming:
            for i in range(len(text)):
#                 if text[i] in bad_words:
#                     continue
                text[i] = porter_stemmer.stem(text[i])
        data.append(text)
        count = count + 1
    return data

In [166]:
#loads dataset in a way that will work with my unigram Naive bayes implementation
def loadDatasetNB(directory):
    positive = loadDir(directory + '/pos/')
    negative = loadDir(directory + '/neg/')
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    combinedTrain = trainPos + trainNeg
    length = len(trainPos) + len(trainNeg)
    labelsTrain = len(trainNeg) * [1] + len(trainNeg) * [0]
    labelsTrain = np.array(labelsTrain)

    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    combinedTest = testPos + testNeg
    labelsTest = len(testPos) * [1] + len(testNeg) * [0]
    labelsTest = np.array(labelsTest)
    return combinedTrain, labelsTrain, combinedTest, labelsTest

In [167]:
#helper function for unigram Naive bayes implementation
def bagOfWordsNB(train_set, train_labels):
    stops = stopwords.words('english') + list(string.punctuation)
    mydict = {}
    smoothing_parameter = 0.034
    posV = 0
    negV = 0
    totalposwords = 0
    totalnegwords = 0

#     start = time.process_time()

    #create bag of words and number of occurences
    count = 0
    if Stop:
        for x in train_set:
            rating = train_labels[count]
            count += 1
            if(rating):
                for y in x:
                    if y not in mydict and y not in stops:
                        mydict[y] = [1,0] #default [1 pos, 0 neg]
                        posV += 1
                        totalposwords += 1
                    elif y not in stops:
                        if mydict[y][0] == 0:
                            posV += 1
                        mydict[y][0] += 1
                        totalposwords += 1
            else:
                for y in x:
                    if y not in mydict and y not in stops:
                        mydict[y] = [0,1] #default [0 pos, 1 neg]
                        negV += 1
                        totalnegwords += 1
                    elif y not in stops:
                        if mydict[y][1] == 0:
                            negV += 1
                        mydict[y][1] += 1
                        totalnegwords += 1
    else:
        for x in train_set:
            rating = train_labels[count]
            count += 1
            if(rating):
                for y in x:
                    if y not in mydict:
                        mydict[y] = [1,0] #default [1 pos, 0 neg]
                        posV += 1
                        totalposwords += 1
                    else:
                        if mydict[y][0] == 0:
                            posV += 1
                        mydict[y][0] += 1
                        totalposwords += 1
            else:
                for y in x:
                    if y not in mydict:
                        mydict[y] = [0,1] #default [0 pos, 1 neg]
                        negV += 1
                        totalnegwords += 1
                    else:
                        if mydict[y][1] == 0:
                            negV += 1
                        mydict[y][1] += 1
                        totalnegwords += 1
#     print("review count is: ", count)
#     print("posV", posV)
#     print("negV", negV)
#     print("total word count is:", totalposwords + totalnegwords)
    BOW = mydict, posV, negV, totalposwords, totalnegwords
    return BOW

In [168]:
def naiveBayes(train_set, train_labels, dev_set):
    #Baseline#
    # return predicted labels of development set
    # print("not even started yet")
    smoothing_parameter = 0.034

#     start = time.process_time()

#     print("Going through train set took: ", time.process_time() - start)
    mydict, posV, negV, totalposwords, totalnegwords = bagOfWordsNB(train_set, train_labels)
    
    
    #come up with the bag of words unigram model
    probWordPos = {}
    probWordNeg = {}

    for x in mydict:
        #use laplace smoothing
        # count(W) + a / n + a * (V+1)
        # n = number of words in our training data
        # count(W) = number of times W appeared in training data
        # α is a tuning constant between 0 and 1 (typically small)
        # V = number of word TYPES seen in training data
        if(Smoothing):
            probWordPos[x] = math.log((mydict[x][0] + smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
            probWordNeg[x] = math.log((mydict[x][1] + smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        else:
            probWordPos[x] = math.log((mydict[x][0]) / (totalposwords))
            probWordNeg[x] = math.log((mydict[x][1]) / (totalnegwords))
#     start = time.process_time()

    # dev set
    predictions = []
    for x in range(len(dev_set)):
        chancePos = 0
        chanceNeg = 0 
        if(Prior):
            chancePos += math.log(posPrior)
            chanceNeg += math.log(1-posPrior)
        for y in range(len(dev_set[x])):
            if dev_set[x][y] in mydict:
                chancePos += probWordPos[dev_set[x][y]]
                chanceNeg += probWordNeg[dev_set[x][y]]
            # else:
                # chancePos += math.log((smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
                # chanceNeg += math.log((smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        if(chancePos > chanceNeg):
            predictions.append(1)
        else:
            predictions.append(0)
#     print("devset time took:", time.process_time() - start)
    return predictions

In [169]:
def compute_accuraciesNB(predictedLabels, dev_set, dev_labels):
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [170]:
def compute_accuraciesSK(predictedLabels):
    dev_labels = [1]*200+[0]*200
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [171]:
#### scikit pre processing methods ####

In [172]:
def bagOfWordsSK(reviews):
    word_bag = {}
    stops = stopwords.words('english') + list(string.punctuation)
    for review in reviews:
        with open(review, 'r') as f:
            line = f.read()
            line = line.split(" ")
            if Stop:
                line = [word for word in line if word not in set(stops)]
            if Alpha:
                line = [word for word in line if word.isalpha()]
            for word in line:
                word_bag[word] = word_bag.get(word, 0) + 1
    return word_bag

In [173]:
def trimBags(pWbag, nWbag):
    posList = {}
    negList = {}
    print("prefilter")
    print(len(pWbag))
    print(len(nWbag))
    #force words to only exist in either positive or negative bags
    if(Unique):
        for key in nWbag.keys():
            if key in pWbag.keys():
                posFreq = pWbag[key]
                negFreq = nWbag[key]
                if int(posFreq) >= int(negFreq):
                    posList[key] = pWbag[key]
                elif int(posFreq) < int(negFreq):
                    negList[key] = nWbag[key]
            else:
                negList[key] = nWbag[key]
        for key in pWbag.keys():
            if key not in nWbag.keys():
                posList[key] = pWbag[key]
    else:
        posList = pWbag
        negList = nWbag
    print("first filter")
    print(len(posList))
    print(len(negList))
    #determine the boundary size
    max_length=min(len(negList),len(posList))
    print("max_length")
    print(max_length)
    #sort the keys of the map into a list, sorting to get most frequently used words
    sortedNegList = sorted(negList, key=negList.get, reverse=True)[:max_length]
    sortedPosList = sorted(posList, key=posList.get, reverse=True)[:max_length]
    print("sorted max filter")
    print(len(sortedNegList))
    print(len(sortedPosList))
    #turn the sorted list into a usable mapping again
    new_neg= {k:negList[k] for k in sortedNegList}
    new_pos = {k:posList[k] for k in sortedPosList}
#     print("k filter")
#     print(len(new_pos))
#     print(len(new_neg))
    return new_pos,new_neg

In [174]:
def myTokenize(line, stop=True, negation=True):
    line = line.split(" ")
    stops = stopwords.words('english') + list(string.punctuation)
    if Stop:
        line = [word for word in line if word not in set(stops)]
    if Alpha:
        line = [word for word in line if word.isalpha()]
    if Negation:
        line = nltk.sentiment.util.mark_negation(line)
    return line

In [175]:
def getSortedKeyList(bag1,bag2):
    keyList = [*bag1] + list(set([*bag2]) - set([*bag1]))
    keyList = sorted(keyList)
    return keyList

In [176]:
def skPreprocessing(dataset):
    trainPos,trainNeg,testPos,testNeg = getPosNegReviews(dataset)
    pWbag = bagOfWordsSK(trainPos)
    nWbag = bagOfWordsSK(trainNeg)
    pos_keys, neg_keys = trimBags(pWbag, nWbag)
    keyList = getSortedKeyList(pWbag,nWbag)
    cv = CountVectorizer(input='filename', tokenizer=myTokenize, lowercase=True, vocabulary=keyList)
    trainFileNames = trainPos + trainNeg
    testFileNames = testPos + testNeg
    trainSet = cv.fit_transform(trainFileNames)
    testSet = cv.fit_transform(testFileNames)
    return trainSet, testSet

In [177]:
def logisticRegression(trainSet, testSet):
    LRclassifier = LogisticRegression()
    labels = [1]*800+[0]*800
    LRclassifier.fit(trainSet,labels)
    predictions = LRclassifier.predict(testSet)
    return predictions

In [178]:
def supportVectorMachine(trainSet, testSet):
    SVMclassifier = LinearSVC()
    labels = [1]*800+[0]*800
    SVMclassifier.fit(trainSet,labels)
    predictions = SVMclassifier.predict(testSet)
    return predictions

In [179]:
def decisionTree(trainSet, testSet):
    SVMclassifier = DecisionTreeClassifier()
    labels = [1]*800+[0]*800
    SVMclassifier.fit(trainSet,labels)
    predictions = SVMclassifier.predict(testSet)
    return predictions

In [180]:
def main(dataset, stemming, lowerCase,stop, negation, laplace, posPrior):
    trainSet, trainLabels, revSet, revLabels = loadDatasetNB(dataset)
    trainSetSK, testSetSK = skPreprocessing(dataset)

    predictedLabelsNB = naiveBayes(trainSet, trainLabels, revSet)
    predictedLabelsLR = logisticRegression(trainSetSK, testSetSK)
    predictedLabelsSVM = supportVectorMachine(trainSetSK, testSetSK)
    predictedLabelsDT = decisionTree(trainSetSK,testSetSK)
    
    accuracyNB, f1NB, precisionNB, recallNB = compute_accuraciesNB(predictedLabelsNB, revSet, revLabels)
    accuracyLR, f1LR, precisionLR, recallLR = compute_accuraciesSK(predictedLabelsLR)
    accuracySVM, f1SVM, precisionSVM, recallSVM = compute_accuraciesSK(predictedLabelsSVM)
    accuracyDT, f1DT, precisionDT, recallDT = compute_accuraciesSK(predictedLabelsDT)
    
    NBscores = accuracyNB, f1NB, precisionNB, recallNB
    LRscores = accuracyLR, f1LR, precisionLR, recallLR
    SVMscores = accuracySVM, f1SVM, precisionSVM, recallSVM
    DTscores = accuracyDT, f1DT, precisionDT, recallDT
#     print("Accuracy:",accuracy)
#     print("F1-Score:",f1)
#     print("Precision:",precision)
#     print("Recall:",recall)
    return NBscores, LRscores, SVMscores, DTscores

In [181]:
if __name__ == "__main__":
    dataset = "../TermProject/txt_sentoken"
    dataset = "../TermProject/txt_sentoken"
    Stemming = False
    Lowercase = True
    Stop = True
    Alpha = True
    Negation = True
    Unique = True
    Smoothing = True
    Prior = True
    laplace = 0.034
    posPrior = 0.8
    
    ##Naive bayes
    accuracyNB = []
    f1NB = []
    precisionNB = []
    recallNB = []
    
    #Logistic regression
    accuracyLR = []
    f1LR = []
    precisionLR = []
    recallLR = []
    
    #Support Vector Machine
    accuracySVM = []
    f1SVM = []
    precisionSVM = []
    recallSVM = []
    
    #Decision Tree
    accuracyDT = []
    f1DT = []
    precisionDT = []
    recallDT = []
    
    numberOfRuntimes = 3
    for i in range(numberOfRuntimes):
        NBscores, LRscores, SVMscores, DTscores = main(dataset, stemming, lowerCase,stop, negation, laplace, posPrior)
        accuracyNB.append(NBscores[0])
        f1NB.append(NBscores[1])
        precisionNB.append(NBscores[2])
        recallNB.append(NBscores[3])
        
        accuracyLR.append(LRscores[0])
        f1LR.append(LRscores[1])
        precisionLR.append(LRscores[2])
        recallLR.append(LRscores[3])
        
        accuracySVM.append(SVMscores[0])
        f1SVM.append(SVMscores[1])
        precisionSVM.append(SVMscores[2])
        recallSVM.append(SVMscores[3])
        
        accuracyDT.append(DTscores[0])
        f1DT.append(DTscores[1])
        precisionDT.append(DTscores[2])
        recallDT.append(DTscores[3])
        
#         print("RUN NUMBER " + str(i+1) + " ---------------")
#         print("Accuracy:",curaccuracy)
#         print("F1-Score:",curf1)
#         print("Precision:",curprecision)
#         print("Recall:",currecall)

    #RESULTS OF NAIVE BAYES (unigram) 
    aveAccuracy = np.mean(accuracyNB)
    avef1 = np.mean(f1NB)
    avePrecision = np.mean(precisionNB)
    aveRecall = np.mean(recallNB)
    stdAccuracy = np.std(accuracyNB)
    stdf1 = np.std(f1NB)
    stdPrecision = np.std(precisionNB)
    stdRecall = np.std(recallNB)
    print("Final results NAIVE BAYES----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF LOGISTIC REGRESSION
    aveAccuracy = np.mean(accuracyLR)
    avef1 = np.mean(f1LR)
    avePrecision = np.mean(precisionLR)
    aveRecall = np.mean(recallLR)
    stdAccuracy = np.std(accuracyLR)
    stdf1 = np.std(f1LR)
    stdPrecision = np.std(precisionLR)
    stdRecall = np.std(recallLR)
    print("Final results  LOGISTIC REGRESSION----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF SUPPORT VECTOR MACHINE
    aveAccuracy = np.mean(accuracySVM)
    avef1 = np.mean(f1SVM)
    avePrecision = np.mean(precisionSVM)
    aveRecall = np.mean(recallSVM)
    stdAccuracy = np.std(accuracySVM)
    stdf1 = np.std(f1SVM)
    stdPrecision = np.std(precisionSVM)
    stdRecall = np.std(recallSVM)
    print("Final results SUPPORT VECTOR MACHINE----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF DECISION TREE
    aveAccuracy = np.mean(accuracyDT)
    avef1 = np.mean(f1DT)
    avePrecision = np.mean(precisionDT)
    aveRecall = np.mean(recallDT)
    stdAccuracy = np.std(accuracyDT)
    stdf1 = np.std(f1DT)
    stdPrecision = np.std(precisionDT)
    stdRecall = np.std(recallDT)
    print("Final results DECISION TREE----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2881.26it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3213.78it/s]


prefilter
26026
24339
first filter
20379
13825
max_length
13825
sorted max filter
13825
13825


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2506.52it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3029.22it/s]


prefilter
26180
24284
first filter
20644
13654
max_length
13654
sorted max filter
13654
13654


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2645.58it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2156.17it/s]


prefilter
25822
24492
first filter
20279
13826
max_length
13826
sorted max filter
13826
13826




Final results NAIVE BAYES----------------------------------
Average Accuracy: 0.7616666666666667
Average F1: 0.7625950638581905
Average Precision: 0.7593664457699546
Average recall 0.7666666666666666
STD Accuracy: 0.02044640691064217
STD F1: 0.022316368582995467
STD Precision: 0.019887520316892126
STD Recall: 0.03472111109333279
Final results  LOGISTIC REGRESSION----------------------------------
Average Accuracy: 0.7341666666666667
Average F1: 0.735320295023977
Average Precision: 0.7325496405986812
Average recall 0.7383333333333333
STD Accuracy: 0.011606990230986759
STD F1: 0.009913399280773644
STD Precision: 0.015947254089400356
STD Recall: 0.010274023338281637
Final results SUPPORT VECTOR MACHINE----------------------------------
Average Accuracy: 0.6616666666666666
Average F1: 0.6650307973323138
Average Precision: 0.6588363570591612
Average recall 0.6716666666666667
STD Accuracy: 0.007168604389202217
STD F1: 0.0035884540390672404
STD Precision: 0.011393418836333226
STD Recall: 0.01