In [66]:
import sys
import time
import math
import copy
import random
import string
import glob
import numpy as np
import nltk.sentiment.util
from tqdm import tqdm
from os import listdir
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


# nltk.download('stopwords')

In [67]:
def loadDataset(directory, stemming, lower_case, stop, negation):
    positive, negative = loadReviewNames(directory + '/pos/*',directory + '/neg/*')
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    combinedTrain = trainPos + trainNeg
    length = len(trainPos) + len(trainNeg)
    labelsTrain = boundaryTrain * [1] + boundaryTrain * [0]
    labelsTrain = np.array(labelsTrain)

    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    combinedTest = testPos + testNeg
    labelsTest = boundaryTrain * [1] + boundaryTrain * [0]
    labelsTest = np.array(labelsTest)
    return combinedTrain, labelsTrain, combinedTest, labelsTest, boundaryTrain

In [68]:
def loadReviewNames(posDir, negDir):
    positive = glob.glob(posDir)
    negative = glob.glob(negDir)
    print(positive)
    print(negative)
    return positive, negative

In [69]:
def loadDir(name,stemming,lower_case, stop, negation):
    # Loads the files in the folder and returns a list of lists of words from the text in each file
    if stemming:
        porter_stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    data = []
    count = 0
    for f in tqdm(listdir(name)):
        fullname = name+f
        text = []
        with open(fullname, 'rb') as f:
            for line in f:
                if lower_case:
                    line = line.decode(errors='ignore').lower()
                    text += tokenizer.tokenize(line)
                else:
                    text += tokenizer.tokenize(line.decode(errors='ignore'))
        if stemming:
            for i in range(len(text)):
                text[i] = porter_stemmer.stem(text[i])
        if stop:
            stops = stopwords.words('english') + list(string.punctuation)
            text = [word for word in text if word not in set(stops)]
#         if negation:
#             text = nltk.sentiment.util.mark_negation(line)
        data.append(text)
        count = count + 1
    return data

In [70]:
def generate_svm_featureset(neg_bow,pos_bow):
    review_word_index = []
    for word in neg_bow.keys():
        review_word_index.append(word)
    for word in pos_bow.keys():
        review_word_index.append(word)
    review_word_index = sorted(list(set(review_word_index)))
    return review_word_index

In [71]:
def bagOfWords(train_set, train_labels):
    stops = stopwords.words('english') + list(string.punctuation)
    count = 0
    mydictPos = {}
    mydictNeg = {}
    posV = 0
    negV = 0
    totalposwords = 0
    totalnegwords = 0
    for x in train_set:
        rating = train_labels[count]
        count += 1
        if(rating):
            for w in x:
                if (w not in mydictPos) and (w not in mydictNeg) and (w not in stops):
                    mydictPos[w] = 1
                    mydictNeg[w] = 0
                    posV += 1
                    totalposwords += 1
                elif w not in mydictPos and w not in stops:
                    mydictPos[w] = 1
                    posV += 1
                    totalposwords += 1
                elif w not in stops:
                    if mydictPos[w] == 0:
                        posV += 1
                    mydictPos[w] += 1
                    totalposwords += 1
        else:
            for w in x:
                if (w not in mydictPos) and (w not in mydictNeg) and (w not in stops):
                    mydictPos[w] = 0
                    mydictNeg[w] = 1
                    posV += 1
                    totalposwords += 1
                elif w not in mydictNeg and w not in stops:
                    mydictNeg[w] = 1
                    negV += 1
                    totalnegwords += 1
                elif w not in stops:
                    if mydictNeg[w] == 0:
                        negV += 1
                    mydictNeg[w] += 1
                    totalnegwords += 1
#     print("review count is: ", count)
#     print("posV", posV)
#     print("negV", negV)
#     print("total word count is:", totalposwords + totalnegwords)
    BOW = mydictPos,mydictNeg, posV, negV, totalposwords, totalnegwords
    return BOW

In [72]:
def naiveBayes(train_set, train_labels, dev_set, smoothing_parameter, pos_prior):
    #Baseline#
    # return predicted labels of development set
    # print("not even started yet")
    smoothing_parameter = 0.034

#     start = time.process_time()

#     print("Going through train set took: ", time.process_time() - start)
    mydictPos,mydictNeg, posV, negV, totalposwords, totalnegwords = bagOfWords(train_set, train_labels)
    
    
    #come up with the bag of words unigram model
    probWordPos = {}
    probWordNeg = {}

    #mydictNeg and mydictPos contain the same words as per bagofwords method
    #therefore, I use mydictPos as an easy way to iterate through all words
    for x in mydictPos:
        #use laplace smoothing
        # count(W) + a / n + a * (V+1)
        # n = number of words in our training data
        # count(W) = number of times W appeared in training data
        # α is a tuning constant between 0 and 1 (typically small)
        # V = number of word TYPES seen in training data

        probWordPos[x] = math.log((mydictPos[x] + smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
        probWordNeg[x] = math.log((mydictNeg[x] + smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))

    start = time.process_time()

    # dev set
    predictions = []
    for x in range(len(dev_set)):
        chancePos = math.log(pos_prior)
        chanceNeg = math.log(1-pos_prior)
        for y in range(len(dev_set[x])):
            if dev_set[x][y] in mydictPos:
                chancePos += probWordPos[dev_set[x][y]]
                chanceNeg += probWordNeg[dev_set[x][y]]
            # else:
                # chancePos += math.log((smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
                # chanceNeg += math.log((smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        if(chancePos > chanceNeg):
            predictions.append(1)
        else:
            predictions.append(0)
#     print("devset time took:", time.process_time() - start)
    return predictions

In [73]:
def logisticRegression(train_set, train_labels, dev_set):
    stops = stopwords.words('english') + list(string.punctuation)
    mydictPos,mydictNeg, posV, negV, totalposwords, totalnegwords = bagOfWords(train_set, train_labels)
    vocab = generate_svm_featureset(mydictPos,mydictNeg)
    #consider using strip_accents
    cv = CountVectorizer(input='filename', tokenizer=myTokenize, lowercase=True)
    LRclassifier = LogisticRegression()
    for x in range(len(train_set)//100):
        print(train_set[x])
        time.sleep(0.1)
    print("hello")
#     for i in range(len(train_set)):
#         print(len(train_set[i]))
#     trainSet = cv.fit_transform(train_set)
#     for i in range(len(train_labels)):
#         labelSet += (fillTrainset(train_set[i],train_labels[i]))
#         trainSet += train_set[i]
#     print(labelSet)
#     print(train_set)
    print("train_set")
    print(print(type(train_set)))
#     print(train_set[0])
    labels = [0]*750+[1]*750
    LRclassifier.fit(train_set,train_labels)
    predictions = LRclassifier.predict(dev_set)
    
    return predictions

In [74]:
def supportVectorMachine(train_set, train_labels, dev_set):
    SVMclassifier = LinearSVC()
    SVMclassifier.fit(train_set,train_labels)
    predictions = SVMclassifier.predict(dev_set)
    return predictions

In [75]:
def decisionTree(train_set, train_labels, dev_set):
    DTclassifier = DecisionTreeClassifier()
    DTclassifier.fit(train_set,train_labels)
    predictions = DTclassifier.predict(dev_set)
    return predictions

In [76]:
def compute_accuracies(predictedLabels, dev_set, dev_labels):
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [77]:
def splitData(train_set, train_labels, dev_set, dev_labels):
    labelSet = []
    trainSet = []
    dataSet = []
    dataLabels = []
    for i in range(len(train_labels)):
        labelSet += (fillTrainset(train_set[i],train_labels[i]))
        trainSet += train_set[i]
    dataSet = train_set + dev_set
    dataLabels = train_labels + dev_labels
    trainSet, trainLabels, revSet, revLabels = train_test_split(dataSet, dataLabels, test_size=0.2)
    return trainSet, trainLabels, revSet, revLabels

In [78]:
def myTokenize(line, stop=True, negation=True):
    print("new line")
    print(line)
    print(" ")
    print(" ")
    print(" ")
    print(" ")
    line = line.split(" ")
    stops = stopwords.words('english') + list(string.punctuation)
    if stop:
        line = [word for word in line if word not in set(stops)]
#     if negation:
#         line = nltk.sentiment.util.mark_negation(line)
    return line

In [79]:
def fillTrainset(review, label):
    labelSet = []
    if label == 0:
        labelSet = [0]*len(review)
    if label == 1:
        labelSet = [1]*len(review)
    return labelSet

In [80]:
def main(dataset, stemming, lowerCase,stop, negation, laplace, posPrior):
    trainSet, trainLabels, revSet, revLabels, boundary = loadDataset(dataset, stemming, lowerCase,stop,negation)
    mydictPos,mydictNeg, posV, negV, totalposwords, totalnegwords = bagOfWords(trainSet, trainLabels)
#     print(mydict)
    print(len(trainSet))
    print(len(trainLabels))
#     print(trainSet[0])
#     print(trainSet[1])
#     print(trainLabels[0])
#     print(trainLabels[1])
#     trainSet, trainLabels, revSet, revLabels = splitData(trainSet, trainLabels, revSet, revLabels)

    predictedLabelsNB = naiveBayes(trainSet, trainLabels, revSet, laplace, posPrior)
    predictedLabelsLR = logisticRegression(trainSet, trainLabels, revSet)
    predictedLabelsSVM = supportVectorMachine(trainSet, trainLabels, revSet)
    predictedLabelsDT = decisionTree(trainSet, trainLabels, revSet)
    
    accuracyNB, f1NB, precisionNB, recallNB = compute_accuracies(predictedLabelsNB, revSet, revLabels)
    accuracyLR, f1LR, precisionLR, recallLR = compute_accuracies(predictedLabelsLR, revSet, revLabels)
    accuracySVM, f1SVM, precisionSVM, recallSVM = compute_accuracies(predictedLabelsSVM, revSet, revLabels)
    accuracyDT, f1DT, precisionDT, recallDT = compute_accuracies(predictedLabelsDT, revSet, revLabels)
    
    NBscores = accuracyNB, f1NB, precisionNB, recallNB
    LRscores = accuracyLR, f1LR, precisionLR, recallLR
    SVMscores = accuracySVM, f1SVM, precisionSVM, recallSVM
    DTscores = accuracyDT, f1DT, precisionDT, recallDT
#     print("Accuracy:",accuracy)
#     print("F1-Score:",f1)
#     print("Precision:",precision)
#     print("Recall:",recall)
    return NBscores, LRscores, SVMscores, DTscores

In [81]:
if __name__ == "__main__":
    dataset = "../TermProject/txt_sentoken"
    stemming = False
    lowerCase = True
    stop = True
    negation = True
    laplace = 1.0
    posPrior = 0.8
    
    ##Naive bayes
    accuracyNB = []
    f1NB = []
    precisionNB = []
    recallNB = []
    
    #Logistic regression
    accuracyLR = []
    f1LR = []
    precisionLR = []
    recallLR = []
    
    #Support Vector Machine
    accuracySVM = []
    f1SVM = []
    precisionSVM = []
    recallSVM = []
    
    #Decision Tree
    accuracyDT = []
    f1DT = []
    precisionDT = []
    recallDT = []
    
    numberOfRuntimes = 5
    for i in range(numberOfRuntimes):
        NBscores, LRscores, SVMscores = main(dataset, stemming, lowerCase,stop, negation, laplace, posPrior)
        accuracyNB.append(NBscores[0])
        f1NB.append(NBscores[1])
        precisionNB.append(NBscores[2])
        recallNB.append(NBscores[3])
        
        accuracyLR.append(LRscores[0])
        f1LR.append(LRscores[1])
        precisionLR.append(LRscores[2])
        recallLR.append(LRscores[3])
        
        accuracySVM.append(SVMscores[0])
        f1SVM.append(SVMscores[1])
        precisionSVM.append(SVMscores[2])
        recallSVM.append(SVMscores[3])
        
        accuracyDT.append(DTscores[0])
        f1DT.append(DTscores[1])
        precisionDT.append(DTscores[2])
        recallDT.append(DTscores[3])
        
#         print("RUN NUMBER " + str(i+1) + " ---------------")
#         print("Accuracy:",curaccuracy)
#         print("F1-Score:",curf1)
#         print("Precision:",curprecision)
#         print("Recall:",currecall)

    #RESULTS OF NAIVE BAYES (unigram) 
    aveAccuracy = np.mean(accuracyNB)
    avef1 = np.mean(f1NB)
    avePrecision = np.mean(precisionNB)
    aveRecall = np.mean(recallNB)
    stdAccuracy = np.std(accuracyNB)
    stdf1 = np.std(f1NB)
    stdPrecision = np.std(precisionNB)
    stdRecall = np.std(recallNB)
    print("Final results NAIVE BAYES----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF LOGISTIC REGRESSION
    aveAccuracy = np.mean(accuracyLR)
    avef1 = np.mean(f1LR)
    avePrecision = np.mean(precisionLR)
    aveRecall = np.mean(recallLR)
    stdAccuracy = np.std(accuracyLR)
    stdf1 = np.std(f1LR)
    stdPrecision = np.std(precisionLR)
    stdRecall = np.std(recallLR)
    print("Final results  LOGISTIC REGRESSION----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF SUPPORT VECTOR MACHINE
    aveAccuracy = np.mean(accuracySVM)
    avef1 = np.mean(f1SVM)
    avePrecision = np.mean(precisionSVM)
    aveRecall = np.mean(recallSVM)
    stdAccuracy = np.std(accuracySVM)
    stdf1 = np.std(f1SVM)
    stdPrecision = np.std(precisionSVM)
    stdRecall = np.std(recallSVM)
    print("Final results SUPPORT VECTOR MACHINE----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF DECISION TREE
    aveAccuracy = np.mean(accuracyDT)
    avef1 = np.mean(f1DT)
    avePrecision = np.mean(precisionDT)
    aveRecall = np.mean(recallDT)
    stdAccuracy = np.std(accuracyDT)
    stdf1 = np.std(f1DT)
    stdPrecision = np.std(precisionDT)
    stdRecall = np.std(recallDT)
    print("Final results DECISION TREE----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)

['../TermProject/txt_sentoken/pos\\cv000_29590.txt', '../TermProject/txt_sentoken/pos\\cv001_18431.txt', '../TermProject/txt_sentoken/pos\\cv002_15918.txt', '../TermProject/txt_sentoken/pos\\cv003_11664.txt', '../TermProject/txt_sentoken/pos\\cv004_11636.txt', '../TermProject/txt_sentoken/pos\\cv005_29443.txt', '../TermProject/txt_sentoken/pos\\cv006_15448.txt', '../TermProject/txt_sentoken/pos\\cv007_4968.txt', '../TermProject/txt_sentoken/pos\\cv008_29435.txt', '../TermProject/txt_sentoken/pos\\cv009_29592.txt', '../TermProject/txt_sentoken/pos\\cv010_29198.txt', '../TermProject/txt_sentoken/pos\\cv011_12166.txt', '../TermProject/txt_sentoken/pos\\cv012_29576.txt', '../TermProject/txt_sentoken/pos\\cv013_10159.txt', '../TermProject/txt_sentoken/pos\\cv014_13924.txt', '../TermProject/txt_sentoken/pos\\cv015_29439.txt', '../TermProject/txt_sentoken/pos\\cv016_4659.txt', '../TermProject/txt_sentoken/pos\\cv017_22464.txt', '../TermProject/txt_sentoken/pos\\cv018_20137.txt', '../TermProje




['../TermProject/txt_sentoken/neg\\cv000_29416.txt', '../TermProject/txt_sentoken/neg\\cv001_19502.txt', '../TermProject/txt_sentoken/neg\\cv002_17424.txt', '../TermProject/txt_sentoken/neg\\cv003_12683.txt', '../TermProject/txt_sentoken/neg\\cv004_12641.txt', '../TermProject/txt_sentoken/neg\\cv005_29357.txt', '../TermProject/txt_sentoken/neg\\cv006_17022.txt', '../TermProject/txt_sentoken/neg\\cv007_4992.txt', '../TermProject/txt_sentoken/neg\\cv008_29326.txt', '../TermProject/txt_sentoken/neg\\cv009_29417.txt', '../TermProject/txt_sentoken/neg\\cv010_29063.txt', '../TermProject/txt_sentoken/neg\\cv011_13044.txt', '../TermProject/txt_sentoken/neg\\cv012_29411.txt', '../TermProject/txt_sentoken/neg\\cv013_10494.txt', '../TermProject/txt_sentoken/neg\\cv014_15600.txt', '../TermProject/txt_sentoken/neg\\cv015_29356.txt', '../TermProject/txt_sentoken/neg\\cv016_4348.txt', '../TermProject/txt_sentoken/neg\\cv017_23487.txt', '../TermProject/txt_sentoken/neg\\cv018_21672.txt', '../TermProje

1600
1600
../TermProject/txt_sentoken/pos\cv748_12786.txt
../TermProject/txt_sentoken/pos\cv664_4389.txt
../TermProject/txt_sentoken/pos\cv768_11751.txt
../TermProject/txt_sentoken/pos\cv260_13959.txt
../TermProject/txt_sentoken/pos\cv686_13900.txt
../TermProject/txt_sentoken/pos\cv245_8569.txt
../TermProject/txt_sentoken/pos\cv508_16006.txt
../TermProject/txt_sentoken/pos\cv535_19728.txt
../TermProject/txt_sentoken/pos\cv425_8250.txt
../TermProject/txt_sentoken/pos\cv156_10481.txt
../TermProject/txt_sentoken/pos\cv800_12368.txt
../TermProject/txt_sentoken/pos\cv706_24716.txt
../TermProject/txt_sentoken/pos\cv157_29372.txt
../TermProject/txt_sentoken/pos\cv460_10842.txt
../TermProject/txt_sentoken/pos\cv976_10267.txt
../TermProject/txt_sentoken/pos\cv500_10251.txt
hello
train_set
<class 'list'>
None




ValueError: could not convert string to float: '../TermProject/txt_sentoken/pos\\cv748_12786.txt'