In [1]:
import sys
import time
import math
import copy
import random
import string
import glob
import re
import numpy as np
import nltk.sentiment.util
from tqdm import tqdm
from os import listdir
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.metrics import *



# nltk.download('stopwords')

In [2]:
def loadReviewNames(posDir, negDir):
    positive = glob.glob(posDir)
    negative = glob.glob(negDir)
#     print(positive)
#     print(negative)
    return positive, negative

In [3]:
#loads dataset in a way that will work with scikit
def getPosNegReviews(directory):
    positive, negative = loadReviewNames(directory + '/pos/*',directory + '/neg/*')
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    
    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    return trainPos,trainNeg,testPos,testNeg

In [4]:
def loadDir(name):
    # Loads the files in the folder and returns a list of lists of words from the text in each file
    tokenizer = RegexpTokenizer(r'\w+')
    data = []
    count = 0
    for f in tqdm(listdir(name)):
        fullname = name+f
        text = []
        with open(fullname, 'r') as f:
            for line in f:
                text += myTokenize(line)
        data.append(text)
        count = count + 1
    return data

In [5]:
#loads dataset in a way that will work with my unigram Naive bayes implementation
def loadDatasetNB(directory):
    positive = loadDir(directory + '/pos/')
    negative = loadDir(directory + '/neg/')
#     print("tokenize in load")
#     print(positive)
#     for review in positive:
#         review = myTokenize(review)
#     for review in negative:
#         review = myTokenize(review)
        
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    combinedTrain = trainPos + trainNeg
    length = len(trainPos) + len(trainNeg)
    labelsTrain = len(trainNeg) * [1] + len(trainNeg) * [0]
    labelsTrain = np.array(labelsTrain)

    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    combinedTest = testPos + testNeg
    labelsTest = len(testPos) * [1] + len(testNeg) * [0]
    labelsTest = np.array(labelsTest)
    return combinedTrain, labelsTrain, combinedTest, labelsTest

In [6]:
#helper function for unigram Naive bayes implementation
def bagOfWordsNB(train_set, train_labels):
    mydict = {}
    posV = 0
    negV = 0
    totalposwords = 0
    totalnegwords = 0

#   start = time.process_time()

    #create bag of words and number of occurences
    count = 0
    for x in train_set:
        rating = train_labels[count]
        count += 1
        if(rating):
            for y in x:
                if y not in mydict:
                    mydict[y] = [1,0] #default [1 pos, 0 neg]
                    posV += 1
                    totalposwords += 1
                else:
                    if mydict[y][0] == 0:
                        posV += 1
                    mydict[y][0] += 1
                    totalposwords += 1
        else:
            for y in x:
                if y not in mydict:
                    mydict[y] = [0,1] #default [0 pos, 1 neg]
                    negV += 1
                    totalnegwords += 1
                else:
                    if mydict[y][1] == 0:
                        negV += 1
                    mydict[y][1] += 1
                    totalnegwords += 1
#     print("review count is: ", count)
#     print("posV", posV)
#     print("negV", negV)
#     print("total word count is:", totalposwords + totalnegwords)
    BOW = mydict, posV, negV, totalposwords, totalnegwords
#     print(mydict)
    return BOW

In [7]:
def naiveBayes(train_set, train_labels, dev_set):
    #Baseline#
    # return predicted labels of development set
    # print("not even started yet")
    smoothing_parameter = laplace
    
#     start = time.process_time()

#     print("Going through train set took: ", time.process_time() - start)
    mydict, posV, negV, totalposwords, totalnegwords = bagOfWordsNB(train_set, train_labels)
#     print("mydict")
#     print(mydict)
#     print("posV")
#     print(posV)
#     print("negV")
#     print(negV)
#     print("totalposwords")
#     print(totalposwords)
#     print("totalnegwords")
#     print(totalnegwords)
    
    
    #come up with the bag of words unigram model
    probWordPos = {}
    probWordNeg = {}

    for x in mydict:
        #use laplace smoothing
        # count(W) + a / n + a * (V+1)
        # n = number of words in our training data
        # count(W) = number of times W appeared in training data
        # α is a tuning constant between 0 and 1 (typically small)
        # V = number of word TYPES seen in training data
        probWordPos[x] = math.log((mydict[x][0] + smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
        probWordNeg[x] = math.log((mydict[x][1] + smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
#     start = time.process_time()

    # dev set
    predictions = []
    for x in range(len(dev_set)):
        chancePos = 0
        chanceNeg = 0 
        if(Prior):
            chancePos += math.log(posPrior)
            chanceNeg += math.log(1-posPrior)
        for y in range(len(dev_set[x])):
            if dev_set[x][y] in mydict:
                chancePos += probWordPos[dev_set[x][y]]
                chanceNeg += probWordNeg[dev_set[x][y]]
            # else:
                # chancePos += math.log((smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
                # chanceNeg += math.log((smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        if(chancePos > chanceNeg):
            predictions.append(1)
        else:
            predictions.append(0)
#     print("devset time took:", time.process_time() - start)
    return predictions

In [8]:
def compute_accuraciesNB(predictedLabels, dev_set, dev_labels):
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [9]:
def compute_accuraciesSK(predictedLabels):
    dev_labels = [1]*200+[0]*200
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [10]:
#### scikit pre processing methods ####

In [11]:
def bagOfWordsSK(reviews):
    wbag = {}
#     stops = stopwords.words('english') + list(string.punctuation)    -- global variable for speed
    for review in reviews:
        with open(review, 'r') as f:
            for line in f:
                text = myTokenize(line)
                for word in text:
                    wbag[word] = wbag.get(word, 0) + 1
    return wbag

In [12]:
def trimBags(pWbag, nWbag):
    posList = {}
    negList = {}
#     print("prefilter")
#     print(len(pWbag))
#     print(len(nWbag))
    #force words to only exist in either positive or negative bags
    if(Unique):
        for key in nWbag.keys():
            if key in pWbag.keys():
                posFreq = pWbag[key]
                negFreq = nWbag[key]
                if int(posFreq) >= int(negFreq):
                    posList[key] = pWbag[key]
                elif int(posFreq) < int(negFreq):
                    negList[key] = nWbag[key]
            else:
                negList[key] = nWbag[key]
        for key in pWbag.keys():
            if key not in nWbag.keys():
                posList[key] = pWbag[key]
    else:
        posList = pWbag
        negList = nWbag
#     print("first filter")
#     print(len(posList))
#     print(len(negList))
    #determine the boundary size
    max_length=min(len(negList),len(posList))
#     print("max_length")
#     print(max_length)
    #sort the keys of the map into a list, sorting to get most frequently used words
    sortedNegList = sorted(negList, key=negList.get, reverse=True)[:max_length]
    sortedPosList = sorted(posList, key=posList.get, reverse=True)[:max_length]
#     print("sorted max filter")
#     print(len(sortedNegList))
#     print(len(sortedPosList))
    #turn the sorted list into a usable mapping again
    new_neg= {k:negList[k] for k in sortedNegList}
    new_pos = {k:posList[k] for k in sortedPosList}
#     print("k filter")
#     print(len(new_pos))
#     print(len(new_neg))
    return new_pos,new_neg

In [13]:
def myTokenize(line):
    line = line.strip()
    line = line.split(" ")
#     res = [] 
#     for sub in line:
#         res.append(re.sub('\n', '', sub))
#     line = res
#     stops = stopwords.words('english') + list(string.punctuation)    -- global variable for speed
    if Alpha:
        line = [word for word in line if word.isalpha()]
#         print("Alpha")
#         print(line)
    if Stop:
        line = [word for word in line if word not in list(set(stops) - set(['not']))]
#         print("Stop")
#         print(line)
    if Stemming:
        porter_stemmer = PorterStemmer()
        for i in range(len(line)):
            line[i] = porter_stemmer.stem(line[i])
    if POS:
        tagSet = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'RP']
        tagged = nltk.pos_tag([i for i in line if i])
#         print("tagged")
#         print(tagged)
#         print("senetence before")
#         print(line)
        line = [w for w, tag, in zip(line,tagged) if ("JJ" in tag) or ("JJR" in tag) or ("JJS" in tag) or ("RB" in tag) or ("RBR" in tag) or ("RBS" in tag) or ("RP" in tag)] #adjective or adverd
#         print("senetence after")
#         print(line)
    if Negation:
        line = nltk.sentiment.util.mark_negation(line)
#         print("Negation")
#         print(line)
    return line

In [14]:
def getSortedKeyList(bag1,bag2):
    keyList = [*bag1] + list(set([*bag2]) - set([*bag1]))
    keyList = sorted(keyList)
    return keyList

In [15]:
def skPreprocessing(dataset):
    trainPos,trainNeg,testPos,testNeg = getPosNegReviews(dataset)
    pWbag = bagOfWordsSK(trainPos)
    nWbag = bagOfWordsSK(trainNeg)
    pos_keys, neg_keys = trimBags(pWbag, nWbag)
    keyList = getSortedKeyList(pWbag,nWbag)
    cv = CountVectorizer(input='filename', tokenizer=myTokenize, lowercase=True, vocabulary=keyList)
#     print(keyList)
    trainFileNames = trainPos + trainNeg
    testFileNames = testPos + testNeg
    trainSet = cv.fit_transform(trainFileNames)
    testSet = cv.fit_transform(testFileNames)
    return trainSet, testSet

In [16]:
def logisticRegression(trainSet, testSet):
    LRclassifier = LogisticRegression()
    labels = [1]*800+[0]*800
    LRclassifier.fit(trainSet,labels)
    predictions = LRclassifier.predict(testSet)
    return predictions

In [17]:
def supportVectorMachine(trainSet, testSet):
    SVMclassifier = LinearSVC()
    labels = [1]*800+[0]*800
    SVMclassifier.fit(trainSet,labels)
    predictions = SVMclassifier.predict(testSet)
    return predictions

In [18]:
def decisionTree(trainSet, testSet):
    SVMclassifier = DecisionTreeClassifier()
    labels = [1]*800+[0]*800
    SVMclassifier.fit(trainSet,labels)
    predictions = SVMclassifier.predict(testSet)
    return predictions

In [19]:
def main(dataset):
    trainSet, trainLabels, revSet, revLabels = loadDatasetNB(dataset)
    trainSetSK, testSetSK = skPreprocessing(dataset)
    

    predictedLabelsNB = naiveBayes(trainSet, trainLabels, revSet)
    predictedLabelsLR = logisticRegression(trainSetSK, testSetSK)
    predictedLabelsSVM = supportVectorMachine(trainSetSK, testSetSK)
    predictedLabelsDT = decisionTree(trainSetSK,testSetSK)
    
    accuracyNB, f1NB, precisionNB, recallNB = compute_accuraciesNB(predictedLabelsNB, revSet, revLabels)
    accuracyLR, f1LR, precisionLR, recallLR = compute_accuraciesSK(predictedLabelsLR)
    accuracySVM, f1SVM, precisionSVM, recallSVM = compute_accuraciesSK(predictedLabelsSVM)
    accuracyDT, f1DT, precisionDT, recallDT = compute_accuraciesSK(predictedLabelsDT)
    
    NBscores = accuracyNB, f1NB, precisionNB, recallNB
#     NBscores = [0,0,0,0]
    LRscores = accuracyLR, f1LR, precisionLR, recallLR
    SVMscores = accuracySVM, f1SVM, precisionSVM, recallSVM
    DTscores = accuracyDT, f1DT, precisionDT, recallDT
#     print("Accuracy:",accuracy)
#     print("F1-Score:",f1)
#     print("Precision:",precision)
#     print("Recall:",recall)
    return NBscores, LRscores, SVMscores, DTscores

In [20]:
if __name__ == "__main__":
    #base trial globals
    dataset = "../TermProject/txt_sentoken"
    stops = stopwords.words('english') + list(string.punctuation)
    Alpha = False
    Stop = False
    Stemming = False
    POS = False
    Negation = False
    Unique = False
    Prior = False
    laplace = 0.034
    posPrior = 0.8
    
    ##Naive bayes
    accuracyNB = []
    f1NB = []
    precisionNB = []
    recallNB = []
    
    #Logistic regression
    accuracyLR = []
    f1LR = []
    precisionLR = []
    recallLR = []
    
    #Support Vector Machine
    accuracySVM = []
    f1SVM = []
    precisionSVM = []
    recallSVM = []
    
    #Decision Tree
    accuracyDT = []
    f1DT = []
    precisionDT = []
    recallDT = []
    
    numberOfRuntimes = 1
#     with concurrent.futures.ProcessPoolExecutor() as executor:
    for i in range(numberOfRuntimes):
        NBscores, LRscores, SVMscores, DTscores = main(dataset)
#         NBscores = main(dataset)
        accuracyNB.append(NBscores[0])
        f1NB.append(NBscores[1])
        precisionNB.append(NBscores[2])
        recallNB.append(NBscores[3])

        accuracyLR.append(LRscores[0])
        f1LR.append(LRscores[1])
        precisionLR.append(LRscores[2])
        recallLR.append(LRscores[3])

        accuracySVM.append(SVMscores[0])
        f1SVM.append(SVMscores[1])
        precisionSVM.append(SVMscores[2])
        recallSVM.append(SVMscores[3])

        accuracyDT.append(DTscores[0])
        f1DT.append(DTscores[1])
        precisionDT.append(DTscores[2])
        recallDT.append(DTscores[3])
        

    
    #Printing the Variables of the current trial
#     Alpha = False
#     Stop = False
#     POS = False
#     Negation = False
#     Unique = False
#     Stemming = False
#     Prior = False
#     laplace = 0.034
#     posPrior = 0.8
#     numberOfRuntimes = 10
    print("Trial 1 - Baseline ----------------------------------------")
    print("Alpha---")
    print(Alpha)
    print("Stop---")
    print(Stop)
    print("Stemming-----")
    print(Stemming)
    print("POS-----")
    print(POS)
    print("Negation---")
    print(Negation)

    print("Unique-----")
    print(Unique)
    print("Prior---")
    print(Prior)
    print("number of runtimes:", numberOfRuntimes)
    #RESULTS OF NAIVE BAYES (unigram) 
    aveAccuracy = np.mean(accuracyNB)
    avef1 = np.mean(f1NB)
    avePrecision = np.mean(precisionNB)
    aveRecall = np.mean(recallNB)
    stdAccuracy = np.std(accuracyNB)
    stdf1 = np.std(f1NB)
    stdPrecision = np.std(precisionNB)
    stdRecall = np.std(recallNB)
    
    
    ########   Note:  There are a lot of commented out measurements of performance because I was going to incorporate all of
    ####### them into a single visualization intially. However, visualizing all of these variables was too messy,
    ####### and so instead I opted for simplying comparing average Accuracies.
    ########
    ########  Also note, I was prepared to run all feature combinations on all algorithms, but it was taking way too long, and 
    ######## crashing randomly during long runs. Instead, I opted for reducing the runs to 2 for the late features and also 
    ######## showing a baseline feature addition approach as mentioned in rubric 

    print("Final results NAIVE BAYES----------------------------------")
#     print("Accuracy:",accuracyNB)
#     print("F1-Score:",f1NB)
#     print("Precision:",precisionNB)
#     print("Recall:",recallNB)
    print("Average Accuracy:", aveAccuracy)
#     print("STD Accuracy:", stdAccuracy)
    print("Average F1:", avef1)
#     print("STD F1:", stdf1)
    print("Average Precision:", avePrecision)
#     print("STD Precision:", stdPrecision)
    print("Average recall", aveRecall)
#     print("STD Recall:", stdRecall)
    
#     #RESULTS OF LOGISTIC REGRESSION
    aveAccuracy = np.mean(accuracyLR)
    avef1 = np.mean(f1LR)
#     avePrecision = np.mean(precisionLR)
#     aveRecall = np.mean(recallLR)
#     stdAccuracy = np.std(accuracyLR)
#     stdf1 = np.std(f1LR)
#     stdPrecision = np.std(precisionLR)
#     stdRecall = np.std(recallLR)
    print("Final results  LOGISTIC REGRESSION----------------------------------")
#     print("Accuracy:",accuracyLR)
#     print("F1-Score:",f1LR)
#     print("Precision:",precisionLR)
#     print("Recall:",recallLR)
    print("Average Accuracy:", aveAccuracy)
#     print("STD Accuracy:", stdAccuracy)
    print("Average F1:", avef1)
#     print("STD F1:", stdf1)
#     print("Average Precision:", avePrecision)
#     print("STD Precision:", stdPrecision)
#     print("Average recall", aveRecall)
#     print("STD Recall:", stdRecall)
    
#     #RESULTS OF SUPPORT VECTOR MACHINE
    aveAccuracy = np.mean(accuracySVM)
    avef1 = np.mean(f1SVM)
#     avePrecision = np.mean(precisionSVM)
#     aveRecall = np.mean(recallSVM)
#     stdAccuracy = np.std(accuracySVM)
#     stdf1 = np.std(f1SVM)
#     stdPrecision = np.std(precisionSVM)
#     stdRecall = np.std(recallSVM)
    print("Final results SUPPORT VECTOR MACHINE----------------------------------")
#     print("Accuracy:",accuracySVM)
#     print("F1-Score:",f1SVM)
#     print("Precision:",precisionSVM)
#     print("Recall:",recallSVM)
    print("Average Accuracy:", aveAccuracy)
#     print("STD Accuracy:", stdAccuracy)
    print("Average F1:", avef1)
#     print("STD F1:", stdf1)
#     print("Average Precision:", avePrecision)
#     print("STD Precision:", stdPrecision)
#     print("Average recall", aveRecall)
#     print("STD Recall:", stdRecall)


    

    
#     #RESULTS OF DECISION TREE
    aveAccuracy = np.mean(accuracyDT)
    avef1 = np.mean(f1DT)
#     avePrecision = np.mean(precisionDT)
#     aveRecall = np.mean(recallDT)
#     stdAccuracy = np.std(accuracyDT)
#     stdf1 = np.std(f1DT)
#     stdPrecision = np.std(precisionDT)
#     stdRecall = np.std(recallDT)
    print("Final results DECISION TREE----------------------------------")
#     print("Accuracy:",accuracyDT)
#     print("F1-Score:",f1DT)
#     print("Precision:",precisionDT)
#     print("Recall:",recallDT)
    print("Average Accuracy:", aveAccuracy)
#     print("STD Accuracy:", stdAccuracy)
    print("Average F1:", avef1)
#     print("STD F1:", stdf1)
#     print("Average Precision:", avePrecision)
#     print("STD Precision:", stdPrecision)
#     print("Average recall", aveRecall)
#     print("STD Recall:", stdRecall)

allDone()

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3203.45it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3066.29it/s]


Trial 1 - Baseline ----------------------------------------
Alpha---
False
Stop---
False
Stemming-----
False
POS-----
False
Negation---
False
Unique-----
False
Prior---
False
number of runtimes: 1
Final results NAIVE BAYES----------------------------------
Average Accuracy: 0.735
Average F1: 0.7376237623762375
Average Precision: 0.7303921568627451
Average recall 0.745
Final results  LOGISTIC REGRESSION----------------------------------
Average Accuracy: 0.8275
Average F1: 0.8253164556962025
Final results SUPPORT VECTOR MACHINE----------------------------------
Average Accuracy: 0.8275
Average F1: 0.8261964735516373
Final results DECISION TREE----------------------------------
Average Accuracy: 0.7025
Average F1: 0.7076167076167076


NameError: name 'allDone' is not defined

In [66]:
# Trial 1 - Baseline ----------------------------------------
#     Alpha = False
#     Stop = False
#     POS = False
#     Negation = False
#     Unique = False
#     Stemming = False
#     Prior = False
#     laplace = 0.034
#     posPrior = 0.8
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.79875
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.8355
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.8265
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.63975

In [22]:
# Trial 2 - Baseline + Alpha ----------------------------------------
# Alpha---
# True
# Stop---
# False
# Stemming-----
# False
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.79025
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.8355
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.82575
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.625

In [None]:
# Trial 3 - Baseline + Alpha + Stop ----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# False
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.7862500000000001
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.8404999999999999
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.8342499999999999
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.6275

In [None]:
# Trial 3 - Baseline + Alpha + Stop ----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# False
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.7969999999999999
# Average F1: 0.796478316397207
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.8314999999999999
# Average F1: 0.8320190563443252
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.8245000000000001
# Average F1: 0.8251299491764266
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.63475
# Average F1: 0.6350275636427405

In [None]:
# Trial 4 - Baseline + Alpha + Stop + Stemming ----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# True
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.79325
# Average F1: 0.7903109835488404
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.83825
# Average F1: 0.8377428690324334
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.82175
# Average F1: 0.8192189591136172
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.6322500000000001
# Average F1: 0.6340694553210997

In [43]:
# Trial 5 - Baseline ----------------------------------------
# Alpha---
# False
# Stop---
# False
# Stemming-----
# False
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.8082499999999999
# Average F1: 0.8070739683677216
# Average Precision: 0.8119863442980957
# Average recall 0.8025

In [None]:
# Trial 6 - Baseline + Alpha ----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# False
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.79925
# Average F1: 0.796046707
# Average Precision: 0.809030569
# Average recall 0.784

In [None]:
# Trial 7 - Baseline + Alpha + Stop ----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# False
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.7895
# Average F1: 0.7893473207800615
# Average Precision: 0.7900923986942613
# Average recall 0.7889999999999999

In [None]:
# Trial 8 - Baseline + Alpha + Stop Stemming----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# True
# Negation---
# False
# POS-----
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.784
# Average F1: 0.7826437622790621
# Average Precision: 0.7888889830524213
# Average recall 0.7769999999999999

In [None]:
# Trial 9 - Baseline Alpha Stop Stemming POS----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# True
# POS-----
# True
# Negation---
# False
# Unique-----
# False
# Prior---
# False
# number of runtimes = 10
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.7475
# Average F1: 0.7574037683504797
# Average Precision: 0.7297940797940798
# Average recall 0.7875000000000001
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.7475
# Average F1: 0.7574037683504797
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.7475
# Average F1: 0.7574037683504797
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.7475
# Average F1: 0.7574037683504797

In [69]:
# Trial 10 - Baseline Alpha Stop Stemming POS negation ----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# True
# POS-----
# True
# Negation---
# True
# Unique-----
# False
# Prior---
# False
# number of runtimes: 2
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.74125
# Average F1: 0.739082804
# Average Precision: 0.744076986
# Average recall 0.735
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.76625
# Average F1: 0.763604398414525
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.7262500000000001
# Average F1: 0.7246730888514753
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.6275
# Average F1: 0.6269543464665417

In [70]:
#Forgot to print 11, which was all -Prior

In [None]:
# Trial 12 - Baseline ALL----------------------------------------
# Alpha---
# True
# Stop---
# True
# Stemming-----
# True
# POS-----
# True
# Negation---
# True
# Unique-----
# True
# Prior---
# True
# number of runtimes: 2
# Final results NAIVE BAYES----------------------------------
# Average Accuracy: 0.73
# Average F1: 0.746473285134897
# Average Precision: 0.7035557644110275
# Average recall 0.795
# Final results  LOGISTIC REGRESSION----------------------------------
# Average Accuracy: 0.72625
# Average F1: 0.7301123923514816
# Final results SUPPORT VECTOR MACHINE----------------------------------
# Average Accuracy: 0.71875
# Average F1: 0.7219621313058465
# Final results DECISION TREE----------------------------------
# Average Accuracy: 0.6287499999999999
# Average F1: 0.6319513543394141