In [13]:
import copy
import math
import os
import random
import string
import sys
import time
from os import listdir

import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm



# nltk.download('stopwords')
# bad_words = {'aed','oed','eed'} # these words fail in nltk stemmer algorithm

In [14]:
def loadDir(name, stemming, lower_case):
    """
    Tokenize every file found directly inside the folder `name`.

    name       - Path of the folder to read (with or without a trailing separator)
    stemming   - Truthy to run each token through the Porter stemmer
    lower_case - Truthy to lower-case each line before it is tokenized

    Returns a list with one entry per file; each entry is the list of tokens
    (runs of word characters, regex \\w+) found in that file. Files are visited
    in sorted name order so the result does not depend on the order in which
    the operating system happens to list the directory.
    """
    # Only build the stemmer when it is actually requested.
    stemmer = PorterStemmer() if stemming else None
    tokenizer = RegexpTokenizer(r'\w+')
    documents = []
    for filename in tqdm(sorted(listdir(name))):
        tokens = []
        # Read raw bytes and drop undecodable sequences instead of failing on them.
        with open(os.path.join(name, filename), 'rb') as handle:
            for raw_line in handle:
                line = raw_line.decode(errors='ignore')
                if lower_case:
                    line = line.lower()
                tokens += tokenizer.tokenize(line)
        if stemmer is not None:
            tokens = [stemmer.stem(token) for token in tokens]
        documents.append(tokens)
    return documents

In [15]:
def loadDataset(directory, stemming, lower_case):
    """
    Load the reviews stored under directory/pos/ and directory/neg/, shuffle each
    class, and split them into a training part and a held-out part.

    directory  - Folder containing the 'pos' and 'neg' sub-folders
    stemming   - Forwarded to loadDir
    lower_case - Forwarded to loadDir

    Returns (combinedTrain, labelsTrain, combinedTest, labelsTest). The document
    lists hold the positive reviews first and the negative reviews after them;
    the label arrays are numpy arrays with 1 for positive and 0 for negative,
    aligned element by element with the corresponding document list.
    """
    positive = loadDir(directory + '/pos/', stemming, lower_case)
    negative = loadDir(directory + '/neg/', stemming, lower_case)
    random.shuffle(positive)
    random.shuffle(negative)

    # 80% of the positive reviews go to training. The same index is used to cut
    # the negative list, exactly as before.
    # NOTE(review): for corpora with unequal class sizes a per-class boundary may
    # be what is wanted - confirm before changing.
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos, testPos = positive[:boundaryTrain], positive[boundaryTrain:]
    trainNeg, testNeg = negative[:boundaryTrain], negative[boundaryTrain:]

    combinedTrain = trainPos + trainNeg
    # Each class contributes as many labels as it contributes documents, so the
    # labels stay aligned even when the two training slices differ in length.
    labelsTrain = np.array(len(trainPos) * [1] + len(trainNeg) * [0])

    combinedTest = testPos + testNeg
    labelsTest = np.array(len(testPos) * [1] + len(testNeg) * [0])
    return combinedTrain, labelsTrain, combinedTest, labelsTest

In [16]:

from nltk.corpus import stopwords
import time
import math

# Notebook-level preprocessing switches. Nothing in the visible cells reads them:
# the driver cell passes its own stemming / lowerCase values to main().
STEMMING = True
LOWER_CASE = True

def naiveBayes(train_set, train_labels, dev_set, smoothing_parameter, pos_prior, stop_words=None):
    """
    Unigram (bag of words) naive Bayes classifier with Laplace smoothing.

    train_set - List of list of words corresponding with each movie review
    example: suppose I had two reviews 'like this movie' and 'i fall asleep' in my training set
    Then train_set := [['like','this','movie'], ['i','fall','asleep']]

    train_labels - List of labels corresponding with train_set
    example: Suppose I had two reviews, first one was positive and second one was negative.
    Then train_labels := [1, 0]

    dev_set - List of list of words corresponding with each review that we are testing on
              It follows the same format as train_set

    smoothing_parameter - The Laplace smoothing constant (must be > 0). The value
              passed by the caller is the value that is used.

    pos_prior - Prior probability of the positive class (must be strictly between 0 and 1)

    stop_words - Optional iterable of words to ignore. When omitted, the NLTK English
              stop word list plus the characters of string.punctuation is used.

    Returns a list with one predicted label (1 = positive, 0 = negative) per review in
    dev_set. A review whose two scores are equal is labelled 0.

    Raises ValueError when train_set and train_labels differ in length, when
    smoothing_parameter is not positive, or when pos_prior is not inside (0, 1).
    """
    if len(train_set) != len(train_labels):
        raise ValueError("train_set and train_labels must have the same length")
    if smoothing_parameter <= 0:
        raise ValueError("smoothing_parameter must be greater than 0")
    if not 0 < pos_prior < 1:
        raise ValueError("pos_prior must be strictly between 0 and 1")

    if stop_words is None:
        stop_words = stopwords.words('english') + list(string.punctuation)
    # A set makes the per-token membership test constant time.
    stops = set(stop_words)

    # Count how often every non-stop word occurs in each class.
    pos_counts = {}
    neg_counts = {}
    for review, label in zip(train_set, train_labels):
        counts = pos_counts if label else neg_counts
        for word in review:
            if word not in stops:
                counts[word] = counts.get(word, 0) + 1

    # Laplace smoothing: (count(W) + a) / (n + a * (V + 1)), where
    #   n = number of word tokens seen for the class,
    #   V = number of distinct word types seen for the class,
    #   a = smoothing_parameter.
    pos_denominator = sum(pos_counts.values()) + smoothing_parameter * (len(pos_counts) + 1)
    neg_denominator = sum(neg_counts.values()) + smoothing_parameter * (len(neg_counts) + 1)

    # Log probabilities are used because the raw products underflow quickly.
    log_prob_pos = {}
    log_prob_neg = {}
    for word in pos_counts.keys() | neg_counts.keys():
        log_prob_pos[word] = math.log((pos_counts.get(word, 0) + smoothing_parameter) / pos_denominator)
        log_prob_neg[word] = math.log((neg_counts.get(word, 0) + smoothing_parameter) / neg_denominator)

    log_prior_pos = math.log(pos_prior)
    log_prior_neg = math.log(1 - pos_prior)

    predictions = []
    for review in dev_set:
        score_pos = log_prior_pos
        score_neg = log_prior_neg
        for word in review:
            # Words never seen in training (which includes all stop words) are skipped.
            if word in log_prob_pos:
                score_pos += log_prob_pos[word]
                score_neg += log_prob_neg[word]
        predictions.append(1 if score_pos > score_neg else 0)
    return predictions

In [17]:
######################### Mixed Model approach (unigram+bigram) ###################
# def naiveBayes(train_set, train_labels, dev_set, smoothing_parameter, pos_prior):
#     """
#     train_set - List of list of words corresponding with each movie review
#     example: suppose I had two reviews 'like this movie' and 'i fall asleep' in my training set
#     Then train_set := [['like','this','movie'], ['i','fall','asleep']]

#     train_labels - List of labels corresponding with train_set
#     example: Suppose I had two reviews, first one was positive and second one was negative.
#     Then train_labels := [1, 0]

#     dev_set - List of list of words corresponding with each review that we are testing on
#               It follows the same format as train_set

#     smoothing_parameter - The smoothing parameter you provided with --laplace (1.0 by default)
#     """
#     begin = time.process_time()
#     stops = stopwords.words('english') + list(string.punctuation)
#     mydict = {}
#     mybidict = {}
#     smoothing_parameter = 0.21
#     smoothing_parameter_bi = 0.75
#     posV = 0
#     posVbi = 0
#     negV = 0
#     posVbi = 0
#     negVbi = 0
#     totalposwords = 0
#     totalposwordsbi = 0
#     totalnegwords = 0
#     totalnegwordsbi = 0

#     print("creating occurences and wordlist")
#     #create bag of words and number of occurences

#     start = time.process_time()
#     count = 0
#     for x in train_set:
#         rating = train_labels[count]
#         count += 1
#         if(rating):
#             for y in x:
#                 if y in stops:
#                     continue
#                 if y not in mydict:
#                     mydict[y] = [1,0] #default [1 pos, 0 neg]
#                     posV += 1
#                     totalposwords += 1
#                 else:
#                     if mydict[y][0] == 0:
#                         posV += 1
#                     mydict[y][0] += 1
#                     totalposwords += 1
#             for y,z in zip(x, x[1:]):
#                 if y in stops or z in stops:
#                     continue
#                 if (y,z) not in mybidict:
#                     mybidict[(y,z)] = [1,0]
#                     posVbi += 1
#                     totalposwordsbi += 1
#                 else:
#                     if mybidict[(y,z)][0] == 0:
#                         posVbi += 1
#                     mybidict[(y,z)][0] += 1
#                     totalposwordsbi += 1
#                     # print ("[x,y] : ", [x,y])
#         else:
#             for y in x:
#                 if y in stops:
#                     continue
#                 if y not in mydict:
#                     mydict[y] = [0,1] #default [0 pos, 1 neg]
#                     negV += 1
#                     totalnegwords += 1
#                 else:
#                     if mydict[y][1] == 0:
#                         negV += 1
#                     mydict[y][1] += 1
#                     totalnegwords += 1
#             for y,z in zip(x, x[1:]):
#                 if y in stops or z in stops:
#                     continue
#                 if (y,z) not in mybidict:
#                     # print("(",y,",", z, ")")
#                     mybidict[(y,z)] = [0,1]
#                     negVbi += 1
#                     totalnegwordsbi += 1
#                 else:
#                     if mybidict[(y,z)][1] == 0:
#                         negVbi += 1
#                     mybidict[(y,z)][1] += 1
#                     totalnegwordsbi += 1
#     print("review count is: ", count)
#     print("posV", posV)
#     print("negV", negV)
#     print("total word count is:", totalposwords + totalnegwords)
#     print("posVbi", posVbi)
#     print("negVbi", negVbi)
#     print("total bi pair count is:", totalposwordsbi  + totalnegwordsbi)
#     print("Going through train took: ", time.process_time() - start)

    
#     #come up with the bag of words
#     probWordPos = {}
#     probWordNeg = {}
#     # PosII = 0
#     # NegII = 0
#     # print("calculate prob")
#     for x in mydict:
#         #use laplace smoothing
#         # count(W) + a / n + a * (V+1)
#         # n = number of words in our UK training data
#         # count(W) = number of times W appeared in UK training data
#         # α is a tuning constant between 0 and 1 (typically small)
#         # V = number of word TYPES seen in training data

#         probWordPos[x] = math.log((mydict[x][0] + smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
#         probWordNeg[x] = math.log((mydict[x][1] + smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
#         #calculate that II symbol that is basically summation, but using mutiplication
#         #logs are used because we are working with incredibly small numbers
#         # PosII += (probWordPos[x])
#         # NegII += (probWordNeg[x])
#     # start = time.process_time()
#     probPairPos = {}
#     probPairNeg = {}
#     for x in mybidict:
#         probPairPos[x] = math.log((mybidict[x][0] + smoothing_parameter_bi) / (totalposwordsbi + smoothing_parameter_bi * (posVbi + 1)))
#         probPairNeg[x] = math.log((mybidict[x][1] + smoothing_parameter_bi) / (totalnegwordsbi + smoothing_parameter_bi * (negVbi + 1)))
#     #unneccessary calculations on prob of train set
#     # print("time to print bi dict", time.process_time() - start)
#     # print("PosII", PosII)
#     # print("NegII", NegII)
#     # probPos = math.log(pos_prior) + PosII
#     # probNeg = math.log(1 - pos_prior) + NegII

#     # print("positive", probPos)
#     # print("negative", probNeg)
    
#     start = time.process_time()
#     # #multiply by (add log) the pos prior, which is the other part of our equation in the unigram    model
#     # time to work with the dev set
#     predictions = []
#     lambd = 0.475
#     lambdaUni = lambd
#     lambdaBi = 1 - lambd
#     for x in range(len(dev_set)):
#         chancePosUni = math.log(pos_prior)
#         chanceNegUni = math.log(1-pos_prior)
#         chancePosBi = math.log(pos_prior)
#         chanceNegBi = math.log(1-pos_prior)
#         for y in range(len(dev_set[x])):
#             if dev_set[x][y] in mydict:
#                 chancePosUni += probWordPos[dev_set[x][y]]
#                 chanceNegUni += probWordNeg[dev_set[x][y]]
#             # else:
#                 # chancePos += math.log((smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
#                 # chanceNeg += math.log((smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
#         for y,z in zip(dev_set[x], dev_set[x][1:]):
#             if (y,z) in mybidict:
#                 chancePosBi += probPairPos[(y,z)]
#                 chanceNegBi += probPairNeg[(y,z)]
#         chancePos = (lambdaBi * chancePosBi) + (lambdaUni * chancePosUni)
#         chanceNeg = (lambdaBi * chanceNegBi) + (lambdaUni * chanceNegUni)
#         if(chancePos > chanceNeg):
#             predictions.append(1)
#         else:
#             predictions.append(0)
#     print("devset time took:", time.process_time() - start)
#     print("method took:", time.process_time() - begin)
#     return predictions

In [18]:
def compute_accuracies(predictedLabels, dev_set, dev_labels):
    """
    Score predicted labels against the true labels.

    predictedLabels - Sequence (list or numpy array) of predicted labels, 1 = positive
    dev_set         - Unused; kept so existing callers do not have to change
    dev_labels      - Sequence (list or numpy array) of true labels, same length

    Returns (accuracy, f1, precision, recall). A metric whose denominator is zero
    (for example precision when nothing was predicted positive) is reported as 0.0
    instead of NaN; with empty input all four values are 0.0.

    Raises ValueError when the two label sequences differ in shape.
    """
    # Convert both sides so the comparison is element-wise even for two plain lists
    # (list == list would yield a single bool).
    predicted = np.asarray(predictedLabels)
    actual = np.asarray(dev_labels)
    if predicted.shape != actual.shape:
        raise ValueError("predictedLabels and dev_labels must have the same shape")
    if predicted.size == 0:
        return 0.0, 0.0, 0.0, 0.0

    matches = predicted == actual
    accuracy = np.mean(matches)

    true_pos = np.sum(matches & (predicted == 1))
    predicted_pos = np.sum(predicted == 1)
    # False negatives: predicted 0 while the true label is something else.
    false_neg = np.sum(~matches & (predicted == 0))

    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return accuracy, f1, precision, recall

In [19]:
def main(dataset, stemming, lowerCase, laplace, posPrior):
    """
    Run one complete experiment: load and split the data, classify the held-out
    reviews with naiveBayes, and score the predictions.

    dataset   - Folder handed to loadDataset
    stemming  - Forwarded to loadDataset
    lowerCase - Forwarded to loadDataset
    laplace   - Smoothing argument forwarded to naiveBayes
    posPrior  - Positive-class prior forwarded to naiveBayes

    Returns (accuracy, f1, precision, recall) as produced by compute_accuracies.
    """
    train_docs, train_targets, held_out_docs, held_out_targets = loadDataset(
        dataset, stemming, lowerCase
    )
    guesses = naiveBayes(train_docs, train_targets, held_out_docs, laplace, posPrior)
    acc, f_score, prec, rec = compute_accuracies(guesses, held_out_docs, held_out_targets)
    return acc, f_score, prec, rec

In [22]:
if __name__ == "__main__":
    # Driver cell: repeats the load -> train -> evaluate pipeline several times
    # (every call to main() reshuffles the data) and prints the per-run metrics
    # followed by their mean and standard deviation.
    dataset = "../TermProject/txt_sentoken"
    stemming = False  # explicit boolean; the former empty list only worked because [] is falsy
    lowerCase = True
    # Laplace smoothing constant handed to naiveBayes. 0.034 is the constant that
    # classifier was tuned with (the former 1.0 was never the effective value).
    laplace = 0.034
    # NOTE(review): with equally sized pos/neg folders the split is class-balanced,
    # so confirm that a 0.8 positive prior is intentional.
    posPrior = 0.8
    accuracy = []
    f1 = []
    precision = []
    recall = []
    numberOfRuntimes = 10

    # Seed once, before the first shuffle, so the whole series of runs can be
    # repeated (given a stable directory listing order).
    random.seed(0)

    for i in range(numberOfRuntimes):
        curaccuracy, curf1, curprecision, currecall = main(dataset, stemming, lowerCase, laplace, posPrior)
        accuracy.append(curaccuracy)
        f1.append(curf1)
        precision.append(curprecision)
        recall.append(currecall)
        print("RUN NUMBER " + str(i+1) + " ---------------")
        print("Accuracy:",curaccuracy)
        print("F1-Score:",curf1)
        print("Precision:",curprecision)
        print("Recall:",currecall)

    # Aggregate statistics over all runs.
    aveAccuracy = np.mean(accuracy)
    avef1 = np.mean(f1)
    avePrecision = np.mean(precision)
    aveRecall = np.mean(recall)
    stdAccuracy = np.std(accuracy)
    stdf1 = np.std(f1)
    stdPrecision = np.std(precision)
    stdRecall = np.std(recall)
    print("Final results----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2666.58it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3075.68it/s]


RUN NUMBER 1 ---------------
Accuracy: 0.8
F1-Score: 0.7989949748743719
Precision: 0.803030303030303
Recall: 0.795


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2364.80it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2832.42it/s]


RUN NUMBER 2 ---------------
Accuracy: 0.8325
F1-Score: 0.8312342569269522
Precision: 0.8375634517766497
Recall: 0.825


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2966.52it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2931.80it/s]


RUN NUMBER 3 ---------------
Accuracy: 0.81
F1-Score: 0.8080808080808082
Precision: 0.8163265306122449
Recall: 0.8


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3066.29it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3308.89it/s]


RUN NUMBER 4 ---------------
Accuracy: 0.785
F1-Score: 0.788177339901478
Precision: 0.7766990291262136
Recall: 0.8


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2709.93it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3224.05it/s]


RUN NUMBER 5 ---------------
Accuracy: 0.775
F1-Score: 0.7783251231527093
Precision: 0.7669902912621359
Recall: 0.79


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2033.68it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1920.83it/s]


RUN NUMBER 6 ---------------
Accuracy: 0.8
F1-Score: 0.8029556650246306
Precision: 0.7912621359223301
Recall: 0.815


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3002.03it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2906.30it/s]


RUN NUMBER 7 ---------------
Accuracy: 0.7875
F1-Score: 0.7880299251870324
Precision: 0.7860696517412935
Recall: 0.79


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2785.22it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3075.67it/s]


RUN NUMBER 8 ---------------
Accuracy: 0.7575
F1-Score: 0.7581047381546135
Precision: 0.7562189054726368
Recall: 0.76


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2800.78it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2914.76it/s]


RUN NUMBER 9 ---------------
Accuracy: 0.76
F1-Score: 0.7611940298507464
Precision: 0.7574257425742574
Recall: 0.765


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2638.44it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2975.31it/s]


RUN NUMBER 10 ---------------
Accuracy: 0.75
F1-Score: 0.7487437185929648
Precision: 0.7525252525252525
Recall: 0.745
Final results----------------------------------
Average Accuracy: 0.78575
Average F1: 0.7863840579746307
Average Precision: 0.7844111294043318
Average recall 0.7885
STD Accuracy: 0.02457259652539798
STD F1: 0.024161677220446418
STD Precision: 0.026726029727975897
STD Recall: 0.023669600757089244
