In [132]:
import sys
import time
import math
import copy
import random
import string
import glob
import numpy as np
import nltk.sentiment.util
from tqdm import tqdm
from os import listdir
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.metrics import *

dataset = "../TermProject/txt_sentoken"
stemming = False
lowerCase = True
stop = True
alpha = True
negation = True
laplace = 1.0
posPrior = 0.8

In [133]:
def loadReviewNames(posDir, negDir):
    positive = glob.glob(posDir)
    negative = glob.glob(negDir)
#     print(positive)
#     print(negative)
    return positive, negative

In [134]:
#loads dataset in a way that will work with scikit
def getPosNegReviews(directory):
    positive, negative = loadReviewNames(directory + '/pos/*',directory + '/neg/*')
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    
    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    return trainPos,trainNeg,testPos,testNeg

In [135]:
def loadDir(name,stemming,lower_case):
    # Loads the files in the folder and returns a list of lists of words from the text in each file
    if stemming:
        porter_stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    data = []
    count = 0
    for f in tqdm(listdir(name)):
        fullname = name+f
        text = []
        with open(fullname, 'rb') as f:
            for line in f:
                if lower_case:
                    line = line.decode(errors='ignore').lower()
                    text += tokenizer.tokenize(line)
                else:
                    text += tokenizer.tokenize(line.decode(errors='ignore'))
        if stemming:
            for i in range(len(text)):
#                 if text[i] in bad_words:
#                     continue
                text[i] = porter_stemmer.stem(text[i])
        data.append(text)
        count = count + 1
    return data

In [136]:
#loads dataset in a way that will work with my unigram Naive bayes implementation
def loadDatasetNB(directory, stemming, lower_case):
    positive = loadDir(directory + '/pos/',stemming, lower_case)
    negative = loadDir(directory + '/neg/',stemming, lower_case)
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    combinedTrain = trainPos + trainNeg
    length = len(trainPos) + len(trainNeg)
    labelsTrain = len(trainNeg) * [1] + len(trainNeg) * [0]
    labelsTrain = np.array(labelsTrain)

    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    combinedTest = testPos + testNeg
    labelsTest = len(testPos) * [1] + len(testNeg) * [0]
    labelsTest = np.array(labelsTest)
    return combinedTrain, labelsTrain, combinedTest, labelsTest

In [137]:
    ############  running Naive Bayes #############

In [138]:
#helper function for unigram Naive bayes implementation
def bagOfWordsNB(train_set, train_labels):
    stops = stopwords.words('english') + list(string.punctuation)
    mydict = {}
    smoothing_parameter = 0.034
    posV = 0
    negV = 0
    totalposwords = 0
    totalnegwords = 0

    start = time.process_time()
#     print("creating occurences and wordlist")

    #create bag of words and number of occurences
    count = 0
    for x in train_set:
        rating = train_labels[count]
        count += 1
        if(rating):
            for y in x:
                if y not in mydict and y not in stops:
                    mydict[y] = [1,0] #default [1 pos, 0 neg]
                    posV += 1
                    totalposwords += 1
                elif y not in stops:
                    if mydict[y][0] == 0:
                        posV += 1
                    mydict[y][0] += 1
                    totalposwords += 1
        else:
            for y in x:
                if y not in mydict and y not in stops:
                    mydict[y] = [0,1] #default [0 pos, 1 neg]
                    negV += 1
                    totalnegwords += 1
                elif y not in stops:
                    if mydict[y][1] == 0:
                        negV += 1
                    mydict[y][1] += 1
                    totalnegwords += 1
#     print("review count is: ", count)
#     print("posV", posV)
#     print("negV", negV)
#     print("total word count is:", totalposwords + totalnegwords)
    BOW = mydict, posV, negV, totalposwords, totalnegwords
    return BOW

In [139]:
def naiveBayes(train_set, train_labels, dev_set, smoothing_parameter, pos_prior):
    #Baseline#
    # return predicted labels of development set
    # print("not even started yet")
    smoothing_parameter = 0.034

#     start = time.process_time()

#     print("Going through train set took: ", time.process_time() - start)
    mydict, posV, negV, totalposwords, totalnegwords = bagOfWordsNB(train_set, train_labels)
    
    
    #come up with the bag of words unigram model
    probWordPos = {}
    probWordNeg = {}

    for x in mydict:
        #use laplace smoothing
        # count(W) + a / n + a * (V+1)
        # n = number of words in our training data
        # count(W) = number of times W appeared in training data
        # α is a tuning constant between 0 and 1 (typically small)
        # V = number of word TYPES seen in training data

        probWordPos[x] = math.log((mydict[x][0] + smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
        probWordNeg[x] = math.log((mydict[x][1] + smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        
    start = time.process_time()

    # dev set
    predictions = []
    for x in range(len(dev_set)):
        chancePos = math.log(pos_prior)
        chanceNeg = math.log(1-pos_prior)
        for y in range(len(dev_set[x])):
            if dev_set[x][y] in mydict:
                chancePos += probWordPos[dev_set[x][y]]
                chanceNeg += probWordNeg[dev_set[x][y]]
            # else:
                # chancePos += math.log((smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
                # chanceNeg += math.log((smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        if(chancePos > chanceNeg):
            predictions.append(1)
        else:
            predictions.append(0)
#     print("devset time took:", time.process_time() - start)
    return predictions

In [140]:
def compute_accuracies(predictedLabels, dev_set, dev_labels):
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [141]:
numberOfRuntimes = 3
##Naive bayes
accuracyNB = []
f1NB = []
precisionNB = []
recallNB = []
for i in range(numberOfRuntimes):    
    trainSet, trainLabels, revSet, revLabels = loadDatasetNB(dataset, stemming, lowerCase)
    predictedLabels = naiveBayes(trainSet, trainLabels, revSet, laplace, posPrior)
    NBscores = compute_accuracies(predictedLabels, revSet, revLabels)
    accuracyNB.append(NBscores[0])
    f1NB.append(NBscores[1])
    precisionNB.append(NBscores[2])
    recallNB.append(NBscores[3])
#     print("RUN NUMBER " + str(i+1) + " ---------------")
#     print("Accuracy:",NBscores[0])
#     print("F1-Score:",NBscores[1])
#     print("Precision:",NBscores[2])
#     print("Recall:",NBscores[3])

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2033.66it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2029.73it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2966.25it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3163.00it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3019.84it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3075.71it/s]


In [142]:
    aveAccuracy = np.mean(accuracyNB)
    avef1 = np.mean(f1NB)
    avePrecision = np.mean(precisionNB)
    aveRecall = np.mean(recallNB)
    stdAccuracy = np.std(accuracyNB)
    stdf1 = np.std(f1NB)
    stdPrecision = np.std(precisionNB)
    stdRecall = np.std(recallNB)
    print("Final results NAIVE BAYES----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)

Final results NAIVE BAYES----------------------------------
Average Accuracy: 0.7799999999999999
Average F1: 0.7808754208754211
Average Precision: 0.7771201814058957
Average recall 0.785
STD Accuracy: 0.012247448713915901
STD F1: 0.015270316141720093
STD Precision: 0.004762012740380342
STD Recall: 0.02677063067368166


In [143]:
################ pre processing for Scikit models ################    

In [144]:
    trainPos,trainNeg,testPos,testNeg = getPosNegReviews(dataset)

In [145]:
def create_bag_of_words(fileName, reviews):
    word_bag = {}
    stops = stopwords.words('english') + list(string.punctuation)
    for review in reviews:
        with open(review, 'r') as f:
            line = f.read()
            line = line.split(" ")
            if stop:
                line = [word for word in line if word not in set(stops)]
            if alpha:
                line = [word for word in line if word.isalpha()]
            for word in line:
                word_bag[word] = word_bag.get(word, 0) + 1
    with open(fileName, 'w') as f:
        for _ in sorted(word_bag, key=word_bag.get, reverse=True):
            f.write("%s %d\n" % (_, word_bag[_]))
    return word_bag

In [146]:
    nWbag = create_bag_of_words("negBag.txt",trainNeg)
#     print(len(nWbag.keys()))
#     print(nWbag.keys())
#     print(nWbag)

In [147]:
#     pWbag = create_bag_of_words("posBag.txt",trainPos)
#     print(len(pWbag.keys()))
#     print(pWbag.keys())
#     print(pWbag)

In [148]:
def statistically_split_word_lists(pWbag, nWbag):
    posList = {}
    negList = {}
    thresh = 1.2
    for key in nWbag.keys():
        if key in pWbag.keys():
            posFreq = pWbag[key]
            negFreq = nWbag[key]
            if int(posFreq) >= thresh*int(negFreq):
                posList[key] = pWbag[key]
            elif int(posFreq) < thresh*int(negFreq):
                negList[key] = nWbag[key]
        else:
            negList[key] = nWbag[key]
    for key in pWbag.keys():
        if key not in nWbag.keys():
            posList[key] = pWbag[key]
    print("first filter")
    print(len(posList))
    print(len(negList))
    #determine the boundary size
    max_length=min(len(negList),len(posList))
    print("max_length")
    print(max_length)
    #sort the keys of the map into a list, sorting to get most frequently used words
    sortedNegList = sorted(negList, key=negList.get, reverse=True)[:max_length]
    sortedPosList = sorted(posList, key=posList.get, reverse=True)[:max_length]
    print("sorted max filter")
    print(len(sortedNegList))
    print(len(sortedPosList))
    #turn the sorted list into a usable mapping again
    new_neg= {k:negList[k] for k in sortedNegList}
    new_pos = {k:posList[k] for k in sortedPosList}
    print("k filter")
    print(len(new_pos))
    print(len(new_neg))
    return new_neg,new_pos

In [149]:
    neg_keys, pos_keys = statistically_split_word_lists(nWbag, pWbag)

first filter
12830
21301
max_length
12830
sorted max filter
12830
12830
k filter
12830
12830


In [150]:
def generate_svm_featureset(neg_bow,pos_bow):
    review_word_index = []
    for word in neg_bow.keys():
        review_word_index.append(word)
    for word in pos_bow.keys():
        review_word_index.append(word)
    review_word_index = sorted(list(set(review_word_index)))
    return review_word_index

In [151]:
    vocab = generate_svm_featureset(pWbag,nWbag)
    print(vocab)






In [152]:
def myTokenize(line, stop=True, negation=True):
    line = line.split(" ")
    stops = stopwords.words('english') + list(string.punctuation)
    if stop:
        line = [word for word in line if word not in set(stops)]
    if alpha:
        line = [word for word in line if word.isalpha()]
#     if negation:
#         line = nltk.sentiment.util.mark_negation(line)
    return line

In [153]:
    cv = CountVectorizer(input='filename', tokenizer=myTokenize, lowercase=True, vocabulary=vocab)
    print(cv)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='filename',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function myTokenize at 0x000002C7269C1950>,
                vocabulary=['aaa', 'aahs', 'aaliyah', 'aalyah', 'aamir',
                            'aardman', 'aaron', 'aatish', 'ab', 'aback',
                            'abandon', 'abandoned', 'abandoning', 'abandonment',
                            'abandons', 'abba', 'abbe', 'abberation',
                            'abberline', 'abbots', 'abbott', 'abbotts', 'abby',
                            'abc', 'abdomen', 'abducted', 'abductees',
                            'abduction', 'abductions', 'abe', ...])


In [154]:
    trainFileNames = trainPos + trainNeg
    testFileNames = testPos + testNeg

In [155]:
    trainSet = cv.fit_transform(trainFileNames)

In [156]:
    testSet = cv.fit_transform(testFileNames)

In [157]:
def compute_accuracies(predictedLabels):
    dev_labels = [1]*200+[0]*200
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [158]:
    ###################### Logistic regression run ###########################

In [159]:
def logisticRegression(trainSet, testSet):
    #consider using strip_accents
    LRclassifier = LogisticRegression()
    labels = [1]*800+[0]*800
    LRclassifier.fit(trainSet,labels)
    predictions = LRclassifier.predict(testSet)
    return predictions

In [160]:
    predictedLabelsLR = logisticRegression(trainSet, testSet)
    testLabels = [1]*200+[0]*200
    print(len(predictedLabelsLR))
    print(predictedLabelsLR)

400
[1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1
 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0]


In [161]:
    LRscores = compute_accuracies(predictedLabelsLR)

In [162]:
    print("Accuracy:",LRscores[0])
    print("F1-Score:",LRscores[1])
    print("Precision:",LRscores[2])
    print("Recall:",LRscores[3])

Accuracy: 0.84
F1-Score: 0.8423645320197045
Precision: 0.8300970873786407
Recall: 0.855


In [163]:
#################### Support Vector Machine run #########################

In [164]:
def supportVectorMachine(trainSet, testSet):
    SVMclassifier = LinearSVC()
    labels = [1]*800+[0]*800
    SVMclassifier.fit(trainSet,labels)
    predictions = SVMclassifier.predict(testSet)
    return predictions

In [165]:
    predictedLabelsSVM = supportVectorMachine(trainSet, testSet)

In [166]:
    print(predictedLabelsSVM)

[1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [167]:
    SVMscores = compute_accuracies(predictedLabelsSVM)

In [168]:
    print("Accuracy:",SVMscores[0])
    print("F1-Score:",SVMscores[1])
    print("Precision:",SVMscores[2])
    print("Recall:",SVMscores[3])

Accuracy: 0.85
F1-Score: 0.8492462311557788
Precision: 0.8535353535353535
Recall: 0.845


In [172]:
########################### Decision Tree run ###############################

In [175]:
def decisionTree(trainSet, testSet):
    SVMclassifier = DecisionTreeClassifier()
    labels = [1]*800+[0]*800
    SVMclassifier.fit(trainSet,labels)
    predictions = SVMclassifier.predict(testSet)
    return predictions

In [176]:
    predictedLabelsDT = decisionTree(trainSet, testSet)

In [177]:
    DTscores = compute_accuracies(predictedLabelsDT)

In [178]:
    print("Accuracy:",DTscores[0])
    print("F1-Score:",DTscores[1])
    print("Precision:",DTscores[2])
    print("Recall:",DTscores[3])

Accuracy: 0.645
F1-Score: 0.6650943396226414
Precision: 0.6294642857142857
Recall: 0.705
