In [234]:
import sys
import time
import math
import copy
import random
import string
import numpy as np
from tqdm import tqdm
from os import listdir
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


# nltk.download('stopwords')

In [235]:
def loadDataset(directory, stemming, lower_case):
    positive = loadDir(directory + '/pos/',stemming, lower_case)
    negative = loadDir(directory + '/neg/',stemming, lower_case)
    random.shuffle(positive)
    boundaryTrain = math.floor(0.8 * len(positive))
    trainPos = positive[:boundaryTrain]
    random.shuffle(negative)
    trainNeg = negative[:boundaryTrain]
    combinedTrain = trainPos + trainNeg
    length = len(trainPos) + len(trainNeg)
    labelsTrain = len(trainNeg) * [1] + len(trainNeg) * [0]
    labelsTrain = np.array(labelsTrain)

    testPos = positive[boundaryTrain:]
    testNeg = negative[boundaryTrain:]
    combinedTest = testPos + testNeg
    labelsTest = len(testPos) * [1] + len(testNeg) * [0]
    labelsTest = np.array(labelsTest)
    print(labelsTrain)
    return combinedTrain, labelsTrain, combinedTest, labelsTest

In [236]:
def loadDir(name,stemming,lower_case):
    # Loads the files in the folder and returns a list of lists of words from the text in each file
    if stemming:
        porter_stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    data = []
    count = 0
    for f in tqdm(listdir(name)):
        fullname = name+f
        text = []
        with open(fullname, 'rb') as f:
            for line in f:
                if lower_case:
                    line = line.decode(errors='ignore').lower()
                    text += tokenizer.tokenize(line)
                else:
                    text += tokenizer.tokenize(line.decode(errors='ignore'))
        if stemming:
            for i in range(len(text)):
#                 if text[i] in bad_words:
#                     continue
                text[i] = porter_stemmer.stem(text[i])
        data.append(text)
        count = count + 1
    return data

In [237]:
def generate_svm_featureset(neg_bow,pos_bow):
    review_word_index = []
    for word in neg_bow.keys():
        review_word_index.append(word)
    for word in pos_bow.keys():
        review_word_index.append(word)
    review_word_index = sorted(list(set(review_word_index)))
    return review_word_index

In [238]:
def bagOfWords(train_set, train_labels):
    stops = stopwords.words('english') + list(string.punctuation)
    count = 0
    mydict = {}
    posV = 0
    negV = 0
    totalposwords = 0
    totalnegwords = 0
    for x in train_set:
        rating = train_labels[count]
        count += 1
        if(rating):
            for y in x:
                if y not in mydict and y not in stops:
                    mydict[y] = [1,0] #default [1 pos, 0 neg]
                    posV += 1
                    totalposwords += 1
                elif y not in stops:
                    if mydict[y][0] == 0:
                        posV += 1
                    mydict[y][0] += 1
                    totalposwords += 1
        else:
            for y in x:
                if y not in mydict and y not in stops:
                    mydict[y] = [0,1] #default [0 pos, 1 neg]
                    negV += 1
                    totalnegwords += 1
                elif y not in stops:
                    if mydict[y][1] == 0:
                        negV += 1
                    mydict[y][1] += 1
                    totalnegwords += 1
#     print("review count is: ", count)
#     print("posV", posV)
#     print("negV", negV)
#     print("total word count is:", totalposwords + totalnegwords)
    BOW = mydict, posV, negV, totalposwords, totalnegwords
    return BOW

In [239]:
def naiveBayes(train_set, train_labels, dev_set, smoothing_parameter, pos_prior):
    #Baseline#
    # return predicted labels of development set
    # print("not even started yet")
    smoothing_parameter = 0.034

#     start = time.process_time()

#     print("Going through train set took: ", time.process_time() - start)
    mydict, posV, negV, totalposwords, totalnegwords = bagOfWords(train_set, train_labels)
    
    
    #come up with the bag of words unigram model
    probWordPos = {}
    probWordNeg = {}
#     start = time.process_time()
#     print("calculate prob")
    for x in mydict:
        #use laplace smoothing
        # count(W) + a / n + a * (V+1)
        # n = number of words in our UK training data
        # count(W) = number of times W appeared in UK training data
        # α is a tuning constant between 0 and 1 (typically small)
        # V = number of word TYPES seen in training data

        probWordPos[x] = math.log((mydict[x][0] + smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
        probWordNeg[x] = math.log((mydict[x][1] + smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        #calculate that II symbol that is basically summation, but using mutiplication
        #logs are used because we are working with incredibly small numbers
        # PosII += (probWordPos[x])
        # NegII += (probWordNeg[x])
    

    #unneccessary calculations on prob of train set
#     print("Prob calculations: ", time.process_time() - start)
    # print("PosII", PosII)
    # print("NegII", NegII)
    # probPos = math.log(pos_prior) + PosII
    # probNeg = math.log(1 - pos_prior) + NegII

    # print("positive", probPos)
    # print("negative", probNeg)


    start = time.process_time()
    # #multiply by (add log) the pos prior, which is the other part of our equation in the unigram model
    # time to work with the dev set
    predictions = []
    for x in range(len(dev_set)):
        chancePos = math.log(pos_prior)
        chanceNeg = math.log(1-pos_prior)
        for y in range(len(dev_set[x])):
            if dev_set[x][y] in mydict:
                chancePos += probWordPos[dev_set[x][y]]
                chanceNeg += probWordNeg[dev_set[x][y]]
            # else:
                # chancePos += math.log((smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
                # chanceNeg += math.log((smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
        if(chancePos > chanceNeg):
            predictions.append(1)
        else:
            predictions.append(0)
#     print("devset time took:", time.process_time() - start)
    return predictions

In [240]:
######################### Mixed Model approach (unigram+bigram) ###################
# def naiveBayes(train_set, train_labels, dev_set, smoothing_parameter, pos_prior):
#     """
#     train_set - List of list of words corresponding with each movie review
#     example: suppose I had two reviews 'like this movie' and 'i fall asleep' in my training set
#     Then train_set := [['like','this','movie'], ['i','fall','asleep']]

#     train_labels - List of labels corresponding with train_set
#     example: Suppose I had two reviews, first one was positive and second one was negative.
#     Then train_labels := [1, 0]

#     dev_set - List of list of words corresponding with each review that we are testing on
#               It follows the same format as train_set

#     smoothing_parameter - The smoothing parameter you provided with --laplace (1.0 by default)
#     """
#     begin = time.process_time()
#     stops = stopwords.words('english') + list(string.punctuation)
#     mydict = {}
#     mybidict = {}
#     smoothing_parameter = 0.21
#     smoothing_parameter_bi = 0.75
#     posV = 0
#     posVbi = 0
#     negV = 0
#     posVbi = 0
#     negVbi = 0
#     totalposwords = 0
#     totalposwordsbi = 0
#     totalnegwords = 0
#     totalnegwordsbi = 0

#     print("creating occurences and wordlist")
#     #create bag of words and number of occurences

#     start = time.process_time()
#     count = 0
#     for x in train_set:
#         rating = train_labels[count]
#         count += 1
#         if(rating):
#             for y in x:
#                 if y in stops:
#                     continue
#                 if y not in mydict:
#                     mydict[y] = [1,0] #default [1 pos, 0 neg]
#                     posV += 1
#                     totalposwords += 1
#                 else:
#                     if mydict[y][0] == 0:
#                         posV += 1
#                     mydict[y][0] += 1
#                     totalposwords += 1
#             for y,z in zip(x, x[1:]):
#                 if y in stops or z in stops:
#                     continue
#                 if (y,z) not in mybidict:
#                     mybidict[(y,z)] = [1,0]
#                     posVbi += 1
#                     totalposwordsbi += 1
#                 else:
#                     if mybidict[(y,z)][0] == 0:
#                         posVbi += 1
#                     mybidict[(y,z)][0] += 1
#                     totalposwordsbi += 1
#                     # print ("[x,y] : ", [x,y])
#         else:
#             for y in x:
#                 if y in stops:
#                     continue
#                 if y not in mydict:
#                     mydict[y] = [0,1] #default [0 pos, 1 neg]
#                     negV += 1
#                     totalnegwords += 1
#                 else:
#                     if mydict[y][1] == 0:
#                         negV += 1
#                     mydict[y][1] += 1
#                     totalnegwords += 1
#             for y,z in zip(x, x[1:]):
#                 if y in stops or z in stops:
#                     continue
#                 if (y,z) not in mybidict:
#                     # print("(",y,",", z, ")")
#                     mybidict[(y,z)] = [0,1]
#                     negVbi += 1
#                     totalnegwordsbi += 1
#                 else:
#                     if mybidict[(y,z)][1] == 0:
#                         negVbi += 1
#                     mybidict[(y,z)][1] += 1
#                     totalnegwordsbi += 1
#     print("review count is: ", count)
#     print("posV", posV)
#     print("negV", negV)
#     print("total word count is:", totalposwords + totalnegwords)
#     print("posVbi", posVbi)
#     print("negVbi", negVbi)
#     print("total bi pair count is:", totalposwordsbi  + totalnegwordsbi)
#     print("Going through train took: ", time.process_time() - start)

    
#     #come up with the bag of words
#     probWordPos = {}
#     probWordNeg = {}
#     # PosII = 0
#     # NegII = 0
#     # print("calculate prob")
#     for x in mydict:
#         #use laplace smoothing
#         # count(W) + a / n + a * (V+1)
#         # n = number of words in our UK training data
#         # count(W) = number of times W appeared in UK training data
#         # α is a tuning constant between 0 and 1 (typically small)
#         # V = number of word TYPES seen in training data

#         probWordPos[x] = math.log((mydict[x][0] + smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
#         probWordNeg[x] = math.log((mydict[x][1] + smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
#         #calculate that II symbol that is basically summation, but using mutiplication
#         #logs are used because we are working with incredibly small numbers
#         # PosII += (probWordPos[x])
#         # NegII += (probWordNeg[x])
#     # start = time.process_time()
#     probPairPos = {}
#     probPairNeg = {}
#     for x in mybidict:
#         probPairPos[x] = math.log((mybidict[x][0] + smoothing_parameter_bi) / (totalposwordsbi + smoothing_parameter_bi * (posVbi + 1)))
#         probPairNeg[x] = math.log((mybidict[x][1] + smoothing_parameter_bi) / (totalnegwordsbi + smoothing_parameter_bi * (negVbi + 1)))
#     #unneccessary calculations on prob of train set
#     # print("time to print bi dict", time.process_time() - start)
#     # print("PosII", PosII)
#     # print("NegII", NegII)
#     # probPos = math.log(pos_prior) + PosII
#     # probNeg = math.log(1 - pos_prior) + NegII

#     # print("positive", probPos)
#     # print("negative", probNeg)
    
#     start = time.process_time()
#     # #multiply by (add log) the pos prior, which is the other part of our equation in the unigram    model
#     # time to work with the dev set
#     predictions = []
#     lambd = 0.475
#     lambdaUni = lambd
#     lambdaBi = 1 - lambd
#     for x in range(len(dev_set)):
#         chancePosUni = math.log(pos_prior)
#         chanceNegUni = math.log(1-pos_prior)
#         chancePosBi = math.log(pos_prior)
#         chanceNegBi = math.log(1-pos_prior)
#         for y in range(len(dev_set[x])):
#             if dev_set[x][y] in mydict:
#                 chancePosUni += probWordPos[dev_set[x][y]]
#                 chanceNegUni += probWordNeg[dev_set[x][y]]
#             # else:
#                 # chancePos += math.log((smoothing_parameter) / (totalposwords + smoothing_parameter * (posV + 1)))
#                 # chanceNeg += math.log((smoothing_parameter) / (totalnegwords + smoothing_parameter * (negV + 1)))
#         for y,z in zip(dev_set[x], dev_set[x][1:]):
#             if (y,z) in mybidict:
#                 chancePosBi += probPairPos[(y,z)]
#                 chanceNegBi += probPairNeg[(y,z)]
#         chancePos = (lambdaBi * chancePosBi) + (lambdaUni * chancePosUni)
#         chanceNeg = (lambdaBi * chanceNegBi) + (lambdaUni * chanceNegUni)
#         if(chancePos > chanceNeg):
#             predictions.append(1)
#         else:
#             predictions.append(0)
#     print("devset time took:", time.process_time() - start)
#     print("method took:", time.process_time() - begin)
#     return predictions

In [241]:
def logisticRegression(train_set, train_labels, dev_set):
    stops = stopwords.words('english') + list(string.punctuation)
    cv = CountVectorizer(tokenizer = tokenizer, stop_words = stops)
    LRclassifier = LogisticRegression()
    trainSet = cv.fit_transform(train_set)
#     for i in range(len(train_labels)):
#         labelSet += (fillTrainset(train_set[i],train_labels[i]))
#         trainSet += train_set[i]
#     print(labelSet)
#     print(train_set)
    print(len(trainSet))
    print(len(labelSet))
    LRclassifier.fit(train_set,labelSet)
    predictions = LRclassifier.predict(dev_set)
    return predictions

In [242]:
def supportVectorMachine(train_set, train_labels, dev_set):
    SVMclassifier = LinearSVC()
    SVMclassifier.fit(train_set,train_labels)
    predictions = SVMclassifier.predict(dev_set)
    return predictions

In [243]:
def decisionTree(train_set, train_labels, dev_set):
    DTclassifier = DecisionTreeClassifier()
    DTclassifier.fit(train_set,train_labels)
    predictions = DTclassifier.predict(dev_set)
    return predictions

In [244]:
def compute_accuracies(predictedLabels, dev_set, dev_labels):
    yhats = predictedLabels
    accuracy = np.mean(yhats == dev_labels)
    tp = np.sum([yhats[i] == dev_labels[i] and yhats[i] == 1 for i in range(len(yhats))])
    precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
    recall = tp / (np.sum([yhats[i] != dev_labels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return accuracy, f1, precision, recall

In [245]:
def splitData(train_set, train_labels, dev_set, dev_labels):
    labelSet = []
    trainSet = []
    dataSet = []
    dataLabels = []
    for i in range(len(train_labels)):
        labelSet += (fillTrainset(train_set[i],train_labels[i]))
        trainSet += train_set[i]
    dataSet = train_set + dev_set
    dataLabels = train_labels + dev_labels
    trainSet, trainLabels, revSet, revLabels = train_test_split(dataSet, dataLabels, test_size=0.2)
    return trainSet, trainLabels, revSet, revLabels

In [246]:
def tokenizer(line, stop=True, negation=True):
    line = line.split(" ")
    if stop:
        line = [word for word in line if word not in set(nltk.corpus.stopwords.words('english'))]
    if negation:
        line = nltk.sentiment.util.mark_negation(line)
    return line

In [247]:
def fillTrainset(review, label):
    labelSet = []
    if label == 0:
        labelSet = [0]*len(review)
    if label == 1:
        labelSet = [1]*len(review)
    return labelSet

In [248]:
def main(dataset, stemming, lowerCase, laplace, posPrior):
    trainSet, trainLabels, revSet, revLabels = loadDataset(dataset, stemming, lowerCase)
    mydict, posV, negV, totalposwords, totalnegwords = bagOfWords(trainSet, trainLabels)
    print(mydict)
    print(len(trainSet))
    print(len(trainLabels))
    print(trainSet[0])
    print(trainSet[1])
    print(trainLabels[0])
    print(trainLabels[1])
    print(sorted(list(set(dataset))))
#     trainSet, trainLabels, revSet, revLabels = splitData(trainSet, trainLabels, revSet, revLabels)

    predictedLabelsNB = naiveBayes(trainSet, trainLabels, revSet, laplace, posPrior)
    predictedLabelsLR = logisticRegression(trainSet, trainLabels, revSet)
    predictedLabelsSVM = supportVectorMachine(trainSet, trainLabels, revSet)
    predictedLabelsDT = decisionTree(trainSet, trainLabels, revSet)
    
    accuracyNB, f1NB, precisionNB, recallNB = compute_accuracies(predictedLabelsNB, revSet, revLabels)
    accuracyLR, f1LR, precisionLR, recallLR = compute_accuracies(predictedLabelsLR, revSet, revLabels)
    accuracySVM, f1SVM, precisionSVM, recallSVM = compute_accuracies(predictedLabelsSVM, revSet, revLabels)
    accuracyDT, f1DT, precisionDT, recallDT = compute_accuracies(predictedLabelsDT, revSet, revLabels)
    
    NBscores = accuracyNB, f1NB, precisionNB, recallNB
    LRscores = accuracyLR, f1LR, precisionLR, recallLR
    SVMscores = accuracySVM, f1SVM, precisionSVM, recallSVM
    DTscores = accuracyDT, f1DT, precisionDT, recallDT
#     print("Accuracy:",accuracy)
#     print("F1-Score:",f1)
#     print("Precision:",precision)
#     print("Recall:",recall)
    return NBscores, LRscores, SVMscores, DTscores

In [249]:
if __name__ == "__main__":
    dataset = "../TermProject/txt_sentoken"
    stemming = False
    lowerCase = True
    laplace = 1.0
    posPrior = 0.8
    
    ##Naive bayes
    accuracyNB = []
    f1NB = []
    precisionNB = []
    recallNB = []
    
    #Logistic regression
    accuracyLR = []
    f1LR = []
    precisionLR = []
    recallLR = []
    
    #Support Vector Machine
    accuracySVM = []
    f1SVM = []
    precisionSVM = []
    recallSVM = []
    
    #Decision Tree
    accuracyDT = []
    f1DT = []
    precisionDT = []
    recallDT = []
    
    numberOfRuntimes = 5
    for i in range(numberOfRuntimes):
        NBscores, LRscores, SVMscores = main(dataset, stemming, lowerCase, laplace, posPrior)
        accuracyNB.append(NBscores[0])
        f1NB.append(NBscores[1])
        precisionNB.append(NBscores[2])
        recallNB.append(NBscores[3])
        
        accuracyLR.append(LRscores[0])
        f1LR.append(LRscores[1])
        precisionLR.append(LRscores[2])
        recallLR.append(LRscores[3])
        
        accuracySVM.append(SVMscores[0])
        f1SVM.append(SVMscores[1])
        precisionSVM.append(SVMscores[2])
        recallSVM.append(SVMscores[3])
        
        accuracyDT.append(DTscores[0])
        f1DT.append(DTscores[1])
        precisionDT.append(DTscores[2])
        recallDT.append(DTscores[3])
        
#         print("RUN NUMBER " + str(i+1) + " ---------------")
#         print("Accuracy:",curaccuracy)
#         print("F1-Score:",curf1)
#         print("Precision:",curprecision)
#         print("Recall:",currecall)

    #RESULTS OF NAIVE BAYES (unigram) 
    aveAccuracy = np.mean(accuracyNB)
    avef1 = np.mean(f1NB)
    avePrecision = np.mean(precisionNB)
    aveRecall = np.mean(recallNB)
    stdAccuracy = np.std(accuracyNB)
    stdf1 = np.std(f1NB)
    stdPrecision = np.std(precisionNB)
    stdRecall = np.std(recallNB)
    print("Final results NAIVE BAYES----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF LOGISTIC REGRESSION
    aveAccuracy = np.mean(accuracyLR)
    avef1 = np.mean(f1LR)
    avePrecision = np.mean(precisionLR)
    aveRecall = np.mean(recallLR)
    stdAccuracy = np.std(accuracyLR)
    stdf1 = np.std(f1LR)
    stdPrecision = np.std(precisionLR)
    stdRecall = np.std(recallLR)
    print("Final results  LOGISTIC REGRESSION----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF SUPPORT VECTOR MACHINE
    aveAccuracy = np.mean(accuracySVM)
    avef1 = np.mean(f1SVM)
    avePrecision = np.mean(precisionSVM)
    aveRecall = np.mean(recallSVM)
    stdAccuracy = np.std(accuracySVM)
    stdf1 = np.std(f1SVM)
    stdPrecision = np.std(precisionSVM)
    stdRecall = np.std(recallSVM)
    print("Final results SUPPORT VECTOR MACHINE----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)
    
    #RESULTS OF DECISION TREE
    aveAccuracy = np.mean(accuracyDT)
    avef1 = np.mean(f1DT)
    avePrecision = np.mean(precisionDT)
    aveRecall = np.mean(recallDT)
    stdAccuracy = np.std(accuracyDT)
    stdf1 = np.std(f1DT)
    stdPrecision = np.std(precisionDT)
    stdRecall = np.std(recallDT)
    print("Final results DECISION TREE----------------------------------")
    print("Average Accuracy:", aveAccuracy)
    print("Average F1:", avef1)
    print("Average Precision:", avePrecision)
    print("Average recall", aveRecall)
    print("STD Accuracy:", stdAccuracy)
    print("STD F1:", stdf1)
    print("STD Precision:", stdPrecision)
    print("STD Recall:", stdRecall)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1685.17it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1793.70it/s]


[1 1 1 ... 0 0 0]




1600
1600
['when', '17', 'year', 'old', 'beau', 'hall', 'jonathan', 'tucker', 'the', 'virgin', 'suicides', 'gets', 'in', 'a', 'car', 'accident', 'after', 'drinking', 'with', 'his', '30', 'year', 'old', 'companion', 'club', 'owner', 'darby', 'reese', 'josh', 'lucas', 'session', '9', 'his', 'mom', 'margaret', 'tilda', 'swinton', 'the', 'war', 'zone', 'pays', 'reese', 'a', 'visit', 'to', 'convince', 'him', 'to', 'leave', 'her', 'son', 'alone', 'instead', 'reese', 'travels', 'out', 'to', 'their', 'lake', 'tahoe', 'home', 'and', 'lures', 'beau', 'out', 'to', 'the', 'boat', 'house', 'where', 'beau', 'learns', 'that', 'his', 'mother', 'was', 'right', 'about', 'reese', 'the', 'next', 'morning', 'margaret', 'discovers', 'darby', 's', 'dead', 'body', 'on', 'their', 'beach', 'and', 'suspecting', 'the', 'worst', 'weights', 'it', 'down', 'in', 'a', 'distant', 'part', 'of', 'the', 'lake', 'but', 'her', 'troubles', 'are', 'just', 'beginning', 'as', 'she', 'soon', 'learns', 'with', 'the', 'arrival', '

['in', 'essence', 'good', 'will', 'hunting', 'is', 'an', 'ordinary', 'story', 'told', 'well', 'taken', 'as', 'a', 'whole', 'there', 's', 'little', 'that', 's', 'special', 'about', 'this', 'tale', 'it', 'follows', 'a', 'traditional', 'narrative', 'path', 'leaves', 'the', 'audience', 'with', 'a', 'warm', 'fuzzy', 'feeling', 'and', 'never', 'really', 'challenges', 'or', 'surprises', 'us', 'but', 'it', 's', 'intelligently', 'written', 'with', 'dialogue', 'that', 'is', 'occasionally', 'brilliant', 'strongly', 'directed', 'and', 'nicely', 'acted', 'so', 'while', 'good', 'will', 'hunting', 'is', 'far', 'from', 'a', 'late', 'year', 'masterpiece', 'it', 's', 'a', 'worthwhile', 'sample', 'of', 'entertainment', 'like', 'scent', 'of', 'a', 'woman', 'which', 'was', 'released', 'around', 'this', 'time', 'of', 'the', 'season', 'five', 'years', 'ago', 'good', 'will', 'hunting', 'is', 'about', 'the', 'unlikely', 'friendship', 'that', 'develops', 'between', 'a', 'world', 'weary', 'veteran', 'and', 'a', 

AttributeError: 'list' object has no attribute 'lower'