# Question 4

In [2]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline

In [3]:
from collections import Counter
from math import log
#LOG_BASE = 2 # all the way through here we will use log base 2

In [4]:
# load data from a file and append it to the rawData


def loadData(path, Text=None):
    with open(path, errors="ignore") as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
        #    print(line)
        #    break
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
        


def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((bigram(tokenize_text(Text,2)),Label))
        # training data without label array created for cross validation function. Specifically for label prediction of validation fold
        trainData_noLab.append((bigram(tokenize_text(Text,2))))
        # training data label array created for producing accuracy metrics, comparing predicted validation dataset fold labels against the actual labels
        trainLabel.append(Label)
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((bigram(tokenize_text(Text,2)),Label))
        # test data without label array created for predicting test dataset label
        testData_noLab.append((bigram(tokenize_text(Text,2))))
        # test data label array created for evaluating accuracy metrics
        testLabel.append(Label)

In [5]:
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    # the following line of code extracts the ID, review text and label string for each line
    return (reviewLine[0], reviewLine[8], reviewLine[1])

### The following functions will be used to tokenize the text and glue them together to form n-grams of a given order. The purpose of this is that the inclusion of context words could help to improve the resultant model's predictive capabilities of identifying the correct target label (real or fake).

In [6]:
# function for creating n-grams
def glue_tokens(tokens, order):
    """param: order is the order of the language model
        (1 = unigram, 2 = bigram, 3 =trigram etc.)
    """
    return " ".join(tokens)

# function for tokenizing text
def tokenize_text(text, order):
    """Returns a list of tokens with the correct numbers of initial
    and end tags
    :sentence: a string of text
    :param: order is the order of the language model
        (1 = unigram, 2 = bigram, 3 =trigram etc.)
    """
    tokens = text.split()
    tokens = [['<s>'] * (order-1) + tokens + ['</s>']]
    return tokens

### The following function creates bigrams from the tokenized text. The local "order" variable can be changed to create n-grams of differing size if required.

In [8]:
# First get the counts from the training corpus for bigrams without smoothing

def bigram(tokens):
    #bigrams = Counter() # a weight for how many times a given bigram sequence w_i-1,w_i occurs
    #bigram_context = Counter() # a weight for how many times each word is used as a context word w_i-1 (so will include the start symbol)
    bigrams = {}
    bigram_context = {}
    order = 2
    for t in tokens:
        for i in range(order - 1, len(t)):
            context = t[i-order+1:i]
            target = t[i]
            ngram = context + [target]
            try:
                bigrams[glue_tokens(ngram, order)] += (1/len(t))
            except:
                bigrams[glue_tokens(ngram, order)] = (1/len(t))
            try:
                bigram_context[glue_tokens(context, 1)] += (1/len(t))
            except:
                bigram_context[glue_tokens(context, 1)] = (1/len(t))
    return bigrams
    


In [9]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(C=2))])
    return SklearnClassifier(pipeline).train(trainData)

In [15]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# the datashuffle function shuffles the indices for the training data, and all associated training datasets are shuffled
# according to these indices to maintain the relational aspect of the indices

def datashuffle(trainData, trainData_noLab, trainLabel):
    index_shuf = list(range(len(trainData)))
    shuffle(index_shuf)
    for i in index_shuf:
        trainData_shuf.append(trainData[i])
        trainData_noLab_shuf.append(trainData_noLab[i])
        trainLabel_shuf.append(trainLabel[i])
                               
    
def crossValidate(trainData_shuf, trainData_noLab_shuf, trainLabel_shuf, folds):
    cv_results = []
    foldSize = int(len(trainData_shuf)/folds)
    
    # for loop iterates through the folds of data in the training data based on foldsize and performs important calculations
    for i in range(0,(len(trainData_shuf)-foldSize),foldSize):
        
        # trainClassifier function is invoked to train the data on the out of fold data from the training dataset
        classifier = trainClassifier(trainData_shuf[0:i]+trainData_shuf[(i+foldSize):len(trainData_shuf)])
        
        # validation dataset for the respective fold is created
        trainData_val = trainData_noLab_shuf[i:(i+foldSize)]
        
        # label is predicted for the respective validation data fold
        label_pred = predictLabels(trainData_val, classifier)
        
        # accuracy metrics are evaluated using the predicted labels against the actual labels for the validation fold
        acc_score = accuracy_score(trainLabel_shuf[i:(i+foldSize)], label_pred)
        prec_score = precision_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        rec_score = recall_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        f_score = f1_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        
        # all accuracy metrics are appended to the cv_results array
        cv_results.append(acc_score)
        cv_results.append(prec_score)
        cv_results.append(rec_score)
        cv_results.append(f_score)
        
    print ("accuracy score =", (cv_results[0]+cv_results[4]+cv_results[8]+cv_results[12]+cv_results[16]+cv_results[20]+cv_results[24]+cv_results[28]+cv_results[32])/9, \
           ", precision score =", (cv_results[1]+cv_results[5]+cv_results[9]+cv_results[13]+cv_results[17]+cv_results[21]+cv_results[25]+cv_results[29]+cv_results[33])/9, \
           ", recall score =", (cv_results[2]+cv_results[6]+cv_results[10]+cv_results[14]+cv_results[18]+cv_results[22]+cv_results[26]+cv_results[30]+cv_results[34])/9, \
            ", F_score =", (cv_results[3]+cv_results[7]+cv_results[11]+cv_results[15]+cv_results[19]+cv_results[23]+cv_results[27]+cv_results[31]+cv_results[35])/9)                                          
                               
                               

# further commented code can be ignored:        
        
# def crossValidate(dataset, folds):
#     cv_results = []
#     foldSize = int(len(dataset)/folds)
#     # DESCRIBE YOUR METHOD IN WORDS
#     for i in range(0,(len(dataset)-foldSize),foldSize):
#         #not sure how to implement cross-validation here
#         classifier = trainClassifier(dataset[0:i]+dataset[(i+foldSize):len(dataset)])
#         dataset_val = train_Data_noLab[i:(i+foldSize)]
#         label_pred = predictLabels(dataset_val, classifier)
#         acc_score = accuracy_score(trainLabel[i:(i+foldSize)], label_pred)
#         prec_score = precision_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         rec_score = recall_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         f_score = f1_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         cv_results.append(acc_score)
#         cv_results.append(prec_score)
#         cv_results.append(rec_score)
#         cv_results.append(f_score)
        
        #continue # Replace by code that trains and tests on the 10 folds of data in the dataset
#    return cv_results

# def crossValidate(dataset, folds):
#     dataset_shuf = []
#     train_Data_noLab_shuf = []
#     trainLabel_shuf = []
#     index_shuf = list(range(len(dataset)))
#     shuffle(index_shuf)
    
#     for i in index_shuf:
#         dataset_shuf.append(dataset[i])
#         train_Data_noLab_shuf.append(train_Data_noLab[i])
#         trainLabel_shuf.append(trainLabel[i])
#         foldSize = int(len(dataset)/folds)
#         cv_results = []
#         # DESCRIBE YOUR METHOD IN WORDS
#         for i in range(0,(len(dataset)-foldSize),foldSize):
#         #not sure how to implement cross-validation here
#             classifier = trainClassifier(dataset_shuf[0:i]+dataset_shuf[(i+foldSize):len(dataset_shuf)])
#             dataset_val = train_Data_noLab_shuf[i:(i+foldSize)]
#             label_pred = predictLabels(dataset_val, classifier)
#             acc_score = accuracy_score(trainLabel_shuf[i:(i+foldSize)], label_pred)
#             prec_score = precision_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             rec_score = recall_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             f_score = f1_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             cv_results.append(acc_score)
#             cv_results.append(prec_score)
#             cv_results.append(rec_score)
#             cv_results.append(f_score)
        
#         #continue # Replace by code that trains and tests on the 10 folds of data in the dataset
#         return cv_results

In [11]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(reviewSamples)
    #return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(reviewSample)
    #return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [16]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []# the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)
trainLabel = []
testLabel = []

# arrays creadted as inputs to the dataShuffle and crossValidate functions

trainData_noLab =[]
testData_noLab =[]

trainData_shuf = []
trainData_noLab_shuf = []
trainLabel_shuf = []

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData))

# Shuffle all training datasets before cross validating
datashuffle(trainData, trainData_noLab, trainLabel)

# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results
crossValidate(trainData_shuf, trainData_noLab_shuf, trainLabel_shuf, 10)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData Training Samples:  16800
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
accuracy score = 0.6248677248677248 , precision score = 0.6267273519889096 , recall score = 0.6248677248677248 , F_score = 0.6235955993663335


In [13]:
trainData

[({'<s> When': 0.04,
   'When least': 0.04,
   'least you': 0.04,
   'you think': 0.04,
   'think so,': 0.04,
   'so, this': 0.04,
   'this product': 0.04,
   'product will': 0.04,
   'will save': 0.04,
   'save the': 0.04,
   'the day.': 0.04,
   'day. Just': 0.04,
   'Just keep': 0.04,
   'keep it': 0.04,
   'it around': 0.04,
   'around just': 0.04,
   'just in': 0.04,
   'in case': 0.04,
   'case you': 0.04,
   'you need': 0.04,
   'need it': 0.04,
   'it for': 0.04,
   'for something.': 0.04,
   'something. </s>': 0.04},
  '__label1__'),
 ({'<s> Lithium': 0.014084507042253521,
   'Lithium batteries': 0.014084507042253521,
   'batteries are': 0.014084507042253521,
   'are something': 0.014084507042253521,
   'something new': 0.014084507042253521,
   'new introduced': 0.014084507042253521,
   'introduced in': 0.014084507042253521,
   'in the': 0.014084507042253521,
   'the market': 0.014084507042253521,
   'market there': 0.014084507042253521,
   'there average': 0.01408450704225352

# Evaluate on test set

In [14]:
from sklearn.metrics import precision_recall_fscore_support

# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData_noLab, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

({'<s> This': 0.043478260869565216, 'This assortment': 0.043478260869565216, 'assortment is': 0.043478260869565216, 'is really': 0.043478260869565216, "really Hershey's": 0.043478260869565216, "Hershey's at": 0.043478260869565216, 'at their': 0.043478260869565216, 'their best.': 0.043478260869565216, 'best. The': 0.043478260869565216, 'The little': 0.043478260869565216, 'little ones': 0.043478260869565216, 'ones are': 0.043478260869565216, 'are always': 0.043478260869565216, 'always excited': 0.043478260869565216, 'excited whenever': 0.043478260869565216, 'whenever the': 0.043478260869565216, 'the holidays': 0.043478260869565216, 'holidays come': 0.043478260869565216, 'come because': 0.043478260869565216, 'because of': 0.043478260869565216, 'of this.': 0.043478260869565216, 'this. </s>': 0.043478260869565216}, '__label1__')
Training Classifier...
Done training!
Precision: 0.617793
Recall: 0.617143
F Score:0.616614


### As we can see above, in this case using bigrams instead of unigrams has caused a slight reduction in accuracy metrics. The bigram function has also been run with an order of 3, with the accuracy metrics of the trigram modelled featureset reducing slightly further.