In [15]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline

### In the following cell, the various features from 'amazon_reviews.txt' will be added to the feature vector and appended to the training and test datasets.

In [16]:
# load data from a file and append it to the rawData


def loadData(path, Text=None):
    with open(path, errors="ignore") as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
        #    print(line)
        #    break
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, Rating, Verified_Purchase, Review_Title, Text, Label) = parseReview(line)
            rawData.append((Id, Rating, Verified_Purchase, Review_Title, Text, Label))
            
def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Rating, Verified_Purchase, Review_Title, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(Rating, Verified_Purchase, (preProcess(Review_Title)), (preProcess(Text))), Label))
        # training data without label array created for cross validation function. Specifically for label prediction of validation fold
        trainData_noLab.append((toFeatureVector(Rating, Verified_Purchase, (preProcess(Review_Title)), (preProcess(Text)))))
        # training data label array created for producing accuracy metrics, comparing predicted validation dataset fold labels against the actual labels
        trainLabel.append(Label)
    for (_, Rating, Verified_Purchase, Review_Title, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(Rating, Verified_Purchase, (preProcess(Review_Title)), (preProcess(Text))), Label))
        # test data without label array created for predicting test dataset label
        testData_noLab.append((toFeatureVector(Rating, Verified_Purchase, (preProcess(Review_Title)), (preProcess(Text)))))
        # test data label array created for evaluating accuracy metrics
        testLabel.append(Label)            
        

# can ignore beyond this point:
# def splitData(percentage):
#     # A method to split the data between trainData and testData 
#     dataSamples = len(rawData)
#     halfOfData = int(len(rawData)/2)
#     trainingSamples = int((percentage*dataSamples)/2)
#     for (_, Rating, Verified_Purchase, Review_Title, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
#         trainData.append((toFeatureVector(Rating), toFeatureVector(Verified_Purchase), toFeatureVector(preProcess(Review_Title)), toFeatureVector(preProcess(Text)), Label))
#         # training data without label array created for cross validation function. Specifically for label prediction of validation fold
#         trainData_noLab.append((toFeatureVector(Rating), toFeatureVector(Verified_Purchase), toFeatureVector(preProcess(Review_Title)), toFeatureVector(preProcess(Text))))
#         # training data label array created for producing accuracy metrics, comparing predicted validation dataset fold labels against the actual labels
#         trainLabel.append(Label)
#     for (_, Rating, Verified_Purchase, Review_Title, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
#         testData.append((toFeatureVector(Rating), toFeatureVector(Verified_Purchase), toFeatureVector(preProcess(Review_Title)), toFeatureVector(preProcess(Text)), Label))
#         # test data without label array created for predicting test dataset label
#         testData_noLab.append((toFeatureVector(Rating), toFeatureVector(Verified_Purchase), toFeatureVector(preProcess(Review_Title)), toFeatureVector(preProcess(Text))))
#         # test data label array created for evaluating accuracy metrics
#         testLabel.append(Label)

# Question 5

In [17]:
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    # the following line of code extracts the ID, review text and label string for each line
    return (reviewLine[0], str(reviewLine[2]), reviewLine[3], reviewLine[7], reviewLine[8], reviewLine[1])

### We will filter custom stop words for the different ratings (1,2,3,4,5) so that they are not considered in Review_Title or the Text feature. They will be considered only for the ratings feature.

In [18]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
import re

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    # the following code goes through each line of text, separating the words from other characters and tokenising them.
    # the tokens from each line of text are then appended to an array "tokens" 
    text = re.sub(r"(\w)([<>.,;:!?Ã©\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!<>?Ã©\"“\(])(\w)", r"\1 \2", text)
    #print("tokenising:", text)
    # no other spelling normalization done for now
    tokens = re.split(r"\s+",text)
    tokens = [t.lower() for t in tokens]
    # stop words included for ratings so that they are not considered in other textual features
    stop_words = ['1','2','3','4','5']
    filtered_tokens = [t for t in tokens if not t.lower() in stop_words]
    filtered_tokens = []
    for t in tokens:
        if t not in stop_words:
            filtered_tokens.append(t)
    return filtered_tokens
        
    
# # Input: a string of one review
# def preProcess(text):
#     # Should return a list of tokens
#     # the following code goes through each line of text, separating the words from other characters and tokenising them.
#     # the tokens from each line of text are then appended to an array "tokens" 
#     text = re.sub(r"(\w)([<>.,;:!?Ã©\"”\)])", r"\1 \2", text)
#     text = re.sub(r"([.,;:!<>?Ã©\"“\(])(\w)", r"\1 \2", text)
#     #print("tokenising:", text)
#     # no other spelling normalization done for now
#     tokens = re.split(r"\s+",text)
#     tokens = [t.lower() for t in tokens]
#     return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### A feature dictionary and token featureset is created for all the features to be included in the model.

In [19]:
featureDict = {} # A global dictionary of features

def toFeatureVector(Rating, Verified_Purchase, Review_Title, Text):
    # Should return a dictionary containing features as keys, and weights as values
    
    # Dictionary created to add new tokens as keys with weights as the value
    v = {}
    
    # for loop adds new token to featureDict whilst increasing the index by one, or using the index of existing token if
    # said token already exists in the dictionary
    for token in Rating:
        try:
            i = featureDict[token]
        except KeyError:
            i = len(featureDict) + 1
            featureDict[token] = i
   # for each rating, a new token is added with a value of 1
        try:
            v[token] += 1
        except KeyError:
            v[token] = 1
    for token in Verified_Purchase:
        try:
            i = featureDict[token]
        except KeyError:
            i = len(featureDict) + 1
            featureDict[token] = i
   # for each value of Verified_Purchase (Y/N), a new token is added with a value of 1
        try:
            v[token] += 1
        except KeyError:
            v[token] = 1
    for token in Review_Title:
        try:
            i = featureDict[token]
        except KeyError:
            i = len(featureDict) + 1
            featureDict[token] = i
    # for each line of text, a new token is added with it's weightage in the line as the associated value, or this is increased
   # by it's additional relative weightage if the token already exists within that line of text
        try:
            v[token] += 1/(len(Review_Title))
        except KeyError:
            v[token] = 1/(len(Review_Title))
    for token in Text:
        try:
            i = featureDict[token]
        except KeyError:
            i = len(featureDict) + 1
            featureDict[token] = i
   # for each line of text, a new token is added with it's weightage in the line as the associated value, or this is increased
   # by it's additional relative weightage if the token already exists within that line of text
        try:
            v[token] += 1/(len(Text))
        except KeyError:
            v[token] = 1/(len(Text))
    return v

In [20]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [2]:
if 1 == 1 and 2 == 2 and \
        3 == 3:
    print("hello")

hello


In [25]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# the datashuffle function shuffles the indices for the training data, and all associated training datasets are shuffled
# according to these indices to maintain the relational aspect of the indices

def datashuffle(trainData, trainData_noLab, trainLabel):
    index_shuf = list(range(len(trainData)))
    shuffle(index_shuf)
    for i in index_shuf:
        trainData_shuf.append(trainData[i])
        trainData_noLab_shuf.append(trainData_noLab[i])
        trainLabel_shuf.append(trainLabel[i])
                               
    
def crossValidate(trainData_shuf, trainData_noLab_shuf, trainLabel_shuf, folds):
    cv_results = []
    foldSize = int(len(trainData_shuf)/folds)
    
    # for loop iterates through the folds of data in the training data based on foldsize and performs important calculations
    for i in range(0,(len(trainData_shuf)-foldSize),foldSize):
        
        # trainClassifier function is invoked to train the data on the out of fold data from the training dataset
        classifier = trainClassifier(trainData_shuf[0:i]+trainData_shuf[(i+foldSize):len(trainData_shuf)])
        
        # validation dataset for the respective fold is created
        trainData_val = trainData_noLab_shuf[i:(i+foldSize)]
        
        # label is predicted for the respective validation data fold
        label_pred = predictLabels(trainData_val, classifier)
        
        # accuracy metrics are evaluated using the predicted labels against the actual labels for the validation fold
        acc_score = accuracy_score(trainLabel_shuf[i:(i+foldSize)], label_pred)
        prec_score = precision_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        rec_score = recall_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        f_score = f1_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        
        # all accuracy metrics are appended to the cv_results array
        cv_results.append(acc_score)
        cv_results.append(prec_score)
        cv_results.append(rec_score)
        cv_results.append(f_score)
        
    print ("accuracy score =", (cv_results[0]+cv_results[4]+cv_results[8]+cv_results[12]+cv_results[16]+cv_results[20]+cv_results[24]+cv_results[28]+cv_results[32])/9, \
           ", precision score =", (cv_results[1]+cv_results[5]+cv_results[9]+cv_results[13]+cv_results[17]+cv_results[21]+cv_results[25]+cv_results[29]+cv_results[33])/9, \
           ", recall score =", (cv_results[2]+cv_results[6]+cv_results[10]+cv_results[14]+cv_results[18]+cv_results[22]+cv_results[26]+cv_results[30]+cv_results[34])/9, \
            ", F_score =", (cv_results[3]+cv_results[7]+cv_results[11]+cv_results[15]+cv_results[19]+cv_results[23]+cv_results[27]+cv_results[31]+cv_results[35])/9)                                          
                               
                               

# further commented code can be ignored:        
        
# def crossValidate(dataset, folds):
#     cv_results = []
#     foldSize = int(len(dataset)/folds)
#     # DESCRIBE YOUR METHOD IN WORDS
#     for i in range(0,(len(dataset)-foldSize),foldSize):
#         #not sure how to implement cross-validation here
#         classifier = trainClassifier(dataset[0:i]+dataset[(i+foldSize):len(dataset)])
#         dataset_val = train_Data_noLab[i:(i+foldSize)]
#         label_pred = predictLabels(dataset_val, classifier)
#         acc_score = accuracy_score(trainLabel[i:(i+foldSize)], label_pred)
#         prec_score = precision_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         rec_score = recall_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         f_score = f1_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         cv_results.append(acc_score)
#         cv_results.append(prec_score)
#         cv_results.append(rec_score)
#         cv_results.append(f_score)
        
        #continue # Replace by code that trains and tests on the 10 folds of data in the dataset
#    return cv_results

# def crossValidate(dataset, folds):
#     dataset_shuf = []
#     train_Data_noLab_shuf = []
#     trainLabel_shuf = []
#     index_shuf = list(range(len(dataset)))
#     shuffle(index_shuf)
    
#     for i in index_shuf:
#         dataset_shuf.append(dataset[i])
#         train_Data_noLab_shuf.append(train_Data_noLab[i])
#         trainLabel_shuf.append(trainLabel[i])
#         foldSize = int(len(dataset)/folds)
#         cv_results = []
#         # DESCRIBE YOUR METHOD IN WORDS
#         for i in range(0,(len(dataset)-foldSize),foldSize):
#         #not sure how to implement cross-validation here
#             classifier = trainClassifier(dataset_shuf[0:i]+dataset_shuf[(i+foldSize):len(dataset_shuf)])
#             dataset_val = train_Data_noLab_shuf[i:(i+foldSize)]
#             label_pred = predictLabels(dataset_val, classifier)
#             acc_score = accuracy_score(trainLabel_shuf[i:(i+foldSize)], label_pred)
#             prec_score = precision_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             rec_score = recall_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             f_score = f1_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             cv_results.append(acc_score)
#             cv_results.append(prec_score)
#             cv_results.append(rec_score)
#             cv_results.append(f_score)
        
#         #continue # Replace by code that trains and tests on the 10 folds of data in the dataset
#         return cv_results

In [22]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(reviewSamples)
    #return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(reviewSample)
    #return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [26]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []# the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)
trainLabel = []
testLabel = []

# arrays creadted as inputs to the dataShuffle and crossValidate functions

trainData_noLab =[]
testData_noLab =[]

trainData_shuf = []
trainData_noLab_shuf = []
trainLabel_shuf = []

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

# Shuffle all training datasets before cross validating
datashuffle(trainData, trainData_noLab, trainLabel)

# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results
crossValidate(trainData_shuf, trainData_noLab_shuf, trainLabel_shuf, 10)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
44712
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
accuracy score = 0.7815476190476189 , precision score = 0.7830798345944237 , recall score = 0.7815476190476189 , F_score = 0.7813137644312936


In [10]:
# check for trainData
trainData

[({'4': 1,
   'N': 1,
   'useful': 1.0,
   'when': 0.038461538461538464,
   'least': 0.038461538461538464,
   'you': 0.07692307692307693,
   'think': 0.038461538461538464,
   'so': 0.038461538461538464,
   ',': 0.038461538461538464,
   'this': 0.038461538461538464,
   'product': 0.038461538461538464,
   'will': 0.038461538461538464,
   'save': 0.038461538461538464,
   'the': 0.038461538461538464,
   'day': 0.038461538461538464,
   '.': 0.07692307692307693,
   'just': 0.07692307692307693,
   'keep': 0.038461538461538464,
   'it': 0.07692307692307693,
   'around': 0.038461538461538464,
   'in': 0.038461538461538464,
   'case': 0.038461538461538464,
   'need': 0.038461538461538464,
   'for': 0.038461538461538464,
   'something': 0.038461538461538464},
  '__label1__'),
 ({'4': 1,
   'Y': 1,
   'new': 0.2635135135135135,
   'era': 0.25,
   'for': 0.25,
   'batteries': 0.2635135135135135,
   'lithium': 0.013513513513513514,
   'are': 0.02702702702702703,
   'something': 0.013513513513513514,

# Evaluate on test set

In [24]:
from sklearn.metrics import precision_recall_fscore_support

# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData_noLab, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

({'5': 1, 'N': 1, "hershey's": 0.2934782608695652, 'at': 0.2934782608695652, 'their': 0.2934782608695652, 'best': 0.2934782608695652, 'this': 0.08695652173913043, 'assortment': 0.043478260869565216, 'is': 0.043478260869565216, 'really': 0.043478260869565216, '.': 0.08695652173913043, 'the': 0.08695652173913043, 'little': 0.043478260869565216, 'ones': 0.043478260869565216, 'are': 0.043478260869565216, 'always': 0.043478260869565216, 'excited': 0.043478260869565216, 'whenever': 0.043478260869565216, 'holidays': 0.043478260869565216, 'come': 0.043478260869565216, 'because': 0.043478260869565216, 'of': 0.043478260869565216}, '__label1__')
Training Classifier...
Done training!
Precision: 0.803369
Recall: 0.799762
F Score:0.799165


### As we can see from the above accuracy metrics, including the additional metadata features in the featureset has significantly improved the accuracy scores. These additional features have aided the model's predictive capabilities for identifying the correct target label (real or fake).