In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline

In [2]:
# load data from a file and append it to the rawData


def loadData(path, Text=None):
    with open(path, errors="ignore") as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
        #    print(line)
        #    break
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
        


def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
        # training data without label array created for cross validation function. Specifically for label prediction of validation fold
        trainData_noLab.append((toFeatureVector(preProcess(Text))))
        # training data label array created for producing accuracy metrics, comparing predicted validation dataset fold labels against the actual labels
        trainLabel.append(Label)
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))
        # test data without label array created for predicting test dataset label
        testData_noLab.append((toFeatureVector(preProcess(Text))))
        # test data label array created for evaluating accuracy metrics
        testLabel.append(Label)

# Question 1

In [3]:
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    # the following line of code extracts the ID, review text and label string for each line
    return (reviewLine[0], reviewLine[8], reviewLine[1])

In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
import re

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    # the following code goes through each line of text, separating the words from other characters and tokenising them.
    # the tokens from each line of text are then appended to an array "tokens" 
    text = re.sub(r"(\w)([<>.,;:!?Ã©\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!<>?Ã©\"“\(])(\w)", r"\1 \2", text)
    #print("tokenising:", text)
    # no other spelling normalization done for now
    tokens = re.split(r"\s+",text)
    tokens = [t.lower() for t in tokens]
    return tokens
        
    


# original solution with for loop. Can be ignored:    
    
# def preProcess(text):
#     # Should return a list of tokens
#     # the following code goes through each line of text, separating the words from other characters and tokenising them.
#     # the tokens from each line of text are then appended to an array "Tokens" 
#     for line in text:
#         line = re.sub(r"(\w)([<>.,;:!?Ã©'\"”\)])", r"\1 \2", line)
#         line = re.sub(r"([.,;:!<>?Ã©'\"“\(])(\w)", r"\1 \2", line)
#         #print("tokenising:", text)
#         # no other spelling normalization done for now
#         tokens = re.split(r"\s+",line)
#         tokens = [t.lower() for t in tokens]
#         return tokens

# Question 2

In [5]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    
    # Dictionary created to add new tokens as keys with weights as the value
    v = {}
    
    # for loop adds new token to featureDict whilst increasing the index by one, or using the index of existing token if
    # said token already exists in the dictionary
    for token in tokens:
        try:
            i = featureDict[token]
        except KeyError:
            i = len(featureDict) + 1
            featureDict[token] = i
   # for each line of text, a new token is added with it's weightage in the line as the associated value, or this is increased
   # by it's additional relative weightage if the token already exists within that line of text
        try:
            v[token] = 1
        except KeyError:
            v[token] = 1
    return v



# original nested for loop solution. Can be ignored:

# def toFeatureVector(tokens):
#     # Should return a dictionary containing features as keys, and weights as values
#     # DESCRIBE YOUR METHOD IN WORDS
#     v = {}
#     for line in tokens:
#         for token in line:
#             try:
#                 i = featureDict[token]
#             except KeyError:
#                 i = len(featureDict) + 1
#                 featureDict[token] = i
#             try:
#                 v[token] += (1.0/len(line))
#             except KeyError:
#                 v[token] = (1.0/len(line))
#     return v

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
if 1 == 1 and 2 == 2 and \
        3 == 3:
    print("hello")

hello


# Question 3

In [23]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# the datashuffle function shuffles the indices for the training data, and all associated training datasets are shuffled
# according to these indices to maintain the relational positions of the indices

def datashuffle(trainData, trainData_noLab, trainLabel):
    index_shuf = list(range(len(trainData)))
    shuffle(index_shuf)
    for i in index_shuf:
        trainData_shuf.append(trainData[i])
        trainData_noLab_shuf.append(trainData_noLab[i])
        trainLabel_shuf.append(trainLabel[i])
                               
    
def crossValidate(trainData_shuf, trainData_noLab_shuf, trainLabel_shuf, folds):
    cv_results = []
    foldSize = int(len(trainData_shuf)/folds)
    
    # for loop iterates through the folds of data in the training data based on foldsize and performs important calculations
    for i in range(0,(len(trainData_shuf)-foldSize),foldSize):
        
        # trainClassifier function is invoked to train the data on the out of fold data from the training dataset
        classifier = trainClassifier(trainData_shuf[0:i]+trainData_shuf[(i+foldSize):len(trainData_shuf)])
        
        # validation dataset for the respective fold is created
        trainData_val = trainData_noLab_shuf[i:(i+foldSize)]
        
        # label is predicted for the respective validation data fold
        label_pred = predictLabels(trainData_val, classifier)
        
        # accuracy metrics are evaluated using the predicted labels against the actual labels for the validation fold
        acc_score = accuracy_score(trainLabel_shuf[i:(i+foldSize)], label_pred)
        prec_score = precision_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        rec_score = recall_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        f_score = f1_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average='weighted')
        
        # all accuracy metrics are appended to the cv_results array
        cv_results.append(acc_score)
        cv_results.append(prec_score)
        cv_results.append(rec_score)
        cv_results.append(f_score)
        
    print ("accuracy score =", (cv_results[0]+cv_results[4]+cv_results[8]+cv_results[12]+cv_results[16]+cv_results[20]+cv_results[24]+cv_results[28]+cv_results[32])/9, \
           ", precision score =", (cv_results[1]+cv_results[5]+cv_results[9]+cv_results[13]+cv_results[17]+cv_results[21]+cv_results[25]+cv_results[29]+cv_results[33])/9, \
           ", recall score =", (cv_results[2]+cv_results[6]+cv_results[10]+cv_results[14]+cv_results[18]+cv_results[22]+cv_results[26]+cv_results[30]+cv_results[34])/9, \
            ", F_score =", (cv_results[3]+cv_results[7]+cv_results[11]+cv_results[15]+cv_results[19]+cv_results[23]+cv_results[27]+cv_results[31]+cv_results[35])/9)                   
                               

# further commented code can be ignored:        
        
# def crossValidate(dataset, folds):
#     cv_results = []
#     foldSize = int(len(dataset)/folds)
#     # DESCRIBE YOUR METHOD IN WORDS
#     for i in range(0,(len(dataset)-foldSize),foldSize):
#         #not sure how to implement cross-validation here
#         classifier = trainClassifier(dataset[0:i]+dataset[(i+foldSize):len(dataset)])
#         dataset_val = train_Data_noLab[i:(i+foldSize)]
#         label_pred = predictLabels(dataset_val, classifier)
#         acc_score = accuracy_score(trainLabel[i:(i+foldSize)], label_pred)
#         prec_score = precision_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         rec_score = recall_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         f_score = f1_score(trainLabel[i:(i+foldSize)], label_pred, average=None)
#         cv_results.append(acc_score)
#         cv_results.append(prec_score)
#         cv_results.append(rec_score)
#         cv_results.append(f_score)
        
        #continue # Replace by code that trains and tests on the 10 folds of data in the dataset
#    return cv_results

# def crossValidate(dataset, folds):
#     dataset_shuf = []
#     train_Data_noLab_shuf = []
#     trainLabel_shuf = []
#     index_shuf = list(range(len(dataset)))
#     shuffle(index_shuf)
    
#     for i in index_shuf:
#         dataset_shuf.append(dataset[i])
#         train_Data_noLab_shuf.append(train_Data_noLab[i])
#         trainLabel_shuf.append(trainLabel[i])
#         foldSize = int(len(dataset)/folds)
#         cv_results = []
#         # DESCRIBE YOUR METHOD IN WORDS
#         for i in range(0,(len(dataset)-foldSize),foldSize):
#         #not sure how to implement cross-validation here
#             classifier = trainClassifier(dataset_shuf[0:i]+dataset_shuf[(i+foldSize):len(dataset_shuf)])
#             dataset_val = train_Data_noLab_shuf[i:(i+foldSize)]
#             label_pred = predictLabels(dataset_val, classifier)
#             acc_score = accuracy_score(trainLabel_shuf[i:(i+foldSize)], label_pred)
#             prec_score = precision_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             rec_score = recall_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             f_score = f1_score(trainLabel_shuf[i:(i+foldSize)], label_pred, average=None)
#             cv_results.append(acc_score)
#             cv_results.append(prec_score)
#             cv_results.append(rec_score)
#             cv_results.append(f_score)
        
#         #continue # Replace by code that trains and tests on the 10 folds of data in the dataset
#         return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(reviewSamples)
    

def predictLabel(reviewSample, classifier):
    return classifier.classify(reviewSample)
    

In [24]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []# the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)
trainLabel = []
testLabel = []

# arrays creadted as inputs to the dataShuffle and crossValidate functions

trainData_noLab =[]
testData_noLab =[]

trainData_shuf = []
trainData_noLab_shuf = []
trainLabel_shuf = []

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

# Shuffle all training datasets before cross validating
datashuffle(trainData, trainData_noLab, trainLabel)

# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results
crossValidate(trainData_shuf, trainData_noLab_shuf, trainLabel_shuf, 10)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
43352
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
accuracy score = 0.6117724867724869 , precision score = 0.6121098362529009 , recall score = 0.6117724867724869 , F_score = 0.611650841180592


In [10]:
# check for trainData
trainData

[({'when': 1,
   'least': 1,
   'you': 1,
   'think': 1,
   'so': 1,
   ',': 1,
   'this': 1,
   'product': 1,
   'will': 1,
   'save': 1,
   'the': 1,
   'day': 1,
   '.': 1,
   'just': 1,
   'keep': 1,
   'it': 1,
   'around': 1,
   'in': 1,
   'case': 1,
   'need': 1,
   'for': 1,
   'something': 1},
  '__label1__'),
 ({'lithium': 1,
   'batteries': 1,
   'are': 1,
   'something': 1,
   'new': 1,
   'introduced': 1,
   'in': 1,
   'the': 1,
   'market': 1,
   'there': 1,
   'average': 1,
   'developing': 1,
   'cost': 1,
   'is': 1,
   'relatively': 1,
   'high': 1,
   'but': 1,
   'stallion': 1,
   "doesn't": 1,
   'compromise': 1,
   'on': 1,
   'quality': 1,
   'and': 1,
   'provides': 1,
   'us': 1,
   'with': 1,
   'best': 1,
   'at': 1,
   'a': 1,
   'low': 1,
   '.<': 1,
   'br': 1,
   '/>': 1,
   'so': 1,
   'many': 1,
   'built': 1,
   'technical': 1,
   'assistants': 1,
   'that': 1,
   'act': 1,
   'like': 1,
   'sensor': 1,
   'their': 1,
   'particular': 1,
   'fort': 1

In [11]:
# check for featureDict
featureDict

{'when': 1,
 'least': 2,
 'you': 3,
 'think': 4,
 'so': 5,
 ',': 6,
 'this': 7,
 'product': 8,
 'will': 9,
 'save': 10,
 'the': 11,
 'day': 12,
 '.': 13,
 'just': 14,
 'keep': 15,
 'it': 16,
 'around': 17,
 'in': 18,
 'case': 19,
 'need': 20,
 'for': 21,
 'something': 22,
 'lithium': 23,
 'batteries': 24,
 'are': 25,
 'new': 26,
 'introduced': 27,
 'market': 28,
 'there': 29,
 'average': 30,
 'developing': 31,
 'cost': 32,
 'is': 33,
 'relatively': 34,
 'high': 35,
 'but': 36,
 'stallion': 37,
 "doesn't": 38,
 'compromise': 39,
 'on': 40,
 'quality': 41,
 'and': 42,
 'provides': 43,
 'us': 44,
 'with': 45,
 'best': 46,
 'at': 47,
 'a': 48,
 'low': 49,
 '.<': 50,
 'br': 51,
 '/>': 52,
 'many': 53,
 'built': 54,
 'technical': 55,
 'assistants': 56,
 'that': 57,
 'act': 58,
 'like': 59,
 'sensor': 60,
 'their': 61,
 'particular': 62,
 'fort': 63,
 'ã©.': 64,
 'battery': 65,
 'keeps': 66,
 'my': 67,
 'phone': 68,
 'charged': 69,
 'up': 70,
 'works': 71,
 'every': 72,
 'voltage': 73,
 'neve

# Evaluate on test set

In [25]:
from sklearn.metrics import precision_recall_fscore_support

# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData_noLab, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

({'this': 1, 'assortment': 1, 'is': 1, 'really': 1, "hershey's": 1, 'at': 1, 'their': 1, 'best': 1, '.': 1, 'the': 1, 'little': 1, 'ones': 1, 'are': 1, 'always': 1, 'excited': 1, 'whenever': 1, 'holidays': 1, 'come': 1, 'because': 1, 'of': 1}, '__label1__')
Training Classifier...
Done training!
Precision: 0.597393
Recall: 0.597381
F Score:0.597369


### As we can see above, the accuracy metrics for the model's predictive capabilities are all ~0.6. This will serve as a baseline for comparison with all further evolutions.