In [1]:
#List of Libraries Imported
import csv                               
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
#LinearSVC (imported above from sklearn.svm) is the support vector machine classifier used in this notebook
from sklearn.pipeline import Pipeline                         #used for training and validating the classifier
from nltk.tokenize import wordpunct_tokenize                  #used for tokenisation
import nltk
nltk.download('punkt')
import numpy as np
from sklearn.metrics import accuracy_score                    #used to find accuracy
from sklearn.metrics import precision_recall_fscore_support   #used to find precision, recall, fscore

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# load data from the file and append to the rawData
def loadData(path, Text=None):
    """Read the tab-separated review file at `path` and append its reviews to `rawData`.

    Every data row is converted by `parseReview` into an (Id, Text, Label)
    triple which is appended to the module-level `rawData` list. The header
    row (first field equal to "DOC_ID") and completely empty rows are skipped.

    Args:
        path: location of the tab-delimited reviews file.
        Text: unused; kept only so that existing calls keep working.

    Returns:
        None. The result is the side effect on `rawData`.
    """
    # newline='' is what the csv module asks for so that it can do its own
    # line-ending handling (matters for quoted fields containing newlines).
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            # csv.reader yields [] for a blank line; indexing it would raise IndexError
            if not line or line[0] == "DOC_ID":
                continue
            (reviewId, reviewText, reviewLabel) = parseReview(line)
            rawData.append((reviewId, reviewText, reviewLabel))
           
def splitData(percentage):
    """Fill `trainData` and `testData` from `rawData`.

    `rawData` is treated as two halves. The first `percentage` share of each
    half goes to `trainData` and the remainder of each half goes to
    `testData`. Every review text is tokenised with `preProcess` and turned
    into a feature dict with `toFeatureVector`; the stored items are
    (featureDict, Label) pairs.

    Args:
        percentage: fraction of the data to use for training (e.g. 0.8).

    Returns:
        None. Results are appended to the module-level lists.
    """
    dataSamples = len(rawData)
    midpoint = int(len(rawData)/2)                  # index where the second half starts
    perHalf = int((percentage*dataSamples)/2)       # training samples taken from each half

    def asFeaturePairs(rows):
        # lazily convert (Id, Text, Label) rows into (features, Label) pairs
        return ((toFeatureVector(preProcess(body)), tag) for (_, body, tag) in rows)

    # training rows are vectorised first, then test rows (same order as feature ids are assigned)
    trainData.extend(asFeaturePairs(rawData[:perHalf] + rawData[midpoint:midpoint+perHalf]))
    testData.extend(asFeaturePairs(rawData[perHalf:midpoint] + rawData[midpoint+perHalf:]))

# Question 1

In [3]:
# Convert one line of the input file into an (id, text, label) triple
def parseReview(reviewLine):
    """Convert one row of the input file into an (Id, Text, Label) triple.

    Args:
        reviewLine: sequence of column values for one review. Column 0 is
            returned as the id, column 1 is compared against '__label1__'
            to pick the label, and column 8 is returned as the review text.

    Returns:
        Tuple (Id, Text, Label) where Label is 'fake review' when column 1 is
        '__label1__' and 'real review' otherwise.
    """
    # Previously the else-branch was a bare string expression that was never
    # assigned, so every non-'__label1__' row was labelled with ''.
    if reviewLine[1] == '__label1__':
        label = 'fake review'
    else:
        label = 'real review'
    return (reviewLine[0], reviewLine[8], label) #(Id, Text, Label)

In [4]:
#Text preprocessing and feature vectorization
#a string of review text which is the input
def preProcess(text):
    """Tokenise a review.

    Args:
        text: the review text to split.

    Returns:
        Whatever NLTK's `wordpunct_tokenize` returns for `text`
        (the list of tokens used by `toFeatureVector`).
    """
    tokens = wordpunct_tokenize(text)
    return tokens

# Question 2

In [5]:
featureDict = {} # A global dict of features: token -> integer feature id (ids start at 1)
# toFeatureVector turns a preprocessed review (a list of tokens) into a dict
# whose keys are feature ids and whose values are the weight of that token
# in the review (occurrences / number of tokens).
def toFeatureVector(tokens):
    """Build the weighted bag-of-words dict for one tokenised review.

    Tokens not seen before are registered in the module-level `featureDict`
    with the next free id (`len(featureDict) + 1`), so calling this function
    has the side effect of growing `featureDict`.

    Args:
        tokens: sequence of tokens for one review.

    Returns:
        dict mapping feature id -> accumulated weight, where each occurrence
        of a token contributes `1.0 / len(tokens)`. Empty input gives `{}`.
    """
    weights = {}
    for token in tokens:
        # register unseen tokens under the next free id
        if token not in featureDict:
            featureDict[token] = len(featureDict) + 1
        featureId = featureDict[token]
        share = 1.0/len(tokens)
        # accumulate the share for repeated tokens
        if featureId in weights:
            weights[featureId] += share
        else:
            weights[featureId] = share
    return weights

In [6]:
# Training and validation of the classifier
def trainClassifier(trainData):
    """Train a linear SVM on `trainData`.

    A `LinearSVC` is placed in a single-step `Pipeline` (step name 'svc'),
    wrapped in NLTK's `SklearnClassifier` and trained.

    Args:
        trainData: the labelled training items handed to
            `SklearnClassifier.train` (in this notebook: (featureDict, Label) pairs).

    Returns:
        The value returned by `SklearnClassifier(...).train(trainData)`.
    """
    print("Training Classifier...")
    svmSteps = [('svc', LinearSVC())]
    wrapped = SklearnClassifier(Pipeline(svmSteps))
    return wrapped.train(trainData)

# Question 3

In [7]:
# QUESTION 3
#cross validation is done to predict accuracy, precision, recall, fscore
#train a classifier to do a 10-fold cross validation on the training data
def crossValidate(dataset, folds):
    """Run k-fold cross validation and return the mean scores.

    The dataset is shuffled in place (the caller's list is reordered), then
    cut into exactly `folds` contiguous folds. For each fold a classifier is
    trained with `trainClassifier` on the other folds and evaluated on the
    held-out fold with `predictLabels`.

    Fold boundaries are computed as `k * n // folds`, so when `len(dataset)`
    is not divisible by `folds` the fold sizes differ by at most one sample
    and no extra, undersized fold is produced. When the length is divisible
    the folds are the same as with a fixed fold size.

    Args:
        dataset: list of (features, label) pairs; it is shuffled in place.
        folds: number of folds; must satisfy 2 <= folds <= len(dataset).

    Returns:
        numpy array with the mean over all folds of
        (accuracy, weighted precision, weighted recall, weighted F1).

    Raises:
        ValueError: if `folds` is smaller than 2 or larger than the dataset.
    """
    folds = int(folds)
    sampleCount = len(dataset)
    if folds < 2 or folds > sampleCount:
        raise ValueError("folds must be between 2 and len(dataset) (%d), got %d"
                         % (sampleCount, folds))
    # the data arrives ordered by class, so it is shuffled before cutting folds
    shuffle(dataset)
    cv_results = []
    for k in range(folds):
        start = (k * sampleCount) // folds
        stop = ((k + 1) * sampleCount) // folds
        heldOut = dataset[start:stop]
        classifier = trainClassifier(dataset[:start] + dataset[stop:])
        y_true = [sample[1] for sample in heldOut]
        y_pred = predictLabels(heldOut, classifier)
        a = accuracy_score(y_true, y_pred)
        (p, r, f, _) = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        # one (accuracy, precision, recall, f1) row per fold
        cv_results.append((a, p, r, f))
    return np.mean(np.array(cv_results), axis=0)

In [8]:
# Predicting labels of the classifier

def predictLabels(reviewSamples, classifier):
    """Classify a batch of samples.

    Args:
        reviewSamples: iterable of items whose first element is the feature
            dict (in this notebook: (features, label) pairs).
        classifier: object exposing `classify_many`.

    Returns:
        Whatever `classifier.classify_many` returns for the feature dicts.
    """
    featureVectors = (sample[0] for sample in reviewSamples)  # lazy, like the map it replaces
    return classifier.classify_many(featureVectors)

def predictLabel(reviewSample, classifier):
    """Classify a single raw review text.

    The text is tokenised with `preProcess` and vectorised with
    `toFeatureVector` before being handed to `classifier.classify`.
    Note that `toFeatureVector` registers unseen tokens in the global
    `featureDict`, so this call can grow that dict.

    Args:
        reviewSample: the review text.
        classifier: object exposing `classify`.

    Returns:
        Whatever `classifier.classify` returns for the feature dict.
    """
    tokens = preProcess(reviewSample)
    features = toFeatureVector(tokens)
    return classifier.classify(features)

In [9]:
# MAIN

# loading reviews from the file
# initialize global lists that will be appended to by the methods below
# path of the tab-separated reviews file
reviewPath = 'amazon_reviews.txt'

# the output classes
fakeLabel = 'fake review'
realLabel = 'real review'

# global lists that loadData / splitData append to
rawData = []          # (Id, Text, Label) triples read from the file (21000 samples)
trainData = []        # pre-processed training pairs (80% or 16800 samples)
testData = []         # pre-processed test pairs (20% or 4200 samples)

# Step 1: parse the dataset into rawData; list sizes are printed before loading
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing the dataset...")
loadData(reviewPath)

# Step 2: split rawData into training and test data (80/20);
# list sizes are printed before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing training and test data...")
splitData(0.8)

# list sizes, number of training samples and number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Training Samples: ")
print(len(trainData))
print("Features: ")
print(len(featureDict))

# QUESTION 3 - 10-fold cross validation on the training data (the 80%)
print("Mean of cross-validations (Accuracy, Precision, Recall, F1 score): ",
      crossValidate(trainData, 10))


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
44726
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Mean of cross-validations (Accuracy, Precision, Recall, F1 score):  [0.65404762 0.65507682 0.65404762 0.65384116]


# Evaluate on test set

In [10]:
# testing on the test set
# Final evaluation: train on all of trainData, score on the held-out testData.
functions_complete = True   # set to True once you're happy with your methods for cross validation
if functions_complete:
    # have a look at the first test data instance
    print(testData[0])

    # ground-truth labels are the second element of every test pair
    testTrue = list(map(lambda sample: sample[1], testData))

    classifier = trainClassifier(trainData)
    # classify the test data to get predicted labels
    testPred = predictLabels(testData, classifier)

    # accuracy and weighted precision / recall / f-score on the test data
    a = accuracy_score(testTrue, testPred)
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')

    print("Done training!")
    print("accuracy: ", a)
    # only the first three entries (precision, recall, f-score) are printed
    print("Precision: %f\nRecall: %f\nF1 Score:%f" % finalScores[:3])

({286: 0.04, 4488: 0.04, 34: 0.04, 218: 0.04, 22660: 0.04, 40: 0.04, 326: 0.04, 50: 0.04, 65: 0.04, 49: 0.04, 13: 0.08, 68: 0.04, 128: 0.04, 688: 0.04, 26: 0.04, 1611: 0.04, 1285: 0.04, 809: 0.04, 11: 0.04, 1829: 0.04, 1811: 0.04, 153: 0.04, 102: 0.04, 7: 0.04}, 'fake review')
Training Classifier...
Done training!
accuracy:  0.6419047619047619
Precision: 0.641913
Recall: 0.641905
F1 Score:0.641900
