In [1]:
#list of libraries imported
import csv                              
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
#machine learning library for support vector machine
from sklearn.pipeline import Pipeline                           #used for training and validating the classifier
from sklearn.metrics import accuracy_score                      #used to find accuracy
from sklearn.metrics import precision_recall_fscore_support     #used to find precision, recall, fscore
import nltk
import numpy as np                                              
import string
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords                               #used for preprocessing
from nltk.stem import WordNetLemmatizer                         #used for preprocessing

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):  
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)
        for line in reader:
          #used product_ID, product_title, review_title as extra features
            (Id, product_ID, product_title, review_title, Text, Label) = parseReview(line)
            rawData.append((Id, product_ID, product_title, review_title, Text, Label)) #rawData.append((Id, preProcess(Text), Label))
            
        
def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    #to find half of data
    halfOfData = int(len(rawData)/2)
    #to find training samples
    trainingSamples = int((percentage*dataSamples)/2)
    #splitting of trainData
    for (_, product_ID, product_title, review_title, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(product_ID, product_title, review_title, preProcess(Text)),Label))
    #splitting of testData    
    for (_, product_ID, product_title, review_title, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(product_ID, product_title, review_title, preProcess(Text)),Label))

# Question 1

In [3]:
# Convert line from i/p file into an id/text/label
def parseReview(reviewLine):
    a='' #test case contains a single string a
    if reviewLine[1]=='__label1__':#assigning a label to reviewline
        a = 'fake review' #characters of the string
    else: 
        'real review'     #characters of the string
    return (reviewLine[0], reviewLine[2], reviewLine[3],reviewLine[4], reviewLine[8], a)

In [4]:
#Text preprocessing and feature vectorization
#improving preprocessing by using lemmatization , bigrams, stopwords  
#string of review text with is the input
table = str.maketrans({key: None for key in string.punctuation})
def preProcess(text):
    # Should return a list of tokens using lemmatizer
    lemmatizer = WordNetLemmatizer()
    #returns an actual word of the language, it is used where it is necessary to get valid words
    filtered_tokens=[]
    lemmatized_tokens = []
    #setting stopwords
    stop_words = set(stopwords.words('english'))
    #filter out useless data
    text = text.translate(table)
    for w in text.split(" "):
        if w not in stop_words:
            lemmatized_tokens.append(lemmatizer.lemmatize(w.lower()))
            #generate such word pairs from the existing sentence maintain their current sequences
            #filter tokens using bigrams
        filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens
    return filtered_tokens

# Question 2

In [5]:
featureDict = {} # A global dict of features
#preprocessed review should return a dictionary that has as its keys the tokens, and as values the weight of those tokens in the preprocessed reviews
#used to validate for loop, for every token
#validate if it is present already in featuredict or dict
#using product_ID, product_title, review_title as extra features from data set

def toFeatureVector(product_ID, product_title, review_title, tokens):
    Dict = {}
    
#for product_ID

    for PI in product_ID:
        try:
          i= featureDict[PI]
        except KeyError:
          i = len(featureDict) + 1
          featureDict[PI] = i
        try:  
           Dict[i] += (1.0/len(product_ID))
        except KeyError:
           Dict[i] = (1.0/len(product_ID))  
   
#for product_title

    for PT in product_title:
        try:
          i= featureDict[PT]
        except KeyError:
          i = len(featureDict) + 1
          featureDict[PT] = i
        try:  
           Dict[i] += (1.0/len(product_title))
        except KeyError:
           Dict[i] = (1.0/len(product_title))  
    
#for review_title

    
    for RT in review_title:
        try:
          i= featureDict[RT]
        except KeyError:
          i = len(featureDict) + 1
          featureDict[RT] = i
        try:  
           Dict[i] += (1.0/len(review_title))
        except KeyError:
           Dict[i] = (1.0/len(review_title))  
                 
#for Text        

    for token in tokens:
        try:
          i= featureDict[token]
        except KeyError:
          i = len(featureDict) + 1
          featureDict[token] = i
        try:  
           Dict[i] += (1.0/len(tokens))
        except KeyError:
           Dict[i] = (1.0/len(tokens))  
    
    return Dict

In [6]:
# Training and validation of the classifier
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

Question 3




In [7]:
# QUESTION 3
#cross validation is done to predict accuracy, precision, recall, fscore
#train a classifier to do a 10-fold cross validation on the training data
def crossValidate(dataset, folds):
    #data set which is sorted base on their class
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0,len(dataset),foldSize):
        classifier = trainClassifier(dataset[:i]+dataset[foldSize+i:])
        pred = predictLabels(dataset[i:i+foldSize],classifier)
        #formulae to predict accuracy
        a = accuracy_score(list(map(lambda d : d[1], dataset[i:i+foldSize])), pred)
        #formulae to predict precision, recall, fscore
        (p,r,f,_) = precision_recall_fscore_support(list(map(lambda d : d[1], dataset[i:i+foldSize])), pred, average ='weighted')
        #print(a,p,r,f)
        #stores the precision, recall, f1 score, and accuracy of the classifier in a variable cv_results
        cv_results.append((a,p,r,f))
    cv_results = (np.mean(np.array(cv_results),axis=0))
    return cv_results

In [8]:
# Predicting labels of the classifier

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [9]:
# MAIN

# loading reviews from the file
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (21000 samples)
trainData = []        # the training data as a percentage of the total dataset (80% or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (20% or 4200 samples)

# the o/p class
fakeLabel = 'fake review'
realLabel = 'real review'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features in dataset after split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results(a,p,r,f)
#10-fold cross validation on the training data
print("Mean of cross-validations (Accuracy, Precision, Recall, F1 score): ", crossValidate(trainData, 10))
#using metadata features from data set helps to increase a,p,r,f 

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
512235
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Mean of cross-validations (Accuracy, Precision, Recall, F1 score):  [0.78607143 0.78805923 0.78607143 0.78577225]


# Evaluate on test set

In [10]:
# testing on the test set
functions_complete = True  # set to True once you're happy with your methods for cross validation
if functions_complete:
    print(testData[0])                                # have a look at the first test data instance
    classifier = trainClassifier(trainData) 
    testTrue = [t[1] for t in testData]               # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)    # classify the test data to get predicted labels
    a = accuracy_score(testTrue, testPred)            #evaluation of accuracy in test data
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluation of p,r,f in test data
    print("Done training!")
    print("accuracy: ", a)                                            #print accuracy after evaluating on test set
    print("Precision: %f\nRecall: %f\nF1 Score:%f" % finalScores[:3]) #print p,r,f after evaluating on test set

({700: 1.0, 2: 1.0, 4106: 0.14285714285714285, 31: 0.2857142857142857, 170: 0.14285714285714285, 168: 0.14285714285714285, 32: 0.14285714285714285, 117: 0.14285714285714285, 438297: 0.04, 438298: 0.04, 438299: 0.04, 438300: 0.04, 52406: 0.04, 49122: 0.04, 20715: 0.04, 200915: 0.04, 438301: 0.04, 177504: 0.04, 438302: 0.04, 438303: 0.04, 494: 0.04, 15973: 0.04, 370: 0.04, 220854: 0.04, 92: 0.04, 105: 0.04, 211: 0.04, 206: 0.04, 3885: 0.04, 2923: 0.04, 1694: 0.04, 4644: 0.04, 4572: 0.04}, 'fake review')
Training Classifier...
Done training!
accuracy:  0.8028571428571428
Precision: 0.806599
Recall: 0.802857
F1 Score:0.802254
