# Paraphrase detection using Machine Learning Techniques

#### NOTE: This is the same as the python files in /modules/plagiarism, but due to the absence of efficient Python PDF parsers, the web mining part was not tested fully, and hence, the project as such is non-functional from the UI. However, we have demonstrated 77% accuracy using Random Forest Classifier.

### Burra Abhishek (Registration Number: 19BCE1187) 
### Siddhant Roy (Registration Number: 19BCE1181)

In [1]:
# Text pre-processing stage
# module modules.plagiarism.preprocessing
  
from modules.plagiarism.preprocessing import vectorizedocument
from modules.plagiarism.preprocessing import parasplitter
from modules.plagiarism.preprocessing import removepunctuations
from modules.plagiarism.preprocessing import removebullets
from modules.plagiarism.preprocessing import tokenizer
import re
import string
from unidecode import unidecode


def preprocess(text):
    """ Preprocess a given text """
    
    s = vectorizedocument.text_to_list(text)
    t = []
    for i in s:
        t.append(parasplitter.sbd(i))
    t = removepunctuations.removesb(t)
    t = removebullets.removebullets(t)
    for i in t:
        s.append(tokenizer.removeWhiteSpace(i))
    return s


def preprocessURL(text):
    """ Preprocess a given text from a URL """
    
    s = vectorizedocument.text_to_list(text)
    t = []
    for i in s:
        t.append(parasplitter.sbd(i))
    t = removepunctuations.removesb(t)
    t = removebullets.removebullets(t)
    for i in t:
        s.append(tokenizer.removeWhiteSpace(i))
    return s


def preprocess_train(text):
    """ Preprocess a given text """
    
    s = vectorizedocument.text_to_list_all(text)
    t = []
    for i in s:
        t.append(parasplitter.sbd(i))
    t = removepunctuations.removesb(t)
    t = removebullets.removebullets(t)
    s = []
    for i in t:
        s.append(tokenizer.removeWhiteSpace(i))
    return s

In [2]:
# Training the model

import pandas as pd
from modules.plagiarism.preprocessing import preprocess
from modules.plagiarism.comparisons import ds_splitter
from modules.plagiarism.comparisons.classifiers import logistic, randomforest
from sklearn import metrics


def get_train_df(location):
    train_file = open(location, mode='r')
    unprocessed_train_data = list([example.split("\t") for example in train_file.readlines()])[1:]
    for i in unprocessed_train_data:
        i.pop(0)
        i[2] = int(i[2].replace("\n", ""))
    return unprocessed_train_data


def process_dataset(dataset):
    for i in dataset:
        i[0] = preprocess.preprocess_train(i[0])[0]
        i[1] = preprocess.preprocess_train(i[1])[0]
    return dataset

# PAWS QQP dataset was used to train the models.
y = process_dataset(get_train_df('modules/plagiarism/comparisons/datasets/final/train.tsv'))
y1 = process_dataset(get_train_df('modules/plagiarism/comparisons/datasets/final/test.tsv'))

In [3]:
pd.DataFrame(y)

Unnamed: 0,0,1,2
0,in paris in october 1560 he secretly met the e...,in october 1560 he secretly met with the engli...,0
1,the nba season of 1975 76 was the 30th season ...,the 1975 76 season of the national basketball ...,1
2,there are also specific discussions public pro...,there are also public discussions profile spec...,0
3,when comparable rates of flow can be maintaine...,the results are high when comparable flow rate...,1
4,it is the seat of zerendi district in akmola r...,it is the seat of the district of zerendi in a...,1
...,...,...,...
49396,our school is of spiritual and spiritual love ...,our school is of the temporal and the spiritua...,0
49397,she was in cork on june 24 and arrived on 8 ju...,she was at cork on 24 june and arrived in the ...,1
49398,cornelia stuyvesant vanderbilt george and edit...,john john f a cecil the only child of george a...,0
49399,the third season was premiered on 7 june 2010 ...,the fourth season was premiered on june 7 2010,0


In [4]:
pd.DataFrame(y1)

Unnamed: 0,0,1,2
0,this was a series of nested angular standards ...,this was a series of nested polar scales so th...,0
1,his father emigrated to missouri in 1868 but r...,his father emigrated to america in 1868 but re...,0
2,in january 2011 the deputy secretary general o...,in january 2011 fiba asia deputy secretary gen...,1
3,steiner argued that in the right circumstances...,steiner held that the spiritual world can be r...,0
4,luciano williames dias born july 25 1970 is a ...,luciano williames dias born 25 july 1970 is a ...,0
...,...,...,...
7995,the company has branches in tokyo based in the...,the company has branches in tokyo based in sai...,1
7996,muara teweh abbreviated mtw is a city located ...,teweh abbreviated mtw is a city located in the...,0
7997,the modern coat of arms of bavaria was designe...,the modern coat of arms of bavaria was designe...,1
7998,former president brenda kuecks received a clea...,in 2013 former president brenda kuecks receive...,0


In [5]:
# Lexical analysis

# Fuzzy similarity

from fuzzywuzzy import fuzz
from fuzzywuzzy import process


def fuzzysimilarity(dataframe):
    """
    Compares two strings in a Pandas DataFrame: dataframe[0], dataframe[1]
    String comparison using Levenshtein distance
    to calculate distance between sequences.
    We need all ratios
    """

    # Fuzz ratio: similarity of entire string
    dataframe['Fuzz Ratio'] = dataframe.apply(lambda x: fuzz.ratio(str(x[0]), str(x[1])), axis=1)

    # Fuzz Token Set Ratio: similarity of each token in the string
    # The word order does not matter, unlike in fuzz ratio
    dataframe['Fuzz Token Set Ratio'] = dataframe.apply(
        lambda x: fuzz.token_set_ratio(str(x[0]), str(x[1])), 
        axis=1
        )

    return dataframe


# N-gram features and Jaccard similarity

from nltk.util import ngrams

def jaccardDistance(x, y, n):
    _w1, _w2, a, b = common_ngrams(x, y, n)
    l = len(set(a).union(set(b)))
    if l == 0:
        return 1
    else:
        return len(set(a).intersection(set(b))) / l


def ngrams_ratio(a, b, n):
    w1, w2, ngrams1, ngrams2 = common_ngrams(a, b, n)
    return len(set(ngrams1).intersection(set(ngrams2))) / (len(w1) + len(w2))


def common_ngrams(a, b, n):
    # Split the sentences into words
    w1 = a.split()
    w2 = b.split()
    # Get the n grams
    ngrams1 = list(ngrams(w1, n))
    ngrams2 = list(ngrams(w2, n))
    return w1, w2, ngrams1, ngrams2


def ngram_features(df):
    df['Common Unigram Ratio'] = df.apply(
        lambda x: ngrams_ratio(str(x[0]), str(x[1]), 1), 
        axis=1
        )
    df['Common Bigram Ratio'] = df.apply(
        lambda x: ngrams_ratio(str(x[0]), str(x[1]), 2), 
        axis=1
        )
    df['Common Trigram Ratio'] = df.apply(
        lambda x: ngrams_ratio(str(x[0]), str(x[1]), 3), 
        axis=1
        )
    df['Unigram Jaccard Distance'] = df.apply(
        lambda x: jaccardDistance(str(x[0]), str(x[1]), 1), 
        axis=1
        )
    df['Bigram Jaccard Distance'] = df.apply(
        lambda x: jaccardDistance(str(x[0]), str(x[1]), 2), 
        axis=1
        )
    df['Trigram Jaccard Distance'] = df.apply(
        lambda x: jaccardDistance(str(x[0]), str(x[1]), 3), 
        axis=1
        )
    return df


# Normalized Longest Common Subsequence

def NLCS(sentence1, sentence2):
    """ Determine the length of the NLCS of two sentences """

    # Get each individual word from each of the sentences.
    word1 = sentence1.split()
    word2 = sentence2.split()

    # Get the number of words from each sentence.
    l1 = len(word1)
    l2 = len(word2)

    # Initialize the nested list to store all the 
    # subsequence similarity values
    a = []
    for i in range(l1 + 1):
        l = []
        for j in range(l2 + 1):
            l.append([])
        a.append(l)

    for i in range(l1 + 1):
        for j in range(l2 + 1):
            # Nothing to compare initially
            if i == 0 or j == 0:
                a[i][j] = 0
            # Matching words
            # Add 1 to the subsequence
            elif word1[i - 1] == word2[j - 1]:
                a[i][j] = a[i - 1][j - 1] + 1
            # Words do not match
            # Get the maximum value of its previous neighbours
            else:
                a[i][j] = max(a[i-1][j], a[i][j-1])
    
    # a[l1][l2] contains the length of the 
    # longest common subsequence of X[0..n-1] & Y[0..m-1] 
    lf = a[l1][l2]/(len((set(word1).union(set(word2)))))
    
    # lf is the length of the Normalized longest common subsequence
    return lf


def apply_nlcs(dataframe):
    dataframe["Normalized Longest Common Subsequence"] = dataframe.apply(lambda x: NLCS(x[0], x[1]), axis=1)
    return dataframe

In [6]:
# Semantic Analysis

# Normalized Word Mover's Distance

from gensim import models

# If this code fails, you need to download the GloVe dataset.
# Open a file explorer, then go to modules/plagiarism/comparisons/datasets/
# Create a new folder "glove6b" and go to the following link to download the dataset:
# https://nlp.stanford.edu/data/wordvecs/glove.6B.zip
# Then use the 50d model. If any other model is used, change the file name accordingly.
# Before you can use the model, open a terminal or its equivalent in the same folder, then run
# python -m gensim.scripts.glove2word2vec --input  glove.6B.50d.txt --output glove.6B.50d.w2vformat.txt
w = models.KeyedVectors.load_word2vec_format(
    'modules/plagiarism/comparisons/datasets/glove6b/glove.6B.50d.w2vformat.txt', binary=False)

# w.init_sims may throw a deprecation warning, use fill_norms instead.
#w.init_sims(replace=True)
w.fill_norms()

def nwmd(dataframe):
    dataframe["Normalized Word Mover's Distance"] = dataframe.apply(
        lambda x: w.wmdistance(x[0], x[1]), 
        axis=1
        )
    return dataframe

In [7]:
# Apply these features to the dataset
def get_features(data_list):
    x = pd.DataFrame(data_list)
    x = (apply_nlcs(fuzzysimilarity(x)))
    x = (ngram_features(x))
    x = (nwmd(x))
    #x = (generateoverlaps.syntactics(x))
    return x

In [8]:
# X - Y split and scaling
import pandas as pd
from sklearn.preprocessing import StandardScaler


def x_y_split(df):
    Y = pd.DataFrame(df[2])
    df.drop(2, inplace=True, axis=1)
    return df, Y


def preprocess_ds(df):
    df.drop(1, inplace=True, axis=1)
    df.drop(0, inplace=True, axis=1)
    return df


def scale_df(df):
    scaler = StandardScaler()
    v = df.values.tolist()
    df_scaled = pd.DataFrame(scaler.fit_transform(v))
    return df_scaled


def dsSplitter(l):
    x = get_features(l)

    # At this stage we have our dataset
    x = preprocess_ds(x)

    # Scaling the dataset
    X, Y = x_y_split(x)
    X = pd.DataFrame(scale_df(X))
    y = Y.values.tolist()
    Y = []
    for i in y:
        Y.append(i[0])
    return X, Y

In [9]:
X, Y = dsSplitter(y)

In [10]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.249096,-0.202392,-0.368471,0.154474,-0.021533,0.108093,-0.268702,-0.291004,-0.145425,-0.315959
1,-1.273308,0.554983,-0.069904,-1.065134,0.013835,-0.045936,1.100659,0.419766,-0.067558,-0.369964
2,-0.391916,0.554983,-0.163206,-0.049709,-1.292382,-1.509848,0.244808,-1.238698,-1.292003,0.663582
3,-1.834194,0.554983,-1.702688,1.244928,-0.831929,-1.080914,0.387450,-0.859620,-0.977619,-0.326633
4,0.088843,0.554983,0.349954,0.801053,-0.046452,-0.523299,1.100659,-0.072306,-0.483586,0.336392
...,...,...,...,...,...,...,...,...,...,...
49396,0.249096,0.554983,1.257854,-1.289390,-0.883005,-0.636047,0.442312,-0.909065,-0.713536,0.604907
49397,-0.552169,0.554983,-0.603058,0.404031,-1.209320,-1.772928,0.489337,-1.238698,-1.501593,-0.148548
49398,-1.353435,0.302525,-1.469433,-0.366659,-0.644987,-1.432472,0.322613,-0.859620,-1.321945,0.164255
49399,-2.635460,0.554983,-2.272866,-2.272667,-2.196686,-1.096134,-3.178594,-1.897963,-1.045963,2.570832


In [11]:
X1, Y1 = dsSplitter(y1)

In [12]:
# The machine learning part

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


def train_lr(Xtrain, Ytrain):
    lrModel = LogisticRegression(max_iter=10000)
    lrModel.fit(Xtrain, Ytrain)
    return lrModel


def rfModel(Xtrain, Ytrain):
    rf = RandomForestClassifier()
    rf.fit(Xtrain, Ytrain)
    return rf

In [13]:
# The actual training part, where the models are trained using the training data

lrModel = train_lr(X, Y)
rfModel = rfModel(X, Y)

In [14]:
Y2 = lrModel.predict(pd.DataFrame(X1))

Y3 = rfModel.predict(pd.DataFrame(X1))

X2 = X1.values.tolist()
for i in range(0, len(y1)):
    # Obvious lazy plagiarism is sometimes missed.
    if y1[i][0] == y1[i][1]:
        Y2[i] = 1
        Y3[i] = 1

print("Logistic Regression Accuracy: ", metrics.accuracy_score(Y1, Y2))
print("Logistic Regression Precision: ", metrics.precision_score(Y1, Y2))
print("Logistic Regression Recall: ", metrics.recall_score(Y1, Y2))
print("Logistic Regression F1 score: ", metrics.f1_score(Y1, Y2))
print(" ")
print("Random Forest Accuracy: ", metrics.accuracy_score(Y1, Y3))
print("Random Forest Precision: ", metrics.precision_score(Y1, Y3))
print("Random Forest Recall: ", metrics.recall_score(Y1, Y3))
print("Random Forest F1 score: ", metrics.f1_score(Y1, Y3))

Logistic Regression Accuracy:  0.7215
Logistic Regression Precision:  0.7235133287764867
Logistic Regression Recall:  0.5986990950226244
Logistic Regression F1 score:  0.6552151036830702
 
Random Forest Accuracy:  0.768625
Random Forest Precision:  0.7858839497794368
Random Forest Recall:  0.6549773755656109
Random Forest F1 score:  0.7144840351689032
