<a href="https://colab.research.google.com/github/DJongstra/Information_Retrieval_Assignment_3/blob/main/IR_PlagiarismDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup
- Import all needed libraries
- Google Drive mount


In [None]:
!pip install mmh3
!pip install snapy
import numpy as np
import seaborn as sns
import pandas as pd
import string, re
from snapy import MinHash, LSH
from google.cloud import storage
from google.colab import drive
# Mount Google Drive at /content/drive; the data-loading cell further down reads its CSV from this mount.
drive.mount('/content/drive')

# Plagiarism Detection super class

In [None]:
class PlagiarismDetector:
    """Shared state and article handling for the plagiarism detectors.

    Subclasses must override :meth:`construct_M` to build the structure
    stored in ``self.M`` and may override :meth:`preprocess_article` to
    normalise text before it is stored.

    Args:
        n_gram: N-gram (shingle) size; stored as-is for subclasses to use.
        bands: Number of bands.
        rows: Number of rows per band.
    """

    def __init__(self, n_gram, bands, rows):
        self.n_gram = n_gram
        self.bands = bands
        self.rows = rows
        # One signature entry per (band, row) combination.
        self.signature_length = bands * rows
        self.articles = []  # preprocessed article texts, in insertion order
        self.M = None  # filled in by construct_M()

    def read_articles(self, csv_file):
        """Add every value of the ``'article'`` column of *csv_file*.

        Args:
            csv_file: Table exposing an ``'article'`` column, e.g. the
                DataFrame returned by ``pandas.read_csv``.
        """
        # Iterate the column directly instead of building a Series per row
        # with iterrows() just to read a single field.
        for article in csv_file['article']:
            self.add_article(article)

    def add_article(self, article):
        """Preprocess *article* and append the result to ``self.articles``."""
        self.articles.append(self.preprocess_article(article))

    def preprocess_article(self, article):
        """Return the text to store for *article* (unchanged by default)."""
        return article  # no preprocessing

    def construct_M(self):
        """Build ``self.M`` from ``self.articles``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        # NotImplementedError is the conventional signal for an abstract
        # hook and is still caught by ``except Exception``.
        raise NotImplementedError(
            f"{type(self).__name__} must implement construct_M()"
        )


class PlagiarismDetectorLib(PlagiarismDetector):
    """Detector that delegates MinHash signatures and LSH banding to snapy."""

    def __init__(self, n_gram, bands, rows):
        super().__init__(n_gram, bands, rows)

    def construct_M(self):
        """Build a snapy ``LSH`` model over ``self.articles`` and store it in ``self.M``.

        Each document is labelled with its position in ``self.articles``.
        """
        # Term-based shingles, one signature entry per band/row combination.
        signatures = MinHash(
            self.articles,
            n_gram=self.n_gram,
            n_gram_type='term',
            permutations=self.signature_length,
        )
        labels = range(len(self.articles))
        self.M = LSH(signatures, labels, no_of_bands=self.bands)


class PlagiarismDetectorImpl(PlagiarismDetector):
    """Hand-written detector: normalises article text and builds seeded hash functions.

    ``construct_M`` is currently a stub that does nothing.
    """

    def __init__(self, n_gram, bands, rows):
        super().__init__(n_gram, bands, rows)

    def preprocess_article(self, article: str) -> str:
        """Return a normalised copy of *article*.

        The text is lower-cased, the contractions ``n't`` / ``'ve`` are
        expanded, ``'s`` is dropped, tokens containing ``&amp`` are removed,
        digits and punctuation are stripped, and runs of spaces are collapsed
        to a single space.
        """
        a = article.lower()  # lower case
        a = a.replace("n't", " not").replace("'ve", " have").replace("'s", "")  # rewrite contractions
        a = re.sub(r" [^ ]*&amp[^ ]*", "", a)  # remove space-preceded tokens containing "&amp"
        # Strip digits and punctuation in one pass.
        a = a.translate(str.maketrans('', '', string.digits + string.punctuation))
        # Collapse spaces last: stripping digits/punctuation can leave new
        # runs of spaces behind (e.g. "a - b" -> "a  b").
        a = re.sub(r" +", " ", a)
        return a

    def construct_M(self):
        """Not implemented yet: leaves ``self.M`` untouched."""
        pass

    def get_hash_function(self, seed: int):
        """Return ``self.signature_length`` hash functions derived from *seed*.

        Args:
            seed: Seed for the pseudo-random generator that produces one
                64-bit value per hash function.

        Returns:
            A list of one-argument callables; the i-th one computes
            ``crc64(seed_i + shingle)``.
        """
        # Local import: the notebook's import cell does not import random.
        import random

        # A private generator yields the same values as random.seed(seed)
        # followed by random.getrandbits(64), without reseeding the
        # module-level generator as a side effect.
        rng = random.Random(seed)
        seeds = [rng.getrandbits(64) for _ in range(self.signature_length)]
        # Bind each seed through a default argument. A plain closure over the
        # comprehension variable is looked up at call time, so every function
        # would use the LAST seed.
        # NOTE(review): crc64 is not defined in the visible source, and
        # `s + shingle` adds an int to the shingle -- confirm where crc64
        # comes from and what type shingles have.
        return [lambda shingle, s=s: crc64(s + shingle) for s in seeds]


Read the data of the small news article set

In [None]:
# Load the small news-article set from the mounted Drive; the first CSV column becomes the index.
df = pd.read_csv('/content/drive/MyDrive/IR-Assignment-3/data/news_articles_small.csv', index_col=0)
# Preview the first rows as a sanity check.
print(df.head())

Each article in the small article dataset is converted into the set of its terms: the text is lowercased and split on whitespace, and duplicate terms are removed by storing them in a set.

In [None]:
# One set of lower-cased, whitespace-separated tokens per article, in row order.
articleList = [set(text.lower().split()) for text in df['article']]

print(articleList[0])

Calculate the Jaccard index between every pair of documents in the data set by dividing the size of the intersection by the size of the union of the two term sets. Save the values to a list for later use.

In [None]:
# Jaccard index |A ∩ B| / |A ∪ B| for every unordered pair of documents.
# Each document is paired only with the documents after it, so pairs come
# out in the order (0,1), (0,2), ..., (1,2), ...
jaccardVals = [
    len(first & second) / len(first | second)
    for position, first in enumerate(articleList)
    for second in articleList[position + 1:]
]

Plot the number of values per bin, using a total of 50 bins.


In [None]:
# Convert to a NumPy array so the values can be selected with a boolean mask afterwards.
jaccardVals = np.array(jaccardVals)
# Histogram of all pairwise Jaccard values, 50 bins.
sns.histplot(jaccardVals, bins=50)


The previous graph showed a peak in a small range of the possible similarities. To see the distribution in other ranges, we leave the peak values out.

From this it is clear that there are also values in the higher ranges, however there are not a lot.

In [None]:
# Keep only the similarities above 0.2 (boolean-mask selection) and plot them in 40 bins.
sns.histplot(jaccardVals[jaccardVals>0.2], bins=40)

# 2. Preprocessing of data, shingling, and minhashing to generate a signature matrix using the news_articles_small.csv dataset.

import libraries

get content

In [None]:
# Raw article texts in row order. No normalisation (lower-casing, contraction
# rewriting, digit/punctuation stripping) is applied here; the text is used
# exactly as it appears in the CSV.
articleList = list(df['article'])

print(articleList[0])

In [None]:
# Parameters for the MinHash / LSH cells below.
N_GRAM = 3      # n-gram size (used with n_gram_type='term')
M_LENGTH = 40   # permutations/hash functions
BANDS = 10      # number of LSH bands
# Integer division: the signature length is split evenly over the bands.
print("Rows/band =", M_LENGTH // BANDS)

In [None]:
# Create MinHash object.
# A fixed, non-zero seed makes the signatures reproducible between runs.
# snapy's documentation also lists a seed as necessary when an LSH model is
# later updated with signatures from a separately built MinHash object,
# because the hash functions are otherwise drawn at random per object.
MINHASH_SEED = 42
minhash = MinHash(articleList, n_gram=N_GRAM, n_gram_type='term', permutations=M_LENGTH, seed=MINHASH_SEED)

In [None]:
# Create LSH model.
# Documents are labelled 0 .. len(articleList)-1, i.e. by their position in articleList;
# the signatures are split into BANDS bands.
lsh = LSH(minhash, range(len(articleList)), no_of_bands=BANDS)

In [None]:
# Candidate pairs with a Jaccard threshold of 0.7; with jaccard_weighted=True
# every entry is a (doc1, doc2, jaccard) triple.
results = lsh.edge_list(min_jaccard=0.7, jaccard_weighted=True)

print(len(results), "near duplicates found")
print("DOC1", "DOC2", "JACCARD")
for doc1_id, doc2_id, jaccardVal in results:
    # Two-space separator: same output as interleaving empty strings.
    print(doc1_id, doc2_id, jaccardVal, sep="  ")

In [None]:
# test doc contains 3 sentences from docs 0, 1 and 2
plagiarism_doc="Jorge Sosa won for the sixth time as the New York Mets snapped a four-game losing streak with a 3-0 victory over Detroit on Friday night. Sinn Fein, the Irish Republican Army's political wing, has no place in Northern Ireland politics, US Senator Ted Kennedy said Tuesday, explaining his refusal to meet this week with Gerry Adams, the group's leader. As awful as the news of priests forcing sex on altar boys is, to many of the faithful who sit in a pew each Sunday, the reaction of Roman Catholic Church leaders is even more shocking."
# Signatures from two MinHash objects are only comparable when both derive
# their hash functions from the same seed (per snapy's documentation), so
# reuse whatever seed the indexed MinHash was built with. If it was built
# without one, this passes None -- identical to omitting the argument -- and
# the new signature cannot be expected to line up with the indexed ones.
new_minhash = MinHash(
    [plagiarism_doc],
    n_gram=N_GRAM,
    n_gram_type='term',
    permutations=M_LENGTH,
    seed=getattr(minhash, "seed", None),
)
# Add the new document to the existing model under the label "plagiarized_doc".
lsh.update(new_minhash, ["plagiarized_doc"])


In [None]:
# Query again with a lower Jaccard threshold (0.4); entries are
# (doc1, doc2, jaccard) triples because jaccard_weighted=True.
results = lsh.edge_list(min_jaccard=0.4, jaccard_weighted=True)

print(len(results), "near duplicates found")
print("DOC1", "DOC2", "JACCARD")
for doc1_id, doc2_id, jaccardVal in results:
    # Two-space separator: same output as interleaving empty strings.
    print(doc1_id, doc2_id, jaccardVal, sep="  ")

# Show what the model reports as its contents.
print(lsh.contains())