<a href="https://colab.research.google.com/github/DJongstra/Information_Retrieval_Assignment_3/blob/main/IR_PlagiarismDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup
- Import all needed libraries
- Google Drive mount


In [None]:
from google.cloud import storage
from google.colab import drive
drive.mount('/content/drive')

!pip install mmh3
!pip install snapy
!pip install xxhash
!pip install Random-Word-Generator

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import string, re, random, xxhash, time
from snapy import MinHash, LSH


# 2. Similarity Analysis: Ground Truth
Preprocessing of a document


In [None]:
# "We don't need to use a library, great!" -> ["we", "do", "not", "need", "to", "use", "a", "library", "great"]
def preprocess_document(document: str):
    """Normalise a raw document and return its terms as a list of strings.

    Steps: lower-case, expand/strip a few contractions, drop tokens that
    contain a stray "&amp", delete digits and punctuation, then split on
    whitespace.
    """
    text = document.lower()

    # rewrite contractions ("don't" -> "do not", "we've" -> "we have", "rose's" -> "rose")
    for contraction, expansion in (("n't", " not"), ("'ve", " have"), ("'s", "")):
        text = text.replace(contraction, expansion)

    # remove any space-delimited token containing a leftover "&amp" entity
    text = re.sub(" [^ ]*&amp[^ ]*", "", text)

    # strip digits and ALL punctuation in one pass
    unwanted = str.maketrans('', '', string.digits + string.punctuation)
    text = text.translate(unwanted)

    # collapse runs of spaces; split() below ignores repeated whitespace anyway
    text = re.sub(" +", " ", text)
    return text.split()

Load the small article dataset

In [None]:
# Load the small news-article dataset from the mounted Drive; the first CSV column becomes the index.
df = pd.read_csv('/content/drive/MyDrive/IR-Assignment-3/data/news_articles_small.csv', index_col=0)
print(df)

In [None]:
# Display the raw text of the first article
df['article'].iloc[0]

All the articles in the small article dataset will be processed to a list of the terms in the articles. The words are lowercased and duplicates are removed by using a set (because order does not matter in this part of the analysis).

In [None]:
# One set of unique, preprocessed terms per article (word order is irrelevant here)
articleList = [set(preprocess_document(article)) for article in df['article']]

print(articleList[0])

Calculate the Jaccard index for every pair of documents in the data set by dividing the size of the intersection by the size of the union of the two term sets. Save the values in a list for later use.

In [None]:
# Jaccard index (|intersection| / |union|) for every unordered pair of articles,
# visited in (i, j) order with i < j
jaccardVals = []

no_articles = len(articleList)
for first in range(no_articles):
  for second in range(first + 1, no_articles):
    shared = articleList[first] & articleList[second]
    combined = articleList[first] | articleList[second]
    jaccardVals.append(len(shared) / len(combined))

Plot the number of values per bin, using a total of 50 bins.


In [None]:
# Convert to a NumPy array (needed for the boolean-mask filtering in the next cell)
# and plot the distribution of all pairwise Jaccard values in 50 bins.
jaccardVals = np.array(jaccardVals)
sns.histplot(jaccardVals, bins=50)


The previous graph showed a peak in a small range of the possible similarities. To see the distribution in other ranges, we leave the peak values out.

From this it is clear that there are also values in the higher ranges, although there are only a few of them.

In [None]:
# Zoom in on the tail: only pairs with a Jaccard index above 0.2, in 40 bins
sns.histplot(jaccardVals[jaccardVals>0.2], bins=40)

# 3. LSH Implementation


## 3.1 Hash functions
The class RankHash uses the **xxhash** library to hash previously hashed shingles (64 bit) from sketches. We use a fixed size deterministic salt to create different hash functions h_0 to h_|M|. Even before reading our documents, we will generate a list of these hash functions based on a **seed**.

In [None]:
# convert integer to 8 bytes
def to_bytes(i: int):
    """Encode the unsigned integer *i* as exactly 8 big-endian bytes.

    Raises OverflowError when *i* is negative or does not fit in 8 bytes.
    """
    return i.to_bytes(length=8, byteorder='big', signed=False)

# convert a hash digest to an unsigned integer, optionally using only its first no_bytes bytes
def to_int(digest: bytes, no_bytes=None):
    """Interpret *digest* as an unsigned big-endian integer.

    When *no_bytes* is truthy only the first *no_bytes* bytes are used;
    None (or 0) means the whole digest.
    """
    if no_bytes:
        digest = digest[:no_bytes]
    return int.from_bytes(digest, byteorder='big', signed=False)

# ["rose", "is", "a"] -> 64-bit integer hash of the shingle (truncated to no_bytes bytes)
def hash_shingle(shingle: list, no_bytes=8):
    """Hash a shingle (a sequence of words) to an unsigned integer.

    The words are joined with a single space before hashing so that word
    boundaries are part of the hashed input. Feeding the words back to back
    would make e.g. ["ab", "c"] and ["a", "bc"] collide.

    shingle:  sequence of word strings
    no_bytes: number of leading digest bytes used for the returned integer
    """
    xxh = xxhash.xxh64()
    xxh.update(" ".join(shingle).encode("utf-8"))
    return to_int(xxh.digest(), no_bytes)

# Hash function object that can be prepared
# If two objects share the same salt, they will also generate the same output for a given input
class RankHash:
    """A salted xxh64 hash used to rank already-hashed shingles."""

    def __init__(self, salt: int):
        # the salt is stored in its 8-byte form so it can be fed straight to the hasher
        self.salt = to_bytes(salt)

    # (hashed shingle) 189939623769124324 -> (rank) 134237347983861913
    def rank(self, hashed_shingle: int):
        """Return the rank of *hashed_shingle*: the xxh64 digest of salt + shingle bytes as an integer."""
        hasher = xxhash.xxh64()
        hasher.update(self.salt + to_bytes(hashed_shingle))
        return to_int(hasher.digest())


## 3.2 LSH functionality

After all documents are added, the set is static and no documents can be added afterwards. However, we can compare new content to our existing set using query_content(content, s).
We could implement insertion, but we won't need it for this assignment.


In [None]:
# Basic functionality that the library as well as our own implementation must handle
class LSHFunctionality:
    """Common interface and document storage shared by the LSH variants.

    Subclasses must override compute(), get_all_similarities() and
    query_content(); the base versions raise NotImplementedError.
    """

    def __init__(self, n_gram, bands, rows, seed):
        self.n_gram = n_gram  # number of terms per shingle
        self.bands = bands
        self.rows = rows  # rows per band
        self.seed = seed
        self.signature_length = bands * rows
        self.original_documents = []

    # read directly from csv file
    def read_csv(self, csv_file: str):
        """Append every value of the 'article' column of *csv_file* to the documents."""
        articles = pd.read_csv(csv_file, index_col=0)['article']
        self.original_documents.extend(articles)

    # add documents as a list of strings
    def add_documents(self, documents: list):
        """Replace the stored documents with a copy of *documents*.

        A copy is stored so that later changes to the internal list (e.g. a
        subclass appending a queried document) never alter the caller's list.
        """
        self.original_documents = list(documents)

    # after adding documents, call compute to start the LSH
    def compute(self):
        raise NotImplementedError("virtual")

    # return all similarities >= s
    def get_all_similarities(self, s: float):
        raise NotImplementedError("virtual")

    # compare all docs to 'content' and return all where >= s
    def query_content(self, content: str, s: float):
        raise NotImplementedError("virtual")


## 3.3 Using a Library
We first start with implementing this functionality with the library

https://pypi.org/project/snapy/

In [None]:
class LSHLibrary(LSHFunctionality):
    """LSHFunctionality implemented on top of snapy's MinHash and LSH classes."""

    def __init__(self, n_gram, bands, rows, seed):
        super().__init__(n_gram, bands, rows, seed)
        self.lsh = None  # created by compute()

    def _minhash(self, texts):
        """Build a snapy MinHash over *texts* with this object's shingle and signature settings."""
        return MinHash(
            texts,
            n_gram=self.n_gram,
            n_gram_type='term',
            permutations=self.signature_length,
            seed=self.seed
        )

    def compute(self):
        """Build the LSH model; documents are labelled by their position in the list."""
        labels = range(len(self.original_documents))
        self.lsh = LSH(self._minhash(self.original_documents), labels, no_of_bands=self.bands)

    def get_all_similarities(self, s: float):
        """Return the library's edge list for min_jaccard=s, weighted by Jaccard value."""
        return self.lsh.edge_list(min_jaccard=s, jaccard_weighted=True)

    # to query some content, we first have to add it to our set, minhash it and then query its id..
    def query_content(self, content: str, s: float):
        """Add *content* to the model under a fresh id and query that id with min_jaccard=s.

        Note that the content stays in the model and in original_documents afterwards.
        """
        doc_id = len(self.original_documents)
        self.original_documents.append(content)

        # add to set (M)
        self.lsh.update(self._minhash([content]), [doc_id])

        # query matching documents
        return self.lsh.query(doc_id, min_jaccard=s)



## 3.4 Our own implementation of LSH
Our own implementation requires some additional methods to get all the functionality. 

In [None]:
class LSHImplementation(LSHFunctionality):
    """MinHash + banding LSH built on this file's hashing helpers.

    After compute():
      M            - one signature (list of signature_length shingle hashes) per document
      buckets      - maps a band tuple to the set of document ids sharing that band
      similarities - maps a candidate pair (doc1, doc2), doc1 < doc2, to the
                     fraction of matching signature positions
    """

    def __init__(self, n_gram, bands, rows, seed=123):
        super().__init__(n_gram, bands, rows, seed)
        self.M = []  # a signature matrix for each document
        self.buckets = {}  # a dictionary with buckets
        self.similarities = {}  # document pairs are keys and the values are band hits
        # prepare signature hash functions based on seed
        # NOTE: this reseeds the module-level `random` generator (global side effect)
        random.seed(seed)
        self.prepared_hash_functions = [RankHash(salt=random.getrandbits(64)) for _ in range(self.signature_length)]

    # Construct M, create buckets and compute similarities
    def compute(self):
        # Create a signature for each document
        self.M = [self.doc_to_signature(original_doc) for original_doc in self.original_documents]
        self.buckets = self.construct_buckets()
        self.similarities = self.construct_similarities()

    # Pre process the document, shingle its contents, hash the shingles and create the signature using minhash
    def doc_to_signature(self, original_doc):
        """Return the minhash signature of *original_doc*.

        Each signature position holds the shingle hash with the smallest rank
        under the corresponding prepared hash function.
        """
        # Returns ["rose", "is", "a", "rose", "is", "a", "rose"]
        terms = preprocess_document(original_doc)
        # hash all the shingles, e.g. ["rose", "is", "a"] -> integer hash
        hashed_shingles = {
            hash_shingle(terms[i:i + self.n_gram])
            for i in range(len(terms) - self.n_gram + 1)
        }
        if not hashed_shingles:
            # A document with fewer than n_gram terms yields no shingle, and min()
            # over an empty set raises ValueError. Treat all of its terms (possibly
            # none) as one single shingle instead.
            hashed_shingles.add(hash_shingle(terms))
        # compute the minhash for every prepared ranking function (=signature length)
        return [min(hashed_shingles, key=h_r.rank) for h_r in self.prepared_hash_functions]  # <- sketch!

    def _band_keys(self, signature):
        """Split *signature* into `bands` consecutive tuples of `rows` values (the bucket keys)."""
        return [
            tuple(signature[band * self.rows:(band + 1) * self.rows])
            for band in range(self.bands)
        ]

    # Construct a buckets(dictionary)
    # If doc1 has values (1,2,3) for a band, and doc2 also has values (1,2,3) for a band,
    # then they will end up in the same bucket.
    def construct_buckets(self):
        # NOTE(review): the band index is not part of the key, so equal tuples from
        # different bands share a bucket. This can only add candidates; confirm it is intended.
        buckets = {}
        for doc_id, signature in enumerate(self.M):
            for key in self._band_keys(signature):
                buckets.setdefault(key, set()).add(doc_id)
        return buckets

    # Construct all similarities by keeping track of all hits between documents
    # Result -> {(doc1, doc2):0.5, (doc2, doc7):0.3}
    def construct_similarities(self):
        candidate_pairs = set()  # set of candidate pairs
        for bucket in self.buckets.values():
            # Sort the ids so a pair is always stored as (smaller, larger). Set
            # iteration order is arbitrary, and without sorting the same two
            # documents could be recorded as both (a, b) and (b, a).
            members = sorted(bucket)
            # make all combinations between documents in bucket d(d-1)/2
            # need at least 2 docs in a bucket to create a candidate pair
            for i, doc1 in enumerate(members):
                for doc2 in members[i + 1:]:
                    candidate_pairs.add((doc1, doc2))
        # map set of candidate pairs to dictionary
        return {(doc1, doc2): self.compare_signatures(self.M[doc1], self.M[doc2]) for (doc1, doc2) in candidate_pairs}

    def compare_signatures(self, sig1: list, sig2: list):
        """Return the fraction of signature positions at which *sig1* and *sig2* agree."""
        return sum(1 for s1, s2 in zip(sig1, sig2) if s1 == s2) / self.signature_length

    # Get all document id's where the similarity >= s
    def get_all_similarities(self, s: float):
        return [(doc1, doc2, round(sim, 2)) for ((doc1, doc2), sim) in self.similarities.items() if sim >= s]

    # Create a signature for the new document, and compare its bands with the bands hash table to find similar documents
    def query_content(self, content: str, s: float):
        """Return [(doc_id, similarity)] for stored documents whose estimated similarity to *content* is >= s.

        Only documents sharing at least one band with *content* are compared;
        the content itself is not added to the index.
        """
        signature = self.doc_to_signature(content)
        candidates = set()
        for key in self._band_keys(signature):
            # add every document that shares this band
            candidates.update(self.buckets.get(key, ()))

        # for each candidate, calculate the actual similarity
        result = []
        for doc in candidates:
            sim = self.compare_signatures(signature, self.M[doc])
            if sim >= s:
                result.append((doc, round(sim, 2)))
        return result



## 3.5 Simple Test

In [None]:
# example from https://pypi.org/project/snapy/
# Small hand-made corpus: documents 0 and 3 are identical, 7 is a near copy of them,
# and 2 and 4 are near copies of each other.
documents = [
    'Jupiter is primarily composed of hydrogen and a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
    'A helium atom has about four times as much mass as a hydrogen atom, so the composition changes when described as the proportion of mass contributed by different atoms.',
    'Jupiter is primarily composed of hydrogen and a quarter of its mass being helium',
    'A helium atom has about four times as much mass as a hydrogen atom and the composition changes when described as a proportion of mass contributed by different atoms.',
    'Theoretical models indicate that if Jupiter had much more mass than it does at present, it would shrink.',
    'This process causes Jupiter to shrink by about 2 cm each year.',
    'Jupiter is mostly composed of hydrogen with a quarter of its mass being helium',
    'The Great Red Spot is large enough to accommodate Earth within its boundaries.'
]
# changed 'much' to 'a lot' from document 5
plagiarized_doc = 'Theoretical models indicate that if Jupiter had a lot more mass than it does at present, it would shrink.'

# test both implementations
# Both are built with identical parameters (2-term shingles, 4 bands x 2 rows, seed 999)
# so their output can be compared side by side.
for constructor in [LSHImplementation, LSHLibrary]:
    lsh = constructor(n_gram=2, bands=4, rows=2, seed=999)
    lsh.add_documents(documents)
    lsh.compute()
    sim = 0.4  # similarity threshold used for both queries below
    print(f"\n========== {lsh.__class__.__name__} ==========")
    print(f"All similarities s>={sim}:", lsh.get_all_similarities(sim))
    print(f"Find similar documents to plagiarized doc with s>={sim} (doc 5 expected):",
          lsh.query_content(plagiarized_doc, sim))


## 3.6 Time comparison

In [None]:
print(df.head())

# Time the three phases (reading the CSV, building the index, listing similar pairs)
# for our own implementation and for the library, using identical parameters.
# All durations are printed in minutes, rounded to two decimals.
for constructor in [LSHImplementation, LSHLibrary]:
  lsh = constructor(n_gram=5, bands=5, rows=3, seed=17)

  print(f"\n========== {lsh.__class__.__name__} ==========")
  # Phase 1: load the articles
  print("Read CSV.. ", end='')
  time_start = time.time()
  lsh.read_csv('/content/drive/MyDrive/IR-Assignment-3/data/news_articles_small.csv')
  print(f"({round((time.time()-time_start)/60, 2)} minutes)")

  # Phase 2: run compute() on the loaded documents
  print("Construct M.. ", end='')
  time_start = time.time()
  lsh.compute()
  print(f"({round((time.time() - time_start) / 60, 2)} minutes)")

  # Phase 3: list every pair with similarity >= s
  s = 0.6
  print(f"Find all similar documents with s >= {s}")
  time_start = time.time()
  sim = lsh.get_all_similarities(s=s)
  print(f"{len(sim)} similarities found ({round((time.time() - time_start) / 60, 2)} minutes): ", sim)


# 4. Evaluation

### Prepare Some Plagiarised Documents

In [None]:
from RandomWordGenerator import RandomWord
import numpy as np

In [None]:
# Replace first x words by random words
def create_plagiarised_doc_range(document, x):
  """Return a copy of *document* whose first *x* space-separated words are random words.

  If *x* exceeds the number of words, every word is replaced.
  """
  # use the document that was passed in, not the notebook-global doc_1
  words = document.split(" ")
  rw = RandomWord(max_word_size = 5,
                constant_word_size=True,
                include_digits=False,
                special_chars=r"@_!#$%^&*()<>?/\|}{~:",
                include_special_chars=False)
  # clamp to the document length so short documents do not raise IndexError
  for position in range(min(x, len(words))):
    words[position] = rw.generate()
  return " ".join(words)

# Replace every xth word by a random word
def create_plagiarised_doc_step(document, x):
  """Return a copy of *document* in which the words at positions 0, x, 2x, ... are random words.

  *x* must not be 0 (range() raises ValueError for a zero step).
  """
  # use the document that was passed in, not the notebook-global doc_1
  words = document.split(" ")
  rw = RandomWord(max_word_size = 5,
                constant_word_size=True,
                include_digits=False,
                special_chars=r"@_!#$%^&*()<>?/\|}{~:",
                include_special_chars=False)
  for position in range(0, len(words), x):
    words[position] = rw.generate()
  return " ".join(words)

# Replace words at x positions drawn from a uniform distribution
def create_plagiarised_doc_uniform(document, x):
  """Return a copy of *document* with random words written at *x* uniformly drawn positions.

  Positions are drawn with replacement, so the same position may be hit more
  than once and fewer than *x* distinct words may end up replaced.
  """
  # use the document that was passed in, not the notebook-global doc_1
  words = document.split(" ")
  rw = RandomWord(max_word_size = 5,
                constant_word_size=True,
                include_digits=False,
                special_chars=r"@_!#$%^&*()<>?/\|}{~:",
                include_special_chars=False)
  for _ in range(0, x):
    # uniform draw from [0, len(words)), truncated to a valid index
    rand_index = int(np.random.uniform(0, len(words)))
    words[rand_index] = rw.generate()
  return " ".join(words)


def create_plagiarised_docs(document, duplicates):
  """Create 3 * *duplicates* altered copies of *document*.

  For i = 1..duplicates the result contains:
    plagiarised_doc_step_i    - every (i+1)-th word replaced
    plagiarised_doc_range_i   - the first i*10 words replaced
    plagiarised_doc_uniform_i - i*10 randomly drawn positions replaced

  Returns a dict mapping these names to the altered texts.
  """
  duplicates_dict = {}
  # pass the document argument on (not the notebook-global doc_1)
  for i in range(1, duplicates+1):
    duplicates_dict[f"plagiarised_doc_step_{i}"] = create_plagiarised_doc_step(document, i+1)
  for i in range(1, duplicates+1):
    duplicates_dict[f"plagiarised_doc_range_{i}"] = create_plagiarised_doc_range(document, i*10)
  for i in range(1, duplicates+1):
    duplicates_dict[f"plagiarised_doc_uniform_{i}"] = create_plagiarised_doc_uniform(document, i*10)
  return duplicates_dict

In [None]:
# The first article of the dataset serves as the original document for the evaluation
doc_1 = df['article'].iloc[0]
doc_1

In [None]:
# 20 altered copies per strategy (step, range, uniform) -> 60 plagiarised documents
duplicates_dict = create_plagiarised_docs(doc_1, 20)
duplicates_dict

### Calculate Jaccard Similarity between Plagiarised Documents and Original Document

In [None]:
# Preprocess doc_1 into its set of unique terms (ground truth side of the Jaccard comparison)
doc_1_set= set(preprocess_document(doc_1))

In [None]:
# Calculate Jaccard Similarity between Doc_1 and its duplicates
# (term-set Jaccard: |intersection| / |union|), keyed by the duplicate's name
jaccardVals = {}
for name, text in duplicates_dict.items():
  candidate_terms = set(preprocess_document(text))
  jaccardVals[name] = len(doc_1_set & candidate_terms) / len(doc_1_set | candidate_terms)

In [None]:
# Jaccard similarity between duplicates and document 1 (dict: duplicate name -> value)
jaccardVals

In [None]:
# Histogram (10 bins) of the Jaccard similarities between doc_1 and its plagiarised copies
jaccardVals_arr = np.array(list(jaccardVals.values()))
ax = sns.histplot(jaccardVals_arr, bins=10)
ax.set(xlabel='Jaccard Similarity', ylabel='Count')

### Evaluation

In [None]:
# Grid search over LSH parameters: bands in {50, 100, 150} and rows in {1, 2, 3, 4}.
# For every configuration only doc_1 is indexed (document id 0) and each plagiarised
# copy is queried against it with a low threshold (s=0.1).
# match_dict maps "b_<bands>_r_<rows>" -> {duplicate name: [(doc_id, estimated similarity)]};
# duplicates without any match are left out.
match_dict = {}
for b in range(50, 200, 50):
  for r in range(1,5):
    result_dict = {}
    lsh = LSHImplementation(n_gram=2, bands=b, rows=r, seed=17)
    lsh.add_documents([doc_1])
    lsh.compute() 
    for key in duplicates_dict:
      matches = lsh.query_content(duplicates_dict[key], s=0.1)
      if matches:
        result_dict[key] = matches
    
    match_dict[f"b_{b}_r_{r}"] = result_dict  
  # NOTE(review): this print sits inside the outer loop, so the growing dict is
  # printed once per band value - confirm whether a single print at the end was intended.
  print(match_dict)

In [None]:
# Precision of every band/row configuration for the thresholds s = 0.7, 0.8, 0.9.
# A duplicate is a predicted positive when its estimated similarity exceeds s:
#   true positive  - its ground-truth Jaccard value also exceeds s
#   false positive - otherwise (this includes a Jaccard value exactly equal to s,
#                    which was previously counted as neither)
# precision_dict maps "s_<s>" -> {configuration: precision}.
precision_dict = {}
for tenths in range(7, 10):
  s = tenths / 10
  result_dict = {}
  for test_key, matches_per_duplicate in match_dict.items():
    tp_counter = 0
    fp_counter = 0
    for result_key, matches in matches_per_duplicate.items():
      # first (doc_id, estimate) tuple returned by query_content
      _, similarity_estimate = matches[0]
      if similarity_estimate > s:
        if jaccardVals[result_key] > s:
          tp_counter += 1
        else:
          fp_counter += 1
    predicted_positives = tp_counter + fp_counter
    # Precision is undefined only when nothing was predicted positive; a configuration
    # with false positives but no true positives gets precision 0.0 instead of being dropped.
    if predicted_positives:
      result_dict[test_key] = tp_counter / predicted_positives
  precision_dict[f"s_{s}"] = result_dict
print(precision_dict)

In [None]:
# One bar chart per similarity threshold, configurations ordered by ascending precision
for s in precision_dict:
  ranked = sorted(precision_dict[s].items(), key=lambda item: item[1])
  names = [config for config, _ in ranked]
  values = [precision for _, precision in ranked]
  fig = plt.figure(figsize=(35,4))
  plt.title(f"Precision of Different Bands and Rows for Similarity Threshold: {s}")
  plt.bar(names, values)
  plt.xlabel('Bands and Rows')
  plt.ylabel('Precision')