<a href="https://colab.research.google.com/github/DJongstra/Information_Retrieval_Assignment_3/blob/main/IR_PlagiarismDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup
- Import all needed libraries
- Google Drive mount


In [None]:
!pip install mmh3
!pip install snapy
!pip install xxhash
import numpy as np
import seaborn as sns
import pandas as pd
import string, re, random, xxhash
from snapy import MinHash, LSH
from google.cloud import storage
from google.colab import drive
drive.mount('/content/drive')

## LSH functionality (super class)




In [None]:
class LSHFunctionality:
    """Common configuration and document store for the LSH variants.

    Subclasses are expected to override ``compute``, ``get_all_similarities``
    and ``query_content``; the versions defined here only raise
    ``NotImplementedError``.
    """

    def __init__(self, n_gram, bands, rows, seed):
        """Store the LSH parameters and start with an empty document list.

        Args:
            n_gram: shingle size handed to the concrete implementation.
            bands: number of bands the signature is split into.
            rows: number of signature entries per band.
            seed: seed used by the concrete implementation for its hash functions.
        """
        self.n_gram = n_gram
        self.bands = bands
        self.rows = rows
        self.seed = seed
        # A signature is made of `bands` consecutive groups of `rows` entries.
        self.signature_length = bands * rows
        self.original_documents = []

    # read directly from csv file
    def read_csv(self, csv_file):
        """Append every value of the ``article`` column to the document list.

        Args:
            csv_file: a pandas DataFrame that has an ``article`` column.

        Raises:
            KeyError: if the frame has no ``article`` column.
        """
        self.original_documents.extend(csv_file['article'].tolist())

    # add documents as a list of strings
    def add_documents(self, documents: list):
        """Replace the stored documents with a copy of ``documents``.

        A copy is stored because the document list may be extended later
        (for example when querying new content); the caller's own list must
        not grow as a side effect.
        """
        self.original_documents = list(documents)

    # after adding documents, call compute to start the LSH
    def compute(self):
        """Build the LSH structures. Must be overridden."""
        raise NotImplementedError("virtual")

    # return all similarities >= s
    def get_all_similarities(self, s: float):
        """Return all document pairs with similarity >= ``s``. Must be overridden."""
        raise NotImplementedError("virtual")

    # compare all docs to 'content' and return all where >= s
    def query_content(self, content: str, s: float):
        """Return stored documents with similarity >= ``s`` to ``content``. Must be overridden."""
        raise NotImplementedError("virtual")



## LSH using a library

In [None]:
# Library-backed variant: MinHash and LSH are delegated to snapy
# https://pypi.org/project/snapy/
class LSHLibrary(LSHFunctionality):
    """LSH variant that wraps snapy's ``MinHash`` and ``LSH`` classes."""

    def __init__(self, n_gram, bands, rows, seed=123):
        super().__init__(n_gram, bands, rows, seed)
        self.lsh = None  # set by compute()

    def _minhash(self, texts):
        """Build a snapy MinHash over ``texts`` with this object's parameters."""
        return MinHash(
            texts,
            n_gram=self.n_gram,
            n_gram_type='term',
            permutations=self.signature_length,
            seed=self.seed
        )

    def compute(self):
        """Build the snapy LSH model; each document is labelled with its list index."""
        signatures = self._minhash(self.original_documents)
        labels = range(len(self.original_documents))
        self.lsh = LSH(signatures, labels, no_of_bands=self.bands)

    def get_all_similarities(self, s: float):
        """Return snapy's Jaccard-weighted edge list for pairs with Jaccard >= ``s``."""
        return self.lsh.edge_list(min_jaccard=s, jaccard_weighted=True)

    # The library can only query labels it already knows, so the content is
    # first added to the model under a fresh label and then queried by that label.
    def query_content(self, content: str, s: float):
        """Add ``content`` to the model and return what snapy reports as similar to it.

        Side effect: ``content`` is appended to ``original_documents`` and stays
        in the LSH model after the call.
        """
        new_label = len(self.original_documents)
        self.original_documents.append(content)

        # add to set (M)
        self.lsh.update(self._minhash([content]), [new_label])

        # query matching documents
        return self.lsh.query(new_label, min_jaccard=s)


## Our own LSH implementation


### Hash function
The class HashFunction uses the **xxhash** library to hash the following data:
*   Shingles (lists of strings) to 64-bit integers.
*   The previously hashed shingles (64-bit integers) to ranks, which are used to build the sketches. We use a fixed-size key to create the different hash functions h_0 to h_|M|. Even before reading our documents, we generate the list of these hash functions based on a **seed**.




In [None]:
# Hash function object that can be prepared
# If no key is provided this function will be a normal xxhash
# If two hash functions share the same key, they will also generate the same output for a given input
class HashFunction:
    """64-bit xxhash wrapper with an optional integer key.

    ``compute_strings`` hashes a shingle (list of terms) and does not use the
    key. ``compute_int64`` re-hashes an already hashed shingle with the key
    mixed in, so differently keyed instances act as different hash functions.
    """

    def __init__(self, key: int = None):
        """Store ``key`` as 8 big-endian bytes, or ``b''`` when no key is given.

        Raises:
            OverflowError: if ``key`` is negative or does not fit in 64 bits.
        """
        # Compare against None rather than relying on truthiness: 0 is a valid
        # key and must not silently turn into the unkeyed function.
        self.key = self.int_to_bytes(key) if key is not None else b''  # store key

    # ["rose", "is", "a"] -> 64 bit integer
    def compute_strings(self, shingle: []):
        """Hash a shingle (sequence of term strings) to a 64-bit integer."""
        h = xxhash.xxh64()
        # Join with a separator so the term boundaries are part of the input;
        # feeding the terms back to back would make ["ab", "c"] and
        # ["a", "bc"] hash to the same value.
        h.update(" ".join(shingle))
        return self.to_64_bit(h.digest())

    # (hashed shingle) 189939623769124324 ->  (rank) 134237347983861913
    def compute_int64(self, shingle: int):
        """Hash a 64-bit integer, prefixed by this function's key bytes, to a 64-bit integer."""
        # The key bytes are passed as the initial input of the hasher, so they
        # act as a prefix to the shingle bytes.
        h = xxhash.xxh64(self.key)
        h.update(self.int_to_bytes(shingle))
        return self.to_64_bit(h.digest())

    # convert 64 bit integer to 8 bytes
    def int_to_bytes(self, i: int):
        """Return ``i`` as 8 unsigned big-endian bytes.

        Raises:
            OverflowError: if ``i`` is negative or needs more than 8 bytes.
        """
        return i.to_bytes(8, byteorder='big', signed=False)

    # convert a hash digest to a 64 bit integer (8 bytes)
    def to_64_bit(self, digest: bytes):
        """Interpret the first 8 bytes of ``digest`` as an unsigned big-endian integer.

        An xxh64 digest is already 8 bytes long; the slice only matters if a
        longer digest is passed in.
        """
        return int.from_bytes(digest[:8], byteorder='big', signed=False)



## LSH class

Once all documents have been added and `compute()` has been called, the set is static: no further documents can be added. Supporting this would be possible, but it is not needed for this assignment.
We can, however, compare new content against the existing set using `query_content(doc, s)`.

In [None]:
class LSHImplementation(LSHFunctionality):
    """Hand-written MinHash + banded LSH over the stored documents.

    ``compute`` turns every document into a MinHash signature of
    ``bands * rows`` entries, files each band of each signature into that
    band's hash table, and records which document pairs share a bucket.
    Pairs that share at least one bucket are the candidates; their similarity
    is then estimated from the signatures themselves.
    """

    # Built once and reused by every preprocess_document call.
    _AMP_TOKEN = re.compile(" [^ ]*&amp[^ ]*")
    _SPACE_RUN = re.compile(" +")
    _STRIP_DIGITS = str.maketrans('', '', string.digits)
    _STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

    def __init__(self, n_gram, bands, rows, seed=123):
        super().__init__(n_gram, bands, rows, seed)
        self.hash_tables = []  # one dictionary for each band
        # One signature per document, indexed by document id. A document that
        # yields no shingles has None here.
        self.M = []
        # Document pairs are keys, and the values are the number of bands in which the two
        # documents share a bucket. A pair that is absent shares no bucket at all.
        self.similarities = {}
        # Prepare signature hash functions based on a seed.
        # NOTE: this reseeds the module-level `random` generator.
        random.seed(seed)
        self.prepared_hash_functions = [
            HashFunction(key=random.getrandbits(64)) for _ in range(self.signature_length)
        ]

    # String "We don't need to use a library, great!" -> "we do not need to use a library great"
    def preprocess_document(self, document: str):
        """Normalise a raw document string before shingling."""
        doc = document.lower()  # lower case
        doc = doc.replace("n't", " not").replace("'ve", " have").replace("'s", "")  # rewrite contractions
        doc = self._AMP_TOKEN.sub("", doc)  # remove tokens containing "&amp"
        doc = doc.translate(self._STRIP_DIGITS)  # remove digits
        doc = self._SPACE_RUN.sub(" ", doc)  # collapse runs of spaces
        return doc.translate(self._STRIP_PUNCTUATION)  # remove ALL punctuation

    # -> "rose is a rose is a rose"
    # -> [["rose", "is", "a"], ["is", "a", "rose"], ["a", "rose", "is"], ["rose", "is", "a"], ["is", "a", "rose"]]
    # -> [44, 24, 17, 44, 24]
    # -> {44, 24, 17} use set to remove duplicates
    def doc_to_hashed_shingles(self, doc):
        """Return the set of hashed term n-grams of ``doc``.

        The set is empty when ``doc`` has fewer than ``n_gram`` terms.
        """
        terms = doc.split()
        hash_f = HashFunction()  # key=None
        no_shingles = len(terms) - self.n_gram + 1  # <= 0 for short documents
        return {hash_f.compute_strings(terms[i:i + self.n_gram]) for i in range(no_shingles)}

    # Pre process the document, shingle its contents, hash the shingles and create the signature using minhash
    def doc_to_signature(self, original_doc):
        """Return the MinHash signature of ``original_doc``.

        Returns:
            A list with one entry per prepared hash function: the hashed
            shingle for which that function gives the smallest value.
            ``None`` if the document has no shingles (fewer than ``n_gram``
            terms after preprocessing), since no minimum exists then.
        """
        hashed_shingles = self.doc_to_hashed_shingles(self.preprocess_document(original_doc))
        if not hashed_shingles:
            return None
        return [min(hashed_shingles, key=hash_f.compute_int64)
                for hash_f in self.prepared_hash_functions]

    def _band_key(self, signature, b):
        """Return the entries of band ``b`` of ``signature`` as a hashable tuple."""
        return tuple(signature[b * self.rows:(b + 1) * self.rows])

    def _estimate_jaccard(self, signature_a, signature_b):
        """Estimate Jaccard similarity as the fraction of equal signature positions."""
        if not signature_a:
            return 0.0
        agreeing = sum(1 for a, b in zip(signature_a, signature_b) if a == b)
        return agreeing / len(signature_a)

    # Construct M, create Hash Tables and get Similarities
    def compute(self):
        """Build ``M``, the per-band hash tables and the band-hit counts."""
        print("Construct M...")
        self.M = self.construct_M()
        print("Construct hash tables...")
        self.hash_tables = self.construct_hash_tables()
        print("Construct similarities...")
        self.similarities = self.construct_similarities()

    def construct_M(self):
        """Return the signature (or None) of every stored document, in document order."""
        return [self.doc_to_signature(original_doc) for original_doc in self.original_documents]

    # Construct a hash table (dictionary) for each band, the row values in the signature is a key in the table
    # If doc1 has values (1,2,3) for band 2, and doc2 also has values (1,2,3) for band 2,
    # then they will end up in the same bucket.
    def construct_hash_tables(self):
        """Return one ``{band key: [doc ids]}`` dictionary per band.

        Documents without a signature are left out of every table, so they
        never become a candidate for anything.
        """
        # LoadingBar is not defined in this part of the notebook; presumably a progress indicator.
        loading = LoadingBar(loops=self.bands*len(self.M))  # ignore

        bands_hash_tables = []
        for b in range(self.bands):
            hash_table = {}
            for doc_id, signature in enumerate(self.M):
                if signature is not None:
                    hash_table.setdefault(self._band_key(signature, b), []).append(doc_id)
                loading.next()  # ignore
            bands_hash_tables.append(hash_table)
        return bands_hash_tables

    # Construct all candidate pairs by keeping track of all bucket hits between documents
    # Result -> {(doc1, doc2):5, (doc2, doc7):3}: doc1 and doc2 share a bucket in 5 bands
    def construct_similarities(self):
        """Count, for every document pair, in how many bands both land in the same bucket."""
        loading = LoadingBar(loops=self.bands)  # ignore

        similarities = {}
        for b in range(self.bands):
            for bucket in self.hash_tables[b].values():
                # Doc ids are appended in increasing order, so every pair is (smaller id, larger id).
                for i, first in enumerate(bucket):
                    for second in bucket[i + 1:]:
                        pair = (first, second)
                        similarities[pair] = similarities.get(pair, 0) + 1
            loading.next()  # ignore
        return similarities

    # Get all document id's where the jaccard >= s
    def get_all_similarities(self, s: float):
        """Return ``(doc1, doc2, jaccard)`` for every candidate pair with estimated Jaccard >= ``s``.

        Candidates are the pairs that share a bucket in at least one band. The
        similarity is estimated from the two signatures directly: a band only
        matches when all of its ``rows`` entries agree, so the fraction of
        matching bands tends towards ``jaccard ** rows`` and would
        underestimate the similarity whenever ``rows > 1``.
        """
        scored = ((doc1, doc2, self._estimate_jaccard(self.M[doc1], self.M[doc2]))
                  for (doc1, doc2) in self.similarities)
        return [triple for triple in scored if triple[2] >= s]

    # Create a signature for the new document, and compare its bands with the bands hash table to find similar documents
    def query_content(self, content: str, s: float):
        """Return ``(doc_id, jaccard)`` for stored documents with estimated Jaccard >= ``s`` to ``content``.

        ``content`` is not added to the stored documents. An empty list is
        returned when ``content`` has no shingles.
        """
        signature = self.doc_to_signature(content)
        if signature is None:
            return []

        # Every stored document sharing a bucket with the query in some band,
        # kept in first-seen order (dict keys) without duplicates.
        candidates = {}
        for b in range(self.bands):
            for doc_id in self.hash_tables[b].get(self._band_key(signature, b), ()):
                candidates.setdefault(doc_id, None)

        scored = ((doc_id, self._estimate_jaccard(signature, self.M[doc_id]))
                  for doc_id in candidates)
        return [pair for pair in scored if pair[1] >= s]


Read the data of the small news article set

In [None]:
# Load the small news-article dataset from the mounted Drive; the first CSV
# column becomes the dataframe index.
NEWS_ARTICLES_SMALL_CSV = '/content/drive/MyDrive/IR-Assignment-3/data/news_articles_small.csv'
df = pd.read_csv(NEWS_ARTICLES_SMALL_CSV, index_col=0)
print(df.head())

All the articles in the small article dataset will be processed to a list of the terms in the articles. The words are lowercased and duplicates are removed by using a set.

In [None]:
# One set of terms per article: lowercase the text, split on whitespace and
# keep every distinct term once.
articleList = [set(text.lower().split()) for text in df['article']]

print(articleList[0])

Calculate the Jaccard index for every pair of documents in the data set by dividing the size of the intersection of their term sets by the size of their union. Save the values in a list for later use.

In [None]:
# Exact Jaccard index of every unordered pair of articles:
# size of the intersection divided by size of the union of their term sets.
jaccardVals = []

total_docs = len(articleList)
for first_idx in range(total_docs):
    first_terms = articleList[first_idx]
    # Only pair with later documents so each pair is visited exactly once.
    for second_idx in range(first_idx + 1, total_docs):
        second_terms = articleList[second_idx]
        jaccardVals.append(len(first_terms & second_terms) / len(first_terms | second_terms))

Plot the number of values per bin, using a total of 50 bins.


In [None]:
# Convert the list to a NumPy array so it can be filtered with a boolean mask later on.
jaccardVals = np.array(jaccardVals)
# Histogram of all pairwise Jaccard values, 50 bins.
sns.histplot(jaccardVals, bins=50)


The previous graph showed a peak in a small range of the possible similarities. To see the distribution in other ranges, we leave the peak values out.

This shows that values in the higher ranges also occur, although there are only a few of them.

In [None]:
# Histogram of only the pairs whose Jaccard value is above 0.2, 40 bins.
sns.histplot(jaccardVals[jaccardVals>0.2], bins=40)

# 2. Preprocessing of data, shingling, and minhashing to generate a signature matrix using the news_articles_small.csv dataset.

import libraries

get content

In [None]:
# Raw article texts in dataframe order. No lowercasing, contraction rewriting
# or punctuation stripping is applied here.
articleList = list(df['article'])

print(articleList[0])

In [None]:
# Parameters shared by the MinHash / LSH cells below.
N_GRAM = 3      # shingle size
M_LENGTH = 40   # signature length: number of permutations/hash functions
BANDS = 10      # number of bands the signature is split into
print("Rows/band =", M_LENGTH // BANDS)

In [None]:
# Create MinHash object.
# A fixed seed makes the signatures reproducible. The same seed also has to be
# used for any MinHash that is later added to an LSH model built from this one,
# otherwise the new signatures come from different hash functions.
MINHASH_SEED = 123
minhash = MinHash(articleList, n_gram=N_GRAM, n_gram_type='term', permutations=M_LENGTH, seed=MINHASH_SEED)

In [None]:
# Create LSH model.
# Each article is labelled with its position in articleList; the signatures
# are split into BANDS bands.
lsh = LSH(minhash, range(len(articleList)), no_of_bands=BANDS)

In [None]:
# Pairs reported by the LSH model with min_jaccard=0.7, each with its Jaccard weight.
results = lsh.edge_list(min_jaccard=0.7, jaccard_weighted=True)

print(len(results), "near duplicates found")
print("DOC1", "DOC2", "JACCARD")
for first_label, second_label, jaccard_value in results:
    # Two-space separator between the columns.
    print(first_label, second_label, jaccard_value, sep="  ")

In [None]:
# test doc contains 3 sentences from docs 0, 1 and 2
plagiarism_doc="Jorge Sosa won for the sixth time as the New York Mets snapped a four-game losing streak with a 3-0 victory over Detroit on Friday night. Sinn Fein, the Irish Republican Army's political wing, has no place in Northern Ireland politics, US Senator Ted Kennedy said Tuesday, explaining his refusal to meet this week with Gerry Adams, the group's leader. As awful as the news of priests forcing sex on altar boys is, to many of the faithful who sit in a pew each Sunday, the reaction of Roman Catholic Church leaders is even more shocking."
# Reuse the seed of the MinHash that built the index, so the new signature is
# produced by the same hash functions and can actually land in the same
# buckets. If that MinHash was created without a seed this is None, which is
# the same as not passing a seed.
new_minhash = MinHash([plagiarism_doc], n_gram=N_GRAM, n_gram_type='term', permutations=M_LENGTH,
                      seed=getattr(minhash, "seed", None))
lsh.update(new_minhash, ["plagiarized_doc"])


In [None]:
# Same listing as before with min_jaccard lowered to 0.4.
results = lsh.edge_list(min_jaccard=0.4, jaccard_weighted=True)

print(len(results), "near duplicates found")
print("DOC1", "DOC2", "JACCARD")
for first_label, second_label, jaccard_value in results:
    # Two-space separator between the columns.
    print(first_label, second_label, jaccard_value, sep="  ")

# Show what the model's contains() call reports.
print(lsh.contains())