In [None]:
# Data Analytics Project 1 - Christian Altrichter, Francesco Huber 
# Project Nr. 15 Suspicious Passages

# ALL CODE WRITTEN AND PRODUCED BY THE TEAM

In [None]:
# Imports
import csv
import kshingle as ks
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import reduce_lines as rd
import re
import string

from datasketch import MinHashLSH, MinHash, MinHashLSHForest
from nltk.stem.porter import *

## CORE COMPUTATIONS

In [None]:
# CORE COMPUTATIONS - Core statistical computation on the text files

def get_paths_from_current_dir(input_path, ending = "txt"):
    """
        Recursively collects the paths of all files below a directory whose name ends with a given suffix.

    Args:
        input_path (String): Root directory to walk (sub-directories are included).
        ending (String): Suffix a file name must end with to be collected. This is a plain
            ``str.endswith`` check, so the default "txt" carries no leading dot.

    Returns:
        list: sorted list of the matching file paths (the list is also printed).
    """
    matching_paths = sorted(
        os.path.join(folder, file_name)
        for folder, _subfolders, file_names in os.walk(input_path, topdown=False)
        for file_name in file_names
        if file_name.endswith(ending)
    )
    print(matching_paths)
    return matching_paths


def text_processor(data_file):
    """
        Collects basic statistics from a text that is supplied line by line.

    Args:
        data_file (iterable of String): The lines of the text, e.g. an open text file.

    Returns:
        int, set, list, set, list, string: number of lines, unique vocabulary, full vocabulary,
        unique characters, all characters (line breaks included) and the text itself as one
        lower-cased string.

    Note:
        The text is rebuilt by prefixing every stripped line with a single space and the
        vocabulary is obtained by splitting that (not lower-cased) text on single spaces.
        The vocabulary is therefore case sensitive and contains empty strings for the
        leading space, for empty lines and for repeated spaces.
    """
    stripped_lines = []
    characters = []
    # Single pass: data_file may be a one-shot iterator such as a file handle
    for line in data_file:
        characters.extend(line)
        stripped_lines.append(line.strip())

    line_counter = len(stripped_lines)
    unique_char = set(characters)

    text = "".join(" " + stripped for stripped in stripped_lines)
    vocabulary = text.split(" ")
    unique_vocab = set(vocabulary)
    return line_counter, unique_vocab, vocabulary,  unique_char, characters, text.lower()


def process_all(root_dir):
    """ 
        Runs text_processor on every text file below root_dir and summarises the results.
    
    Args:
        root_dir (String): Root source directory whose text files are processed.
        
    Returns:
        pandas.DataFrame, list: a data frame with one row of statistics per file (file name,
        line count and the sizes of the vocabulary / character collections) and a list with the
        raw per-file data in the order
        [file_name, line_counter, unique_vocab, vocabulary, unique_char, characters, text].
    """
    list_of_paths = get_paths_from_current_dir(root_dir)
    print("process all order", list_of_paths)
    columns = ['File Name', "Line Counter", "Length of unique vocabulary", "Length of all vocabulary", "Length of unique characters", "Length of all characters"]
    doc_raw = []
    summary_rows = []

    for elem in list_of_paths:
        # File name without directory and without anything after the first dot
        file_name = os.path.basename(elem).split(".")[0]
        # Context manager so every file handle is closed as soon as it has been read
        with open(elem) as text_file:
            line_counter, unique_vocab, vocabulary,  unique_char, characters, text = text_processor(text_file)
        doc_raw.append([file_name, line_counter, unique_vocab, vocabulary,  unique_char, characters, text])
        summary_rows.append([file_name, line_counter, len(unique_vocab), len(vocabulary), len(unique_char), len(characters)])

    # Build the frame once instead of growing it row by row
    doc_info = pd.DataFrame(summary_rows, columns=columns)

    return doc_info, doc_raw

def populate_CSV(dataFrame, fileName):
    """
        Writes the data frame to a CSV file (e.g. for later visualization in Tableau)

    Args:
        dataFrame (pandas.DataFrame): A populated data frame with all texts information
        fileName (String): Target path without extension; ".csv" is appended to it
    """
    target_file = fileName + ".csv"
    dataFrame.to_csv(target_file)
    


# EXAMPLE EXECUTION:
# Below is a sample core computation execution that will give you the basic statistical analysis of the text files
# NOTE: THE INPUT PATH MUST BE UPDATED TO THE RESPECTIVE CORPUS OF SUSPICIOUS PASSAGES

# input_path_original = "./31.Suspicious_Passages/Corpus"
# data_frame_original_data, raw_orig = process_all(input_path_original)


# dataframe_list = []
# dataframe_list.append(data_frame_original_data)


# rawdata_list = []
# rawdata_list.append(raw_orig)


# populate_CSV(data_frame_original_data, "clean_files_0")

# print("Statistical analysis complete")

# df1 = pd.read_csv('clean_files_0.csv')
# print(df1.describe())
    
    

## REDUCE LINES

In [None]:
# REDUCE LINES

# Clean files of unnecessary passages (e.g. table of contents, tables etc.) based on the average word count per line.
# Thus, as example - if we reduce the text by 0.5, then we remove all lines in which the word count is 50% less than 
# the mean count of word count per line


def get_and_process_text(text_name):
    """
    Opens a text file, removes punctuation, replaces tabs and collapses runs of whitespace,
    then records every line together with its character count.
    :param text_name: string, path of the source text
    :return: lines: a list of the cleaned, lower-cased lines
            line_count: a list of integers, the number of characters of each cleaned line
    """
    lines = []
    line_count = []
    # Build the punctuation-removal table once instead of once per line
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Context manager so the file handle is always released
    with open(text_name) as text:
        for line in text:
            line = line.translate(punctuation_table)
            line = line.replace('\t', ' ')
            # Raw string: "\s" inside a normal literal is an invalid escape sequence
            line = re.sub(r"\s\s+", " ", line)
            lines.append(line.lower())
            line_count.append(len(line))

    return lines, line_count


def get_frame(line_count, mean_amount=1.0):
    """
    Produces a DataFrame that keeps only the lines passing our length threshold
    :param line_count: a list of integers (one count per line)
    :param mean_amount: float, multiplier applied to the mean count to obtain the threshold

    :return: edited_frame: a dataframe with a "char_count" column holding only the rows whose
             count is strictly greater than mean * mean_amount; the original row index is kept
             so the rows can be mapped back to their line numbers
    """
    frame = pd.DataFrame(data=line_count, columns=["char_count"])
    threshold = frame["char_count"].mean() * mean_amount
    edited_frame = frame[frame.char_count > threshold]
    return edited_frame


def plot_frame(frame_name, fig_size=None):
    """
    Shows a bar chart with one bar per row of the frame's "char_count" column
    :param frame_name: DataFrame with a "char_count" column (e.g. the result of get_frame)
    :param fig_size: tuple(int, int), size of the plotted figure; None uses the default size
    """
    plt.figure(figsize=fig_size)
    char_counts = frame_name['char_count'].values
    bar_positions = np.arange(len(char_counts))
    plt.bar(bar_positions, char_counts, align='center', alpha=1, width=1.0)
    plt.show()


def extract_text(text_name, mean_amount):
    """
    Builds the reduced version of one text: cleans the source, keeps only the lines that pass
    the threshold of get_frame and writes them to a new file in the current working directory
    :param text_name: string, source file name
    :param mean_amount: float, mean multiplier
    :return: file, the (already closed) file object of the written file; its .name holds
             the new file name
    """
    line_content, char_amount = get_and_process_text(text_name)
    kept_lines = get_frame(char_amount, mean_amount)

    # The frame index still refers to the original line numbers
    text = ''.join(line_content[line_id] for line_id in kept_lines.index)

    # Last five characters in front of the extension, i.e. the document number
    number_name = text_name.split('.')[-2][-5:]
    percent_label = str(int(mean_amount * 100))
    new_file_name = number_name + '_clean_' + percent_label + '%_mean.txt'
    with open(new_file_name, 'w') as file:
        file.write(text)
    return file


def generate_files(mean_average, input_path):
    """
        Produces the folder 'clean_files_<percent>%_mean' in the current working directory and
        fills it with the reduced version of every .txt file found below input_path.
    
    Args:
        mean_average: float, mean multiplier
        input_path: string, root directory of the corpus to be cleaned
    """
    dir_name = 'clean_files_' + str(int(mean_average * 100)) + '%_mean'
    # exist_ok so that re-running the cell does not fail on the already existing folder
    os.makedirs(dir_name, exist_ok=True)
    for root, _, files in os.walk(input_path, topdown=False):
        for name in files:
            if name.endswith(".txt"):
                # Join with the folder currently walked; joining with the corpus root
                # produced wrong paths for files located in sub-folders
                full_name = os.path.join(root, name)
                file = rd.extract_text(full_name, mean_average)
                # os.replace also overwrites the result of a previous run on Windows
                os.replace(file.name, os.path.join(dir_name, file.name))
                

# EXAMPLE EXECUTION:
# Below is a sample file cleaning execution that will give you three adjusted corpora based on the percentile mean reduction
# NOTE: THE INPUT PATH MUST BE UPDATED TO THE RESPECTIVE CORPUS OF SUSPICIOUS PASSAGES

# input_path = "./31.Suspicious_Passages/Corpus"
    
# generate_files(.25, input_path)
# generate_files(.5, input_path)
# generate_files(.75, input_path)

## CLEAN FILES GENERATOR

In [None]:
# CLEAN FILE GENERATOR

def generate_files(mean_average, path):
    """
        Produces the folder 'clean_files_<percent>%_mean' in the current working directory and
        fills it with the reduced version of every .txt file found below path.

        NOTE: the notebook defines generate_files twice; this later definition is the one in
        effect once all cells have been run.
    
    Args:
        mean_average: float, mean multiplier
        path: string, root directory of the corpus to be cleaned
    """
    dir_name = 'clean_files_' + str(int(mean_average * 100)) + '%_mean'
    # exist_ok so that re-running the cell does not fail on the already existing folder
    os.makedirs(dir_name, exist_ok=True)
    for root, _, files in os.walk(path, topdown=False):
        for name in files:
            if name.endswith(".txt"):
                # Join with the folder currently walked; joining with the corpus root
                # produced wrong paths for files located in sub-folders
                full_name = os.path.join(root, name)
                file = rd.extract_text(full_name, mean_average)
                # os.replace also overwrites the result of a previous run on Windows
                os.replace(file.name, os.path.join(dir_name, file.name))



# EXAMPLE EXECUTION:
# Below is a sample clean file generator execution
# NOTE: THE INPUT PATH MUST BE UPDATED TO THE RESPECTIVE CORPUS OF SUSPICIOUS PASSAGES      

# path = "31.Suspicious_Passages/corpus"

# generate_files(.25, path)
# generate_files(.5, path)
# generate_files(.75, path)


## SIMILARITY COMPUTATION - THRESHOLD

In [None]:
# SIMILARITY COMPUTATION - BINARY OUTPUT AND FOREST OUTPUT BASED ON THRESHOLD


def stemm_vocabulary(rawdata_list):
    """Stems every word of a vocabulary list with the Porter stemmer; the result is
        then passed on to the shingling.
    Args:
        rawdata_list (list of strings): the vocabulary of one document, e.g. the third value
            returned by text_processor
    Returns:
        list: a new list with the stemmed form of every entry, in the same order
    """
    stemmer = PorterStemmer()
    # The comprehension already builds a new list, so the input needs no defensive copy
    return [stemmer.stem(vocab) for vocab in rawdata_list]


def k_shingle(stemmed_voc, k):
    
    """
    Creates a shingle set of size k from a list of strings
    Args:
        stemmed_voc (list of strings): a list of stemmed vocabulary
        k (int): shingle size, used as both n_min and n_max of ks.shingleset_range
    Returns:
         Set: set of shingles as produced by ks.shingleset_range
    """
    # Recreate the text from the stemmed vocabulary. The result of strip() has to be kept:
    # the previous bare `txt.strip()` call discarded it and left the leading space in place.
    txt = " ".join(stemmed_voc).strip()
    shingles = ks.shingleset_range(txt, n_min=k, n_max=k)

    return shingles


def shingle_set(rootpath, k):
    """
    Obtains the text files below a path and
    produces one set of shingles per file
    :param rootpath: (string) the path of a directory
    :param k: (integer) the size of every shingle
    :return shingle_all: (list of sets of strings) one shingle set per document, in the order
            returned by get_paths_from_current_dir
    """
    list_of_paths = get_paths_from_current_dir(rootpath)
    shingle_all = []

    for elem in list_of_paths:
        # Context manager so every file handle is closed once the text has been read
        with open(elem) as text_file:
            _, _, vocabulary,  _, _, _ = text_processor(text_file)
        # Stem and shingle right away, so only one stemmed vocabulary is held in memory
        stemmed_vocab = stemm_vocabulary(vocabulary)
        shingle_all.append(k_shingle(stemmed_vocab, k))

        print("In shingle set - this will take some time")

    return shingle_all
    

def get_min_hashes(shingle_list, nperm=128):
    """
    Builds one MinHash per document and feeds it with the document's shingles
    :param shingle_list: (list of collections of shingles) one collection of strings per document
    :param nperm: (integer) number of permutations every MinHash is created with
    :return min_hashes: (list of MinHash Objects) one MinHash per entry of shingle_list, same order
    """
    min_hashes = [MinHash(num_perm=nperm) for _ in shingle_list]

    for signature, document_shingles in zip(min_hashes, shingle_list):
        for shingle in document_shingles:
            signature.update(shingle.encode('utf8'))

        print("In minHashed - this will take some time")

    return min_hashes


def get_lsh(shingle_list, hash_list=None, threshold=.8, num_perms=128, list_names = None):
    """
    Produces, initializes and gets the LSH Object according to the arguments
    :param shingle_list: (list of sets of shingles) only read when hash_list is None
    :param hash_list: (list of MinHashes) precomputed hashes; computed from shingle_list if None
    :param threshold: (float) threshold amount for similarity
    :param num_perms: (integer) number of permutations
    :param list_names: (list of strings) key under which each hash is inserted; defaults to
           "m0", "m1", ... A given name that is exactly "m" is replaced by "m<position>".
    :return lsh: (LSH Object) the LSH Object
    :return min_hashes: (list of MinHash Objects) a list of MinHashes
    """
    min_hashes = hash_list

    # If not provided with min hashes, derive them from the shingles
    if min_hashes is None:
        min_hashes = get_min_hashes([list(x) for x in shingle_list], num_perms)

    lsh = MinHashLSH(threshold, num_perm=num_perms)

    # Work on a copy: placeholder names are rewritten below and that must not
    # modify the list owned by the caller
    if list_names is not None:
        names = list(list_names)
    else:
        names = ["m"] * len(min_hashes)

    for i, my_hash in enumerate(min_hashes):
        if names[i] == 'm':
            names[i] = 'm' + str(i)
        lsh.insert(names[i], my_hash)

        print("In get LSH - this will take some time")

    return lsh, min_hashes


def get_lsh_data_binary(directory, threshold=.5, shingle_size=5, list_names = None, num_perms=128):
    """
    Shingles every text file of a directory and builds the LSH object from those shingles
    :param directory: (string) the path of the directory to be explored
    :param threshold: (float) threshold amount for similarity
    :param shingle_size: (integer) the size of every shingle
    :param list_names: (list of strings) names under which the documents are inserted into the LSH
    :param num_perms: (integer) number of permutations
    :return my_lsh: (LSH Object) the lsh Object
    :return my_hashes: (list of MinHash Objects) the list of all MinHashes used for later querying
    """
    document_shingles = shingle_set(directory, shingle_size)
    lsh_index, document_hashes = get_lsh(
        shingle_list=document_shingles,
        threshold=threshold,
        num_perms=num_perms,
        list_names=list_names,
    )
    return lsh_index, document_hashes


def compute_plagiarism_binary(directory, thresh = 0.3, shingle_size=7, num_perms=128 ):
    """Queries the LSH index with every document of the corpus and reports, per document,
    which documents it is similar to and whether it is flagged as suspicious.

    Two CSV files are written. Their names are built by appending a suffix directly to the
    `directory` string, so they end up next to the directory rather than inside it unless
    `directory` ends with a path separator.

    Args:
        directory (string): takes a directory to all the documents to be compared
        thresh (float, optional): similarity threshold handed to the LSH. Defaults to 0.3.
        shingle_size (int, optional): size of every shingle. Defaults to 7.
        num_perms (int, optional): number of MinHash permutations. Defaults to 128.

    Returns:
        pandas.DataFrame, pandas.DataFrame, string: the frame listing the query result of every
        document, the binary frame (1 if the query returned more than one entry, else 0) and
        the path of the CSV file the binary frame was written to.
    """
    corpus_paths = get_paths_from_current_dir(directory)
    names = [path.split("/")[-1].split(".")[0] for path in corpus_paths]
    my_lsh, my_hashes = get_lsh_data_binary(directory, threshold=thresh, list_names = names, shingle_size=shingle_size, num_perms=num_perms)

    # One row per document: its name and everything the LSH returns for its hash
    query_rows = [[names[i], my_lsh.query(signature)] for i, signature in enumerate(my_hashes)]

    result_column = "Similar Files - Threshold"
    scaled_thresh = str(thresh*10)
    run_suffix = '_Ksize' + str(shingle_size) + "_nperm" + str(num_perms) +'.csv'

    # DataFrame for LSH
    df_LSH_Binary = pd.DataFrame(query_rows, columns=["DocumentName", result_column])
    df_LSH_Binary.to_csv(directory + 'LSHThreshold' + scaled_thresh + run_suffix)

    # Keep the full query results before the column is reduced to 0 / 1
    df_LSH_Threshold = df_LSH_Binary.copy()
    df_LSH_Binary[result_column] = df_LSH_Binary[result_column].apply(
        lambda matches: int(len(matches) > 1))

    binary_file_name = directory + 'LSHbinary' + '_T0' + scaled_thresh + run_suffix
    df_LSH_Binary.to_csv(binary_file_name)

    separator = "-"*50
    print("DATA FRAME BASED ON THRESHOLD")
    print(df_LSH_Threshold)
    print(separator)
    print("DATA FRAME BASED ON THRESHOLD - Binary")
    print(df_LSH_Binary)
    print(separator)

    return df_LSH_Threshold, df_LSH_Binary, binary_file_name

 

# EXAMPLE EXECUTION:
# Below is a sample compute plagiarism execution that will give you the LSH output in a binary format and in a forest format based on a threshold
# NOTE: THE INPUT PATH MUST BE UPDATED TO THE RESPECTIVE CORPUS OF SUSPICIOUS PASSAGES

# input_path = "./31.Suspicious_Passages/Corpus"
    
# compute_plagiarism_binary(input_path, thresh = 0.3, shingle_size = 8, num_perms = 128)

## SIMILARITY COMPUTATION - TOP-K

In [None]:
# SIMILARITY COMPUTATION - BINARY OUTPUT AND FOREST OUTPUT BASED ON TOP-K AND THRESHOLD

def get_lsh_forest(hash_list, num_perms=128, list_names=None):
    """
    Produces, initializes and gets the LSHForest Object according to the arguments
    :param hash_list: (list of MinHashes)
    :param num_perms: (integer) number of permutations
    :param list_names: (list of strings) key under which each hash is added; defaults to
           "m0", "m1", ...
    :return lsh: (LSHForest Object) the indexed LSHForest Object
    """
    lsh = MinHashLSHForest(num_perm=num_perms)

    # Identity check: `== None` would be evaluated element-wise for array-like inputs
    if list_names is None:
        names = ["m" + str(i) for i in range(len(hash_list))]
    else:
        names = list_names

    for i, my_hash in enumerate(hash_list):
        lsh.add(names[i], my_hash)
    # index() is called once after all keys have been added, before the forest is returned
    lsh.index()

    return lsh


def get_lsh_data(directory="test_dir", threshold=.5, shingle_size=3, n_perm=128):
    """
    Getter method to obtain the lsh Objects created based off of the directory and specified threshold
    :param directory: (string) the path of the directory to be explored
    :param threshold: (float) threshold amount for similarity
    :param shingle_size: (integer) the size of every shingle
    :param n_perm: (integer) number of permutations
    :return my_lsh: (LSH Object) the lsh Object
    :return my_lsh_forest: (LSHForest Object) the lshForest Object
    :return my_hashes: (list of MinHash Objects) the list of all MinHashes used for later querying
    :return names: (list of strings) the document names used as keys in both indexes
    """

    names = [x.split("/")[-1].split(".")[0] for x in get_paths_from_current_dir(directory)]
    print("names", names)
    # shingle_set takes (rootpath, k) and returns a single list; the previous call passed a
    # third argument and unpacked two results, which raised a TypeError
    shingles = shingle_set(directory, shingle_size)  # Get Shingles
    my_lsh, my_hashes = get_lsh(shingle_list=shingles, threshold=threshold, num_perms=n_perm,
                                list_names=names)  # Get lsh and common hashes
    my_lsh_forest = get_lsh_forest(hash_list=my_hashes, num_perms=n_perm,
                                   list_names=names)  # Get lsh forest with hashes

    return my_lsh, my_lsh_forest, my_hashes, names


# EXAMPLE EXECUTION:
# Below is a sample LSH Forest execution that will give you the LSH output in a forest format based on a given top k and threshold
# NOTE: THE INPUT PATH MUST BE UPDATED TO THE RESPECTIVE CORPUS OF SUSPICIOUS PASSAGES

# input_path = "./31.Suspicious_Passages/corpus"

# my_lsh, my_lsh_forest, my_hashes, names = get_lsh_data(input_path, threshold=.6, shingle_size=4,
#                                                 n_perm=128)  # Get lsh, forest and hashes

# LSH_Model = my_lsh
# LSH_FOREST_Model = my_lsh_forest
# print("regular LSH:", my_lsh.query(my_hashes[0]))           # Get documents above threshold
# print("forest LSH:", my_lsh_forest.query(my_hashes[0], 2))  # Get k document close to hash[x]

# print("Similar Elements based on Threshold of .6 and shingle size 4")
# for i in range(len(my_hashes)):
#     print(names[i], "-> Similar Items ->", my_lsh.query(my_hashes[i]))



## EVALUATION

In [None]:
# EVALUATION

# Location of the ground truth TSV that ships with the corpus.
# IF A DIFFERENT CORPUS IS USED, POINT THIS TO THE MATCHING GROUND TRUTH FILE.
# NOTE: pop_confusion_metrics carries the same path as the default of its own gt_path
# parameter and does not read this variable.
ground_truth_path = "./31.Suspicious_Passages/ground_truth.tsv"

def get_ground_truth(gt_path):
    """ Reads the tab-separated ground truth file and returns its rows as an array

    Args:
        gt_path (String): path to the ground truth TSV; its first line is read as the header

    Returns:
        numpy.ndarray: one row per line of the file, holding the values of its columns
        (the callers in this notebook use column 0 as file name and column 1 as the
        plagiarism flag, 1 if plagiarised).
    """
    return pd.read_csv(gt_path, sep='\t').to_numpy()


def pop_confusion_metrics(tt_path, gt_path = "./31.Suspicious_Passages/ground_truth.tsv"):
    """
    Populates the confusion matrix (true positive, false positive, false negative, true negative)
    by comparing the predicted flags with the ground truth.

    Args:
        tt_path (String): path to the binary prediction CSV (the "test truth")
        gt_path (String): path to the ground truth TSV it is compared against

    Returns:
        int, int, int, int: returns true positive, false positive, false negative, true negative

    Raises:
        KeyError: if a document of the ground truth has no entry in the predictions
    """
    # Dictionaries give constant-time lookups while the two sets are compared
    gt_dict = {row[0]: row[1] for row in get_ground_truth(gt_path)}
    # Predictions are keyed by the part of the document name after the last "-"
    test_dict = {row[0].split("-")[-1]: row[1] for row in binarycsv_transform(tt_path)}

    # Keys are (actual, predicted); any pair outside of 0 / 1 is left uncounted
    tally = {(1, 1): 0, (0, 1): 0, (1, 0): 0, (0, 0): 0}
    for doc_name, label in gt_dict.items():
        outcome = (int(label), int(test_dict[doc_name]))
        if outcome in tally:
            tally[outcome] += 1

    TP = tally[(1, 1)]
    FP = tally[(0, 1)]
    FN = tally[(1, 0)]
    TN = tally[(0, 0)]

    for label, value in (("TP is", TP), ("FP is", FP), ("FN is", FN), ("TN is", TN)):
        print(label, value)

    return TP, FP, FN, TN


def binarycsv_transform(path):
    """
    Reads the test truth CSV and returns its second and third column as a list of rows

    Args:
        path (String): path to the CSV file; its first row is treated as header and dropped

    Returns:
        list: one [second column, third column] pair of strings per data row
    """
    with open(path, 'r') as file:
        # The header row is read like every other row and sliced off afterwards
        all_rows = [[row[1], row[2]] for row in csv.reader(file)]

    return all_rows[1:]

def prec_acc_comp(tt_path, gt_path):
    """
    Calculates precision, recall, accuracy and F1 score

    Args:
        tt_path (String): path to the test truth that is compared to the ground truth
        gt_path (String): path to the ground truth

    Returns:
        str, str, str, str: precision, recall, accuracy and F1, each as a percentage rounded
        to two decimals with a trailing '%'. A metric whose denominator is zero (e.g.
        precision when nothing was predicted positive) is reported as '0.0%'.
    """
    TP, FP, FN, TN = pop_confusion_metrics(tt_path, gt_path)

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as 0 instead of raising ZeroDivisionError
        return numerator / denominator if denominator else 0.0

    def _percent(value):
        return str(round(value*100, 2))+'%'

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    acc = _ratio(TP + TN, TP + TN + FP + FN)
    F1 = _ratio(2*precision*recall, recall + precision)
    return _percent(precision), _percent(recall), _percent(acc), _percent(F1)
        
    

# EXAMPLE EXECUTION:
# Below is a sample execution to evaluate the precision, recall and accuracy of our model
# NOTE: THE INPUT PATH MUST BE UPDATED TO THE RESPECTIVE SAVED BINARY OUTPUT FILE

# path = "./Suspicious_Passages/corpusLSHbinary_T03.0_Ksize6_nperm128.csv"

    
# print("03 THRESHOLD, ksize 8, nperm 128")
# prec, rec, acc, f1 = prec_acc_comp(path, ground_truth_path)
# print("|PREC", prec, "|REC:", rec, "|ACC:", acc, "|F1:", f1)
# print(100 * "-")

## CLEAN GROUND TRUTH

In [None]:
# RECOMPUTATION OF GROUND TRUTH

# This section aims to create a better ground truth with respect to files that actually appear in the dataset

# Root folder of the corpus whose ground truth is recomputed
corpus_path = "./31.Suspicious_Passages/corpus"
# Path of one example annotation file (not referenced by the code in this notebook section)
filepath = "./31.Suspicious_Passages/corpus/suspicious-document00014.xml"

# File names (directory stripped, extension kept) of all text files found below corpus_path
dataset_file_names = [doc.split('/')[-1] for doc in get_paths_from_current_dir(corpus_path)]
# Full paths of all files below corpus_path whose name ends with "xml"
xml_path_names = [xml_doc for xml_doc in get_paths_from_current_dir(corpus_path, 'xml')]

# Module-level placeholders for quick access; they start out empty
ground_truth = []
plagiarism_table = []
speculative_list = []

def get_plagiarized_sources(text_name):
    """
    Obtains the list of artificial insertions in the xml file of a document in the corpus
    :param text_name: string corresponding to the path of the xml file
    :return: (list of string) the distinct source file names referenced by the lines that
             mention "artificial-plagiarism" (in no particular order)
    """
    documents = set()

    # Context manager so the file handle is always released
    with open(text_name) as text:
        for line in text:
            if "artificial-plagiarism" in line:
                # Take the token that directly follows source_reference= and drop its quotes
                doc = line.split('source_reference=')[-1].split(' ')[0]
                documents.add(doc.strip('"'))

    return list(documents)


def validate_sources(list_of_sources):
    """
    Reduces a list of sources to those that are present in our dataset
    :param list_of_sources: the sources that the ground truth indicates as plagiarism for a file
    :return: (set of string) the file names that also appear in the module-level
             dataset_file_names list
    """
    return set(list_of_sources) & set(dataset_file_names)

def get_real_ground_truth():
    """
    creates 2 lists representing the new ground truth in the same format as the dataset one,
    and additionally a new list which precisely indicates which files are the source of the plagiarism
    :return: new_ground_truth (list of lists) one [document name, plagiarised index] entry per xml file,
                 the index being 1 if at least one referenced source exists in the dataset, else 0
             actual_plagiarism_table (list of lists) one [document name, list of source names] entry
                 per xml file
    """
    new_ground_truth = []
    actual_plagiarism_table = []

    # sorted() leaves the module-level xml_path_names list untouched
    for xml_path in sorted(xml_path_names):
        # Derive the document name from the xml file itself. Pairing the i-th xml file with the
        # i-th entry of a separately sorted name list mislabels rows as soon as the two listings
        # differ in content or order.
        document_name = os.path.basename(xml_path).split('-')[-1].split('.')[0]

        plagiarized_list = get_plagiarized_sources(xml_path)
        validated_list = validate_sources(plagiarized_list)
        # validate_sources returns a set; sort the names so the output is reproducible
        source_names = sorted(x.split('/')[-1].split('-')[-1].split('.')[0] for x in validated_list)

        actual_plagiarism_table.append([document_name, source_names])
        new_ground_truth.append([document_name, 1 if len(validated_list) > 0 else 0])

    return new_ground_truth, actual_plagiarism_table

def get_speculative_ground(plagiarism_table):
    """
    Creates a ground truth that flags plagiarism on both elements, source and document,
    to better represent our current reasoning for dealing with candidate pairs
    :param plagiarism_table: list of [file name, list of source names] entries
    :return: dict_files (dictionary<string, integer>) a dictionary where every key is a file name
        and the value is the corresponding plagiarism index (1 if the file has at least one source
        or is itself listed as a source, else 0)
    """
    dict_files = {}

    # First round: flag every document that has at least one source
    for elem in plagiarism_table:
        dict_files[elem[0]] = 1 if len(elem[1]) > 0 else 0

    # Second round: flag the sources themselves. This has to happen after the first round,
    # otherwise a source that is also listed as a document would be reset to 0.
    for elem in plagiarism_table:
        for source in elem[1]:
            dict_files[source] = 1

    return dict_files


# EXAMPLE EXECUTION:
# No sample execution as the computed ground truths are in the submission folder. 

# The function was called through the following method and the three tables were computed / created:

def execute():
    """
    Recomputes the ground truths and writes them as TSV files into the current directory:
    actual_ground_truth.tsv, plagiarism_links.tsv and speculative_ground_truth.tsv

    Returns:
        list: the plagiarism table ([document name, list of source names] entries)
    """
    actual_ground_truth, plagiarism_table = get_real_ground_truth()

    gt_df = pd.DataFrame(actual_ground_truth, columns=["doc_name", "plagiarism"])
    gt_df.to_csv("actual_ground_truth.tsv", index=False, sep='\t')

    pl_df = pd.DataFrame(plagiarism_table, columns=["doc_name", "plagiarism links"])
    pl_df.to_csv("plagiarism_links.tsv", index=False, sep='\t')

    dict_gt = get_speculative_ground(plagiarism_table)
    # Build the rows directly instead of using a comprehension only for its append side effect
    speculative_list = [[doc_name, flag] for doc_name, flag in dict_gt.items()]

    speculative_gt = pd.DataFrame(speculative_list, columns=["doc_name", "plagiarism"])
    speculative_gt.to_csv("speculative_ground_truth.tsv", index=False, sep='\t')

    return plagiarism_table


## GROUND TRUTH EVALUATION

In [None]:
# Ground truth evaluation

def actual_gt_eval(gt_path):
    """
    Prints how many entries of a ground truth file belong to class 1 (plagiarised) and
    which share of the distinct file names that is.

    Args:
        gt_path (String): path to the tab-separated ground truth file; column 0 is read as
            the file name and column 1 as the class

    Returns:
        None: the two figures are only printed. For a ground truth without any rows the
        share is printed as 0.0.
    """
    gt = pd.read_csv(gt_path, sep='\t').to_numpy()

    distinct_names = set()
    counter_class1 = 0

    for elem in gt:
        distinct_names.add(elem[0])
        if int(elem[1]) == 1:
            counter_class1 += 1

    # Guard the division: an empty ground truth used to raise ZeroDivisionError
    share_class1 = counter_class1 / len(distinct_names) if distinct_names else 0.0

    print("Class 1", counter_class1)
    print("'%' class 1", share_class1)
    
# EXAMPLE EXECUTION:
# Below is a sample execution to evaluate the actual ground truth
# NOTE: the path to the ground truth must be adapted to the local directory

# gt_path = "./actual_ground_truth.tsv"
# actual_gt_eval(gt_path)


## EXECUTABLE

In [None]:
def executable(threshold, k_shingle, num_permutations):
    """
    Runs the binary plagiarism detection on the corpus and prints precision, recall,
    accuracy and F1 of the result against the given, the actual and the speculative
    ground truth.

    Args:
        threshold (float): similarity threshold handed to compute_plagiarism_binary
        k_shingle (int): shingle size handed to compute_plagiarism_binary
        num_permutations (int): number of MinHash permutations
    """
    _, _, binary_file_name = compute_plagiarism_binary(
        "./31.Suspicious_Passages/corpus",
        thresh = threshold,
        shingle_size = k_shingle,
        num_perms = num_permutations,
    )

    # (headline, ground truth file) of every comparison, in reporting order
    comparisons = (
        ("COMPARED TO GIVEN GROUND TRUTH", "./Ground_Truths/ground_truth.tsv"),
        ("COMPARED TO ACTUAL GROUND TRUTH", "./Ground_Truths/actual_ground_truth.tsv"),
        ("COMPARED TO SPECULATIVE GROUND TRUTH", "./Ground_Truths/speculative_ground_truth.tsv"),
    )

    for headline, ground_truth_file in comparisons:
        print(headline)
        print(threshold, " THRESHOLD", " ksize: ", k_shingle, " nperm:", num_permutations)
        prec, rec, acc, f1 = prec_acc_comp(binary_file_name, ground_truth_file)
        print("|PREC", prec, "|REC:", rec, "|ACC:", acc, "|F1:", f1)
        print(100 * "-")

# Run the whole pipeline with threshold 0.3, shingle size 8 and 128 permutations.
# The corpus and ground truth paths are hard-coded inside executable().
executable(0.3, 8, 128)