In [None]:
# Used packages
from preprocess import preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import os
import seaborn as sns
from preprocess import preprocessing
from preparation import preparation

In [None]:
def print_confussion_matrix(y_test, y_pred):
    '''
    Renders the confusion matrix of our system results as an annotated heatmap

    Receives a vector with the actual results (y_test) and another one with the
    system results (y_pred)
    Does not return anything but it displays the confusion matrix
    '''
    matrix = confusion_matrix(y_test, y_pred)

    # heatmap hands back the axes it drew on, so the labels are set on it directly
    heatmap_axes = sns.heatmap(matrix, annot=True, fmt="d", cmap="crest")
    heatmap_axes.set_title("Confusion matrix")
    heatmap_axes.set_ylabel('True class')
    heatmap_axes.set_xlabel('Predicted class')
    plt.show()

def build_word_frequency_histogram(preprocessed_str):
    '''
    Counts how often each whitespace-separated word appears in a string

    Receives a preprocessed text
    Returns a dictionary mapping every word to its number of occurrences,
    with the keys in order of first appearance
    '''
    counts = {}
    for token in preprocessed_str.split():
        # Unseen words start from 0, so the first sighting is stored as 1
        counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
def create_subplots(maxhistograms, maxfilename, fpr, tpr, actual_results, system_results):
    '''
    Displays one bar chart per histogram, the ROC curve and the confusion matrix

    Receives:
        maxhistograms: list of word -> count dictionaries, one per bar chart
        maxfilename: list of file names used in the chart titles, indexed like maxhistograms
        fpr, tpr: false / true positive rates of the ROC curve
        actual_results, system_results: label vectors forwarded to print_confussion_matrix
    Does not return anything but it displays all the histograms, ROC Curve and Confusion Matrix
    '''
    # One row per histogram plus a last row for the ROC curve. squeeze=False keeps
    # the result an array even when that last row is the only one.
    fig, axs = plt.subplots(len(maxhistograms) + 1, figsize=(10, 70), squeeze=False)
    axs = axs.ravel()

    for i, histogram in enumerate(maxhistograms):
        # First 20 distinct words in the dictionary's insertion order
        # (not the 20 most frequent ones)
        categories = list(histogram.keys())[:20]
        frequencies = list(histogram.values())[:20]

        ax = axs[i]
        ax.bar(categories, frequencies)
        ax.set_xlabel('Categories')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Word frequency histogram for {maxfilename[i]}')
        ax.tick_params(axis='x', rotation=90)

    # Draw the ROC curve on the last row explicitly instead of relying on
    # whichever axes pyplot currently considers active
    roc_ax = axs[-1]
    roc_auc = metrics.auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    roc_ax.plot([0, 1], [0, 1], 'r--')
    roc_ax.legend(loc='lower right')
    roc_ax.set_title('ROC curve')
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")

    # Lay the figure out once every title and tick label exists, so that
    # their size is taken into account
    fig.tight_layout(pad=10)
    plt.show()
    print_confussion_matrix(actual_results, system_results)

In [None]:
# Detected matches, filled in by decision(): maps a suspicious file name to a
# list of [original file name, unigram result, trigram result] entries
dictPlag = dict()

# Actual results of our tests, one label per suspicious text. 1 = Plagiarism, 0 = Original
actual_results = [
    1, 1, 0, 0, 1,
    1, 0, 0, 0, 0,
    0, 0, 0, 0, 1,
]

# Labels found by our system; decision() appends one entry per suspicious text
system_results = list()

def _preprocess_directory(directory):
    '''
    Lists the regular files of a directory and runs preprocessing() on each of them

    Returns the file names and their preprocessed texts as two parallel lists
    '''
    file_names = [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]
    processed_texts = [preprocessing(os.path.join(directory, name)) for name in file_names]
    return file_names, processed_texts


def decision(filePathOriginals, filePathSuspicious, unigram_threshold=0.15, trigram_threshold=0.15):
    '''
    Main decision function

    Compares every suspicious text against every original text and flags the
    suspicious texts that match at least one original

    Receives:
        filePathOriginals: directory holding the original texts
        filePathSuspicious: directory holding the texts to check
        unigram_threshold, trigram_threshold: a pair of texts counts as plagiarism
            only when both results of preparation() are strictly above these values
    Does not return anything. It refills the module-level dictPlag and
    system_results, prints the ROC metrics and displays the plots through
    create_subplots
    '''
    # Start from a clean slate so that running the function again does not pile
    # new predictions on top of old ones (there must be one prediction per label)
    dictPlag.clear()
    system_results.clear()

    # NOTE(review): os.listdir returns the entries in arbitrary order while
    # actual_results is a fixed list -- confirm the labels line up with it.

    # Preprocessing original texts
    original_texts, processed_original_texts = _preprocess_directory(filePathOriginals)

    print("Starting plagiarism detection...")
    print("\n")

    # Preprocessing suspicious texts
    suspicious_texts, processed_suspicious_texts = _preprocess_directory(filePathSuspicious)

    # Histograms and file names of the flagged texts, kept in step with each other
    flagged_histograms = []
    flagged_filenames = []

    # Comparing each suspicious text with all the original texts
    for suspicious_name, processed_suspicious_text in zip(suspicious_texts, processed_suspicious_texts):
        print("Suspicious text: ", suspicious_name)
        plagiarized_check = False

        for original_name, processed_original_text in zip(original_texts, processed_original_texts):
            unigram_result, trigram_result = preparation(processed_suspicious_text, processed_original_text)

            # Check if plagiarism is detected
            if unigram_result > unigram_threshold and trigram_result > trigram_threshold:
                print("Plagiarism detected in file: ", original_name)
                print("⚠️ ⚠️ ⚠️ ⚠️ ⚠️ ⚠️")
                print("\n")

                # Key the match by the text being checked right now
                dictPlag.setdefault(suspicious_name, []).append(
                    [original_name, unigram_result, trigram_result]
                )
                plagiarized_check = True

        # Exactly one prediction per suspicious text: 1 = Plagiarism, 0 = Original
        system_results.append(1 if plagiarized_check else 0)

        if plagiarized_check:
            # The histogram only depends on the suspicious text, so build it once
            histogram = build_word_frequency_histogram(processed_suspicious_text)
            # Texts without words are skipped together with their name so that
            # chart titles cannot drift away from their histograms
            if histogram:
                flagged_histograms.append(histogram)
                flagged_filenames.append(suspicious_name)

    # Print results
    print(f'Predicted Results: {system_results}')
    print(f'Actual Results: {actual_results}')
    fpr, tpr, _ = metrics.roc_curve(actual_results, system_results, pos_label=1)
    print("False Positive Rate: ", fpr)
    print("True Positive Rate: ", tpr)
    print("AUC:", metrics.auc(fpr, tpr))

    create_subplots(flagged_histograms, flagged_filenames, fpr, tpr, actual_results, system_results)
    


In [None]:
# Run the detector: original texts are read from "original_files" and the
# texts to check from "suspicious_files"
decision("original_files", "suspicious_files")
