In [1]:

from preprocess import preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import os
import seaborn as sns
from preparation import cosSimilarity
from model import word_occurrences_in_vectors

In [2]:
#Imprime matriz de confusión
def print_confussion_matrix(y_test, y_pred):
    sns.heatmap((confusion_matrix(y_test,y_pred)), annot=True, fmt="d",cmap="crest")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

def print_roc(fpr, tpr, roc_auc):
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.title('ROC curve')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

In [3]:
def decision(file_path_originals, file_path_suspicious, actual_results):
    import os
    from collections import Counter

    system_results = []
    vectors= [ ] # En este vector se guardan las duplas de vectores

    # Preproceso de textos originales
    original_texts = [file for file in os.listdir(file_path_originals) if os.path.isfile(os.path.join(file_path_originals, file))]
    processed_original_texts = []
    for original_text in original_texts:
        text = preprocessing(os.path.join(file_path_originals, original_text))
        print(original_text + ": ", text)
        processed_original_texts.append(text)


    # Preproceso de textos sospechosos
    suspicious_texts = [file for file in os.listdir(file_path_suspicious) if os.path.isfile(os.path.join(file_path_suspicious, file))]
    processed_suspicious_texts = []
    for suspicious_text in suspicious_texts:
        text = preprocessing(os.path.join(file_path_suspicious, suspicious_text))
        print(suspicious_text + ": ", text)
        processed_suspicious_texts.append(text)

    # Se hace la comparación entre los 2 textos
    for k, processed_suspicious_text in enumerate(processed_suspicious_texts):
        print("Texto sospechoso: ", suspicious_texts[k])
        plagiarized_check = False
        suspicious_plagiarsim_words = 0
        suspicious_embeddings = []
        # Aplanar la lista de palabras sospechosas
        flattened_suspicious_text = [word for sublist in processed_suspicious_text for word in sublist]

        for i in range(len(flattened_suspicious_text)):
            counts_vector1, counts_vector2 = word_occurrences_in_vectors(processed_original_texts[0], flattened_suspicious_text[i])
            # Append counts_vector2 to the list of embeddings
            suspicious_embeddings.append(counts_vector2)
            vectors.append((counts_vector1, counts_vector2))
            print("susCount1",counts_vector1)
            print("ogCount1",counts_vector2)
            #print("\n\nLa lista de Vectores es esta: \n")
            #print (vectors)

        for i, processed_original_text in enumerate(processed_original_texts):
            counts_vector1, counts_vector2 = word_occurrences_in_vectors(processed_original_text, flattened_suspicious_text)
            word_count_plagiarism = cosSimilarity(vectors)
            plagiarism_original_word_count = word_count_plagiarism/len(".".join(flattened_suspicious_text).split())
            vectors.append((counts_vector1, counts_vector2))  # Append the vectors to the list

            print("susCount2",counts_vector1)
            print("ogCount2",counts_vector2)
            # Checa si hay plagio
            # print("La lista de Vectores es esta: \n")
            #print (vectors)
            # ! Modificar           
            if word_count_plagiarism > 0 and (plagiarism_original_word_count*100 >25):
                print("\tPlagio detectado en: ", original_texts[i])
                print(f'\tPorcentaje de plagio: {plagiarism_original_word_count*100:.1f}%')
                print("\t\n")
                suspicious_plagiarsim_words += word_count_plagiarism
                
                if not plagiarized_check:
                    system_results.append(1)
                    plagiarized_check = True


        if not plagiarized_check:
            print("\tNo se encontro plagio\n")
            system_results.append(0)

        if suspicious_plagiarsim_words/len(".".join(flattened_suspicious_text).split()) > 0.15:
            percentaje_plagiarism = suspicious_plagiarsim_words/len(".".join(flattened_suspicious_text).split())*100
            if percentaje_plagiarism > 100:
                percentaje_plagiarism = 100
            print(f'\tHay un total de plagio en {suspicious_texts[k]} de: {percentaje_plagiarism:.1f}%')
            print("\t\n")

    # Impresión de tablas
    tn, fp, fn, tp = confusion_matrix(actual_results, system_results).ravel()
    print(f'Predicted Results: {system_results}')
    print(f'Actual Results: {actual_results}')
    fpr, tpr, thresholds = metrics.roc_curve(actual_results, system_results, pos_label=1)
    print(f'True Positive: {tp}')
    print(f'False Positive: {fp}')
    print(f'True Negative: {tn}')
    print(f'False Negative: {fn}')
    print("False Positive Rate: ", fp/(fp+tn))
    print("True Positive Rate: ", tp/(tp+fn))
    print("AUC:", metrics.auc(fpr, tpr))

    print_confussion_matrix(actual_results, system_results)
    print_roc(fpr, tpr, metrics.auc(fpr, tpr))
    
 


In [4]:
decision("original_files", "suspicious_files", [0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1])

T1.java:  ['public', 'class', 'T1', '{', 'public', 'static', 'void', 'MAIN', '(', 'STRING', '[', ']', 'ARGS', ')', '{', 'SYSTEM', '.', 'OUT', '.', 'PRINTLN', '(', 'String', ')', ';', 'SYSTEM', '.', 'OUT', '.', 'PRINTLN', '(', 'String', ')', ';', 'SYSTEM', '.', 'OUT', '.', 'PRINTLN', '(', 'String', ')', ';', 'SYSTEM', '.', 'OUT', '.', 'PRINTLN', '(', 'String', ')', ';', 'SYSTEM', '.', 'OUT', '.', 'PRINTLN', '(', 'String', ')', ';', '}', '}']
T2.java:  ['import', 'JAVA', '.', 'UTIL', '.', 'SCANNER', ';', 'public', 'class', 'T2', '{', 'public', 'static', 'void', 'MAIN', '(', 'STRING', '[', ']', 'ARGS', ')', '{', 'SCANNER', 'INPUT', '=', 'new', 'SCANNER', '(', 'SYSTEM', '.', 'IN', ')', ';', 'SYSTEM', '.', 'OUT', '.', 'PRINT', '(', 'String', ')', ';', 'double', 'RADIUS', '=', 'INPUT', '.', 'NEXTDOUBLE', '(', ')', ';', 'double', 'LENGTH', '=', 'INPUT', '.', 'NEXTDOUBLE', '(', ')', ';', 'double', 'AREA', '=', 'RADIUS', '*', 'RADIUS', '*', 'Numeric', ';', 'double', 'VOLUME', '=', 'AREA', '*', 

ValueError: operands could not be broadcast together with shapes (2,21) (2,20) 