In [None]:
import os
import re
from collections import Counter
import math
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def get_lemmatized_tokens(text):
    """Return the lemmatized, lower-cased word tokens of ``text``.

    The text is lower-cased, split into runs of word characters with a
    regular expression, and every token is passed through a
    ``WordNetLemmatizer`` using its default settings.

    Args:
        text: The raw document text.

    Returns:
        A list of lemmatized tokens, in the order they appear in ``text``.
    """
    wordnet = WordNetLemmatizer()
    lowered = text.lower()
    # \w+ between word boundaries: punctuation and whitespace act as separators.
    return [wordnet.lemmatize(token) for token in re.findall(r'\b\w+\b', lowered)]

def get_ngram_frequencies(text, n):
    """Count how often each word n-gram occurs in ``text``.

    Args:
        text: The raw document text; it is tokenized and lemmatized by
            ``get_lemmatized_tokens``.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` tokens) to the
        number of times it appears.
    """
    tokens = get_lemmatized_tokens(text)
    counts = Counter()
    # Slide a window of width n across the tokens; the last valid start index
    # is len(tokens) - n, so shorter texts produce no n-grams at all.
    for start in range(len(tokens) - n + 1):
        counts[tuple(tokens[start:start + n])] += 1
    return counts

def cosine_similarity(frequencies1, frequencies2):
    """Compute the cosine similarity between two n-gram frequency mappings.

    Each mapping is treated as a sparse vector whose coordinates are the
    n-grams and whose values are the counts.

    Args:
        frequencies1: Mapping from n-gram to count (e.g. a ``Counter``).
        frequencies2: Mapping from n-gram to count (e.g. a ``Counter``).

    Returns:
        A float: the dot product of the two vectors divided by the product
        of their magnitudes, or ``0.0`` when either vector has zero
        magnitude (for example, an empty mapping).
    """
    squared_norm1 = sum(count * count for count in frequencies1.values())
    squared_norm2 = sum(count * count for count in frequencies2.values())

    # A zero vector has no direction; define its similarity as 0.0 and avoid
    # dividing by zero. Always return a float so every path has the same type.
    if squared_norm1 == 0 or squared_norm2 == 0:
        return 0.0

    # Only n-grams present in both mappings contribute to the dot product.
    shared_ngrams = frequencies1.keys() & frequencies2.keys()
    dot_product = sum(frequencies1[ngram] * frequencies2[ngram] for ngram in shared_ngrams)

    # Take a single square root of the product rather than multiplying two
    # rounded square roots: with integer counts the product is exact, so
    # identical vectors score exactly 1.0 instead of 0.9999999999999998.
    similarity = dot_product / math.sqrt(squared_norm1 * squared_norm2)

    # Guard against the last-bit rounding pushing the result above 1.
    return min(1.0, similarity)

def calculate_similarity(file1, file2, n, encoding="utf-8"):
    """Return the n-gram cosine similarity of two text files as a percentage.

    Args:
        file1: Path to the first text file.
        file2: Path to the second text file.
        n: Number of consecutive tokens per n-gram.
        encoding: Text encoding used to read both files. Passed explicitly
            so the result does not depend on the platform's default
            encoding.

    Returns:
        The cosine similarity of the two files' n-gram frequency vectors,
        scaled to 0-100 and rounded to two decimal places.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If a file is not valid in ``encoding``.
    """
    # Read each file and close it immediately; the (potentially slow)
    # lemmatization below does not need the file handles.
    with open(file1, 'r', encoding=encoding) as f1:
        text1 = f1.read()
    with open(file2, 'r', encoding=encoding) as f2:
        text2 = f2.read()

    frequencies1 = get_ngram_frequencies(text1, n)
    frequencies2 = get_ngram_frequencies(text2, n)

    similarity = cosine_similarity(frequencies1, frequencies2)
    return round(similarity * 100, 2)

def compare_files(file1, file2_folder, n):
    """Compare one file against every ``.txt`` file in a folder.

    Args:
        file1: Path to the document being checked.
        file2_folder: Directory whose ``.txt`` files are the reference
            documents. Entries with any other suffix are ignored.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list of ``(reference_path, similarity_percentage)`` tuples, one
        per ``.txt`` file, ordered by file name.
    """
    similarities = []

    # os.listdir yields entries in arbitrary order; sorting makes the result
    # (and anything printed from it) reproducible between runs and machines.
    for file in sorted(os.listdir(file2_folder)):
        if not file.endswith(".txt"):
            continue
        file2 = os.path.join(file2_folder, file)
        similarities.append((file2, calculate_similarity(file1, file2, n)))

    return similarities

# Compare a single suspicious document against every genuine document and
# print the similarity percentage for each pair.
# NOTE(review): the folder name is spelled 'docmentos-sospechosos' here and in
# the recorded output paths, so it presumably matches the real directory on
# Drive; do not "correct" it without renaming the folder.
file1 = '/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-02.txt'
folder_path = '/content/drive/MyDrive/DATASET_IA/documentos-genuinos'
ngram_size = 3  # compare documents by sequences of three consecutive tokens
similarities = compare_files(file1, folder_path, ngram_size)

# Each entry is a (reference_path, similarity_percentage) tuple.
for file, similarity in similarities:
    print(f"Similarity with {file}: {similarity}%")

In [None]:
import glob


# Classify every suspicious document as plagiarized or genuine: a document is
# flagged when its similarity to ANY genuine document exceeds the threshold.

# NOTE(review): file1 is not read anywhere in this cell; it is kept only in
# case a later cell relies on it. The compare_files() call that used to follow
# was removed because its result was overwritten before being used.
file1 = '/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-15.txt'
folder_path = '/content/drive/MyDrive/DATASET_IA/documentos-genuinos'
ngram_size = 3

# Similarity percentage above which a document is reported as plagiarized.
PLAGIARISM_THRESHOLD = 15

folder_path_gen = "/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos"
# Keep regular files only (a sub-directory would make open() fail) and sort
# them so the report order is reproducible.
file_names_gen = sorted(
    path for path in glob.glob(folder_path_gen + "/*") if os.path.isfile(path)
)

for fileMy in file_names_gen:
    print(fileMy)
    similarities = compare_files(fileMy, folder_path, ngram_size)
    is_plagiarized = any(
        similarity > PLAGIARISM_THRESHOLD for _, similarity in similarities
    )
    plagio = "Documento con plagio" if is_plagiarized else "Documento Genuino"
    print(plagio)
    print("-------------------------------------------------------------------------")



/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-12.txt
Docuemnto Genuino
-------------------------------------------------------------------------
/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-04.txt
Docuemnto Genuino
-------------------------------------------------------------------------
/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-15.txt
Documento con plagio
-------------------------------------------------------------------------
/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-14.txt
Docuemnto Genuino
-------------------------------------------------------------------------
/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-08.txt
Docuemnto Genuino
-------------------------------------------------------------------------
/content/drive/MyDrive/DATASET_IA/docmentos-sospechosos/FID-07.txt
Documento con plagio
-------------------------------------------------------------------------
/content/drive/MyDrive/DATASET_IA/docmen