<a href="https://colab.research.google.com/github/A01745336/Evidencia2_Detecci-nDePlagio/blob/main/ModeloSimilitud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from zipfile import ZipFile
import os

In [2]:
# Descomprimir el archivo en la carpeta de
zip_path = '/content/TextosOriginales.zip'
with ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/documents')

# Descomprimir el archivo en la carpeta de
zip_path = '/content/OneDrive_2024-05-02.zip'
with ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/documents')

In [4]:
zip_path = '/content/OneDrive_2024-05-02.zip'
with ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/documents')

In [5]:
import os
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from nltk import download
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.models import load_model
import string
import joblib
from nltk.stem import WordNetLemmatizer
from scipy.spatial.distance import cityblock, cosine
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Descarga de recursos necesarios de NLTK
download('punkt')
download('stopwords')
download('wordnet')  # Asegúrate de tener descargado wordnet

# Inicializar el tokenizador y el modelo BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Ponderaciones ajustables
tfidf_weight = 0.9
bert_weight = 0.1

# Inicialización del escalador
scaler = StandardScaler()

def adjust_features(tfidf_features, bert_features):
    # Ponderar características
    weighted_tfidf = tfidf_features * tfidf_weight
    weighted_bert = bert_features * bert_weight

    # Combinar características normalizadas y ponderadas
    combined_features = np.hstack([weighted_tfidf.toarray(), weighted_bert])
    return combined_features

# Función para aplicar PCA a los embeddings de BERT
def apply_pca(bert_embeddings, n_components=0):
    scaler = StandardScaler()
    bert_scaled = scaler.fit_transform(bert_embeddings)
    pca = PCA(n_components=n_components)
    bert_reduced = pca.fit_transform(bert_scaled)
    return bert_reduced

def read_files_in_directory(directory):
    files_contents = []
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    files_contents.append(file.read())
            except UnicodeDecodeError:
                with open(file_path, 'r', encoding='windows-1252') as file:
                    files_contents.append(file.read())
            file_names.append(filename)
    return file_names, files_contents

def preprocess(text):
    # Convertir el texto a minúsculas
    text = text.lower()
    # Eliminar puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenizar el texto
    tokens = word_tokenize(text)
    # Obtener las stopwords en inglés
    stop_words = set(stopwords.words('english'))
    # Inicializar el lematizador
    lemmatizer = WordNetLemmatizer()
    # Lematizar los tokens que no sean stopwords y sean alfabéticos
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalpha()]
    # Unir los lemas en un solo string y retornarlo
    return ' '.join(lemmas)

def bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

def generate_vector_space_models(original_texts, suspicious_texts, vectorizer):
    processed_original_texts = [preprocess(text) for text in original_texts]
    processed_suspicious_texts = [preprocess(text) for text in suspicious_texts]

    bert_embeddings_original = np.array([bert_embeddings(text) for text in processed_original_texts])
    bert_embeddings_suspicious = np.array([bert_embeddings(text) for text in processed_suspicious_texts])

    # Aplicar PCA a los embeddings de BERT
    bert_embeddings_original_reduced = apply_pca(bert_embeddings_original)
    bert_embeddings_suspicious_reduced = apply_pca(bert_embeddings_suspicious)

    tfidf_matrix_original = vectorizer.transform(processed_original_texts)
    tfidf_matrix_suspicious = vectorizer.transform(processed_suspicious_texts)

    # Combinar características TF-IDF con BERT reducido por PCA
    combined_features_original = np.hstack([tfidf_matrix_original.toarray(), bert_embeddings_original_reduced])
    combined_features_suspicious = np.hstack([tfidf_matrix_suspicious.toarray(), bert_embeddings_suspicious_reduced])

    return combined_features_original, combined_features_suspicious

def find_most_similar(original_features, suspicious_features, original_filenames, suspicious_filenames):
    cosine_similarities = cosine_similarity(suspicious_features, original_features)
    results = []
    for idx, suspicious_name in enumerate(suspicious_filenames):
        # Encuentra el índice del documento original más similar basado en la similitud coseno
        best_match_index = np.argmax(cosine_similarities[idx])
        best_match_name = original_filenames[best_match_index]

        # Calcula la similitud coseno, la distancia de Manhattan y la correlación de Pearson
        cos_similarity_score = cosine_similarities[idx][best_match_index] * 100
        manhattan_similarity_score = 1 / (1 + cityblock(suspicious_features[idx], original_features[best_match_index])) * 100
        pearson_corr, _ = pearsonr(suspicious_features[idx], original_features[best_match_index])
        pearson_similarity_score = (pearson_corr + 1) / 2 * 100  # Normalizar entre 0 y 1 y convertir a porcentaje

        # Guarda los resultados incluyendo las nuevas métricas
        results.append({
            'suspicious_document': suspicious_name,
            'matched_document': best_match_name,
            'cosine_similarity': cos_similarity_score,
        })
    return results

def predict_plagiarism_type(features, model):
    prediction = model.predict(features)
    predicted_class = np.argmax(prediction, axis=1)
    return predicted_class




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
# Carga el modelo de clasificación del tipo de plagio
# plagiarism_type_model = load_model('best_model (3).h5')
vectorizer = joblib.load('tfidf_vectorizer (2).joblib')

print("Vocabulario del vectorizador:", len(vectorizer.vocabulary_))


original_dir = '/content/documents/TextosOriginales'
suspicious_dir = '/content/documents/FinalTesting'

original_filenames, original_texts = read_files_in_directory(original_dir)
suspicious_filenames, suspicious_texts = read_files_in_directory(suspicious_dir)

features_original, features_suspicious = generate_vector_space_models(original_texts, suspicious_texts, vectorizer)
match_results = find_most_similar(features_original, features_suspicious, original_filenames, suspicious_filenames)


Vocabulario del vectorizador: 99831


In [7]:
# Clasificar el tipo de plagio para cada documento sospechoso
for result in match_results:
    suspicious = result['suspicious_document']
    original = result['matched_document']
    cos_score = result['cosine_similarity']

    suspicious_features_index = suspicious_filenames.index(suspicious)
    suspicious_features = features_suspicious[suspicious_features_index].reshape(1, -1)  # Asegura forma correcta

    print(f"Documento Sospechoso: {suspicious}")
    print(f"Documento Original: {original}")
    print(f"Porcentaje de Similitud Cosine: {cos_score:.2f}%")
    print()


Documento Sospechoso: FID-027.txt
Documento Original: org-067.txt
Porcentaje de Similitud Cosine: 98.64%

Documento Sospechoso: FID-023.txt
Documento Original: org-023.txt
Porcentaje de Similitud Cosine: 98.86%

Documento Sospechoso: FID-030.txt
Documento Original: org-082.txt
Porcentaje de Similitud Cosine: 17.00%

Documento Sospechoso: FID-008.txt
Documento Original: org-063.txt
Porcentaje de Similitud Cosine: 15.45%

Documento Sospechoso: FID-015.txt
Documento Original: org-082.txt
Porcentaje de Similitud Cosine: 20.40%

Documento Sospechoso: FID-018.txt
Documento Original: org-014.txt
Porcentaje de Similitud Cosine: 94.51%

Documento Sospechoso: FID-024.txt
Documento Original: org-063.txt
Porcentaje de Similitud Cosine: 11.05%

Documento Sospechoso: FID-006.txt
Documento Original: org-082.txt
Porcentaje de Similitud Cosine: 19.16%

Documento Sospechoso: FID-003.txt
Documento Original: org-079.txt
Porcentaje de Similitud Cosine: 7.55%

Documento Sospechoso: FID-005.txt
Documento Ori

In [12]:
print(match_results)

[{'suspicious_document': 'FID-027.txt', 'matched_document': 'org-067.txt', 'cosine_similarity': 98.63720389604073}, {'suspicious_document': 'FID-023.txt', 'matched_document': 'org-023.txt', 'cosine_similarity': 98.85712357966364}, {'suspicious_document': 'FID-030.txt', 'matched_document': 'org-082.txt', 'cosine_similarity': 17.004343582002225}, {'suspicious_document': 'FID-008.txt', 'matched_document': 'org-063.txt', 'cosine_similarity': 15.449653189332437}, {'suspicious_document': 'FID-015.txt', 'matched_document': 'org-082.txt', 'cosine_similarity': 20.39851453495854}, {'suspicious_document': 'FID-018.txt', 'matched_document': 'org-014.txt', 'cosine_similarity': 94.51351016877582}, {'suspicious_document': 'FID-024.txt', 'matched_document': 'org-063.txt', 'cosine_similarity': 11.052487888528875}, {'suspicious_document': 'FID-006.txt', 'matched_document': 'org-082.txt', 'cosine_similarity': 19.160820242794102}, {'suspicious_document': 'FID-003.txt', 'matched_document': 'org-079.txt', '

In [32]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Tus datos iniciales
ground_truth = {
        'FID-001.txt': False,
        'FID-002.txt': False,
        'FID-003.txt': False,
        'FID-004.txt': False,
        'FID-005.txt': True,
        'FID-006.txt': False,
        'FID-007.txt': False,
        'FID-008.txt': False,
        'FID-009.txt': False,
        'FID-010.txt': True,
        'FID-011.txt': False,
        'FID-012.txt': False,
        'FID-013.txt': False,
        'FID-014.txt': False,
        'FID-015.txt': False,
        'FID-016.txt': True,
        'FID-017.txt': False,
        'FID-018.txt': True,
        'FID-019.txt': True,
        'FID-020.txt': False,
        'FID-021.txt': False,
        'FID-022.txt': True,
        'FID-023.txt': True,
        'FID-024.txt': False,
        'FID-025.txt': False,
        'FID-026.txt': True,
        'FID-027.txt': True,
        'FID-028.txt': False,
        'FID-029.txt': True,
        'FID-030.txt': False
}


# Extraer y procesar resultados
predicted_results = {}
similarity_scores = {}
umbral = 30.0  # Umbral de similitud del coseno para considerar un documento como plagio

for result in match_results:
    doc_id = result['suspicious_document']
    similarity = result['cosine_similarity']
    predicted_results[doc_id] = (similarity > umbral)
    similarity_scores[doc_id] = similarity / 100.0  # Escalar de 0 a 1

# Preparar datos para métricas
y_pred = [1 if predicted_results.get(doc_id, 0) else 0 for doc_id in ground_truth]
y_true = [1 if ground_truth[doc_id] else 0 for doc_id in ground_truth]
y_scores = [similarity_scores.get(doc_id, 0) for doc_id in ground_truth]

# Calcular métricas
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_scores)
conf_matrix = confusion_matrix(y_true, y_pred)

conf_df = pd.DataFrame(conf_matrix, index=['Actual Positivo', 'Actual Negativo'], columns=['Predicción Positivo', 'Predicción Negativo'])

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC (ROC):", auc)
print("Confusion Matrix:\n", conf_df)



Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC (ROC): 1.0
Confusion Matrix:
                  Predicción Positivo  Predicción Negativo
Actual Positivo                   20                    0
Actual Negativo                    0                   10


In [27]:
  import numpy as np
  import pandas as pd
  from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
  from sklearn.preprocessing import LabelEncoder

  # Tus datos iniciales
  ground_truth = {
          'FID-001.txt': 'Ninguno',
          'FID-002.txt': 'Ninguno',
          'FID-003.txt': 'Ninguno',
          'FID-004.txt': 'Ninguno',
          'FID-005.txt': 'org-059.txt',
          'FID-006.txt': 'Ninguno',
          'FID-007.txt': 'Ninguno',
          'FID-008.txt': 'Ninguno',
          'FID-009.txt': 'Ninguno',
          'FID-010.txt': 'org-091.txt',
          'FID-011.txt': 'Ninguno',
          'FID-012.txt': 'Ninguno',
          'FID-013.txt': 'Ninguno',
          'FID-014.txt': 'Ninguno',
          'FID-015.txt': 'Ninguno',
          'FID-016.txt': 'org-057.txt',
          'FID-017.txt': 'Ninguno',
          'FID-018.txt': 'org-014.txt',
          'FID-019.txt': 'org-078.txt',
          'FID-020.txt': 'Ninguno',
          'FID-021.txt': 'Ninguno',
          'FID-022.txt': 'org-020.txt',
          'FID-023.txt': 'org-023.txt',
          'FID-024.txt': 'Ninguno',
          'FID-025.txt': 'Ninguno',
          'FID-026.txt': 'org-101.txt',
          'FID-027.txt': 'org-067.txt',
          'FID-028.txt': 'Ninguno',
          'FID-029.txt': 'org-109.txt',
          'FID-030.txt': 'Ninguno'
  }


  # Crear listas para calcular métricas
  y_true = []
  y_pred = []

  for doc, expected_match in ground_truth.items():
      # Encuentra el documento emparejado en los resultados, si existe
      actual_match = next((item['matched_document'] for item in match_results if item['suspicious_document'] == doc), None)
      # Añadir a las listas para comparación
      y_true.append(expected_match)
      y_pred.append(expected_match == actual_match)  # True si coincide, False si no

  # Convertir valores booleanos a 0 o 1 para calcular las métricas
  y_true_bin = [1] * len(y_true)  # Todos los casos son verdaderos en y_true porque esperamos coincidencias exactas
  y_pred_bin = [1 if x else 0 for x in y_pred]

  # Calcular métricas
  precision = precision_score(y_true_bin, y_pred_bin)
  recall = recall_score(y_true_bin, y_pred_bin)
  f1 = f1_score(y_true_bin, y_pred_bin)
  accuracy = accuracy_score(y_true_bin, y_pred_bin)

  # Calcular la matriz de confusión
  conf_matrix = confusion_matrix(y_true_bin, y_pred_bin, labels=[1, 0])

  # Crear DataFrame para visualizar la matriz de confusión
  conf_df = pd.DataFrame(conf_matrix, index=['Actual Positivo', 'Actual Negativo'], columns=['Predicción Positivo', 'Predicción Negativo'])

  print("Precision:", precision)
  print("Recall:", recall)
  print("F1 Score:", f1)
  print("Accuracy:", accuracy)
  print("Matriz de Confusión:\n", conf_df)



Precision: 1.0
Recall: 0.3333333333333333
F1 Score: 0.5
Accuracy: 0.3333333333333333
Matriz de Confusión:
                  Predicción Positivo  Predicción Negativo
Actual Positivo                   10                   20
Actual Negativo                    0                    0
