<a href="https://colab.research.google.com/github/A01745336/Evidencia2_Detecci-nDePlagio/blob/main/ModeloSimilitud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from zipfile import ZipFile
import os

In [None]:
# Extract both uploaded archives into the shared documents folder.
# (Presumably the original texts and the suspicious/test texts — confirm
# against the archive contents.)
for zip_path in ('/content/TextosOriginales.zip', '/content/OneDrive_2024-05-02.zip'):
    with ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('/content/documents')

In [None]:
import os
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from nltk import download
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.models import load_model
import string
import joblib
from nltk.stem import WordNetLemmatizer
from scipy.spatial.distance import cityblock, cosine
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Download the NLTK resources used by preprocess(): the punkt tokenizer
# models, the stopword lists and the WordNet corpus.
download('punkt')
download('stopwords')
download('wordnet')  # Needed by WordNetLemmatizer

# Initialize the BERT tokenizer and model (shared by bert_embeddings()).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Adjustable weights read by adjust_features() when scaling each feature block.
tfidf_weight = 0.9
bert_weight = 0.1

# Module-level scaler instance.
# NOTE(review): apply_pca() builds its own local StandardScaler, so this
# instance appears unused in the visible code — confirm before removing.
scaler = StandardScaler()

def adjust_features(tfidf_features, bert_features):
    """Scale each feature block by its module-level weight and join them.

    Args:
        tfidf_features: TF-IDF matrix. It is multiplied by ``tfidf_weight``
            and the product must expose ``toarray()`` (presumably a SciPy
            sparse matrix — confirm against callers).
        bert_features: BERT feature array, multiplied by ``bert_weight``.

    Returns:
        The dense weighted TF-IDF block and the weighted BERT block stacked
        horizontally with ``np.hstack``.
    """
    scaled_tfidf = tfidf_features * tfidf_weight
    scaled_bert = bert_features * bert_weight
    # The TF-IDF block is densified so it can be stacked with the BERT array.
    return np.hstack((scaled_tfidf.toarray(), scaled_bert))

# Reduce BERT embeddings with PCA after standardizing them.
def apply_pca(bert_embeddings, n_components=0):
    """Standardize the embeddings and project them with a newly fitted PCA.

    Both the ``StandardScaler`` and the ``PCA`` are created and fitted on the
    data passed in, so nothing is shared between separate calls.

    Args:
        bert_embeddings: 2-D array-like of embeddings (one row per document).
            Note that this parameter shadows the module-level function of
            the same name inside this body.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.
            NOTE(review): the default of 0 presumably asks PCA to keep zero
            components, which would make the result have no columns —
            confirm that this is intended.

    Returns:
        The output of ``PCA.fit_transform`` on the standardized embeddings.
    """
    standardized = StandardScaler().fit_transform(bert_embeddings)
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(standardized)

def read_files_in_directory(directory):
    """Read every ``.txt`` file located directly inside *directory*.

    Files are visited in sorted name order so that the returned lists (and
    any row indices derived from them) are reproducible between runs;
    ``os.listdir`` alone returns entries in arbitrary order.

    Each file is decoded as UTF-8 first. If that fails, it is decoded as
    windows-1252 with undecodable bytes replaced by U+FFFD, because
    windows-1252 leaves a few byte values undefined and a strict decode
    would still raise on them.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A ``(file_names, files_contents)`` tuple of two parallel lists: the
        file names (without the directory) and the decoded text of each file.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
    """
    file_names = []
    files_contents = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory, filename)
        # Skip sub-directories (or other non-files) that happen to end in .txt.
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except UnicodeDecodeError:
            # Legacy-encoded document: fall back without risking a second
            # decode failure on bytes that cp1252 does not define.
            with open(file_path, 'r', encoding='windows-1252', errors='replace') as file:
                content = file.read()
        # Append both together so the two lists always stay aligned.
        file_names.append(filename)
        files_contents.append(content)
    return file_names, files_contents

def preprocess(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps, in order: lowercase the text, strip every character listed in
    ``string.punctuation``, tokenize with NLTK's ``word_tokenize``, drop
    English stopwords and non-alphabetic tokens, and lemmatize what remains
    with ``WordNetLemmatizer``.

    Args:
        text: The raw document text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    lowered = text.lower()
    punctuation_table = str.maketrans('', '', string.punctuation)
    words = word_tokenize(lowered.translate(punctuation_table))

    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    kept_lemmas = []
    for word in words:
        # Discard stopwords and anything that is not purely alphabetic.
        if word in english_stopwords or not word.isalpha():
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(kept_lemmas)

def bert_embeddings(text):
    """Return a mean-pooled BERT embedding for *text* as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model``. The
    last hidden state is averaged over the token dimension.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the token-averaged last hidden state, with
        size-1 dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every document that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

def generate_vector_space_models(original_texts, suspicious_texts, vectorizer):
    """Build combined TF-IDF + PCA-reduced BERT features for both corpora.

    Every text is run through ``preprocess``. The processed texts are then
    embedded with ``bert_embeddings`` and vectorized with the already fitted
    ``vectorizer``.

    The PCA reduction is fitted once on the embeddings of both corpora
    stacked together and then split back. Fitting it separately per corpus
    would project each corpus onto its own components, making the reduced
    BERT columns of the two matrices incomparable.

    Args:
        original_texts: Raw texts of the reference documents.
        suspicious_texts: Raw texts of the documents to check.
        vectorizer: A fitted vectorizer exposing ``transform`` and returning
            a matrix with ``toarray()``.

    Returns:
        A ``(combined_features_original, combined_features_suspicious)``
        tuple of dense arrays, one row per document: the TF-IDF columns
        followed by the reduced BERT columns.
    """
    processed_original_texts = [preprocess(text) for text in original_texts]
    processed_suspicious_texts = [preprocess(text) for text in suspicious_texts]

    # Embed originals first, then suspicious documents, keeping one array so
    # the scaler and PCA see both corpora in a single fit.
    n_original = len(processed_original_texts)
    all_embeddings = np.array([
        bert_embeddings(text)
        for text in processed_original_texts + processed_suspicious_texts
    ])

    # apply_pca is called with its default n_components, as before.
    all_reduced = apply_pca(all_embeddings)
    bert_embeddings_original_reduced = all_reduced[:n_original]
    bert_embeddings_suspicious_reduced = all_reduced[n_original:]

    tfidf_matrix_original = vectorizer.transform(processed_original_texts)
    tfidf_matrix_suspicious = vectorizer.transform(processed_suspicious_texts)

    # Combine the dense TF-IDF features with the PCA-reduced BERT features.
    combined_features_original = np.hstack(
        [tfidf_matrix_original.toarray(), bert_embeddings_original_reduced]
    )
    combined_features_suspicious = np.hstack(
        [tfidf_matrix_suspicious.toarray(), bert_embeddings_suspicious_reduced]
    )

    return combined_features_original, combined_features_suspicious

def find_most_similar(original_features, suspicious_features, original_filenames, suspicious_filenames):
    """Match each suspicious document to its most cosine-similar original.

    Args:
        original_features: Feature matrix of the original documents, one row
            per entry of ``original_filenames``.
        suspicious_features: Feature matrix of the suspicious documents, one
            row per entry of ``suspicious_filenames``.
        original_filenames: Names of the original documents.
        suspicious_filenames: Names of the suspicious documents.

    Returns:
        A list with one dict per suspicious document, holding the keys
        ``suspicious_document``, ``matched_document``, ``cosine_similarity``,
        ``manhattan_similarity`` and ``pearson_correlation``. All three
        scores are percentages computed against the best cosine match.
    """
    similarity_matrix = cosine_similarity(suspicious_features, original_features)

    def score_row(row, suspicious_name):
        # The best match is the original with the highest cosine similarity.
        best = np.argmax(similarity_matrix[row])
        suspicious_vector = suspicious_features[row]
        original_vector = original_features[best]

        cosine_pct = similarity_matrix[row][best] * 100
        # Map the Manhattan distance d to a similarity 1 / (1 + d), as a percentage.
        manhattan_pct = 1 / (1 + cityblock(suspicious_vector, original_vector)) * 100
        correlation, _p_value = pearsonr(suspicious_vector, original_vector)
        # Shift Pearson's r from [-1, 1] to [0, 1], then convert to a percentage.
        pearson_pct = (correlation + 1) / 2 * 100

        return {
            'suspicious_document': suspicious_name,
            'matched_document': original_filenames[best],
            'cosine_similarity': cosine_pct,
            'manhattan_similarity': manhattan_pct,
            'pearson_correlation': pearson_pct
        }

    return [score_row(row, name) for row, name in enumerate(suspicious_filenames)]

def predict_plagiarism_type(features, model):
    """Return the index of the highest-scoring class for each input row.

    Args:
        features: Input passed unchanged to ``model.predict``.
        model: Any object exposing ``predict(features)`` that returns
            per-class scores with classes along axis 1. Inside this function
            the parameter shadows the module-level BERT ``model``.

    Returns:
        The ``np.argmax`` over axis 1 of the model's prediction.
    """
    class_scores = model.predict(features)
    return np.argmax(class_scores, axis=1)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Load the plagiarism-type classification model (currently disabled) and the
# fitted TF-IDF vectorizer.
# plagiarism_type_model = load_model('best_model (3).h5')
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load vectorizer files from a trusted source.
vectorizer = joblib.load('tfidf_vectorizer (2).joblib')

# Report the vocabulary size of the loaded vectorizer.
print("Vocabulario del vectorizador:", len(vectorizer.vocabulary_))


# Folders produced by extracting the archives into /content/documents.
original_dir = '/content/documents/TextosOriginales'
suspicious_dir = '/content/documents/FinalTesting'

# Each call returns (file names, file contents) as two parallel lists.
original_filenames, original_texts = read_files_in_directory(original_dir)
suspicious_filenames, suspicious_texts = read_files_in_directory(suspicious_dir)

# Build the feature matrices for both corpora, then pair every suspicious
# document with its most similar original.
features_original, features_suspicious = generate_vector_space_models(original_texts, suspicious_texts, vectorizer)
match_results = find_most_similar(features_original, features_suspicious, original_filenames, suspicious_filenames)


Vocabulario del vectorizador: 99831


In [None]:
# Report the best match for every suspicious document. The plagiarism-type
# classification step is currently commented out.
for result in match_results:
    suspicious = result['suspicious_document']
    original = result['matched_document']
    cos_score = result['cosine_similarity']
    # NOTE(review): the Manhattan and Pearson scores are read here but only
    # used by the commented-out prints below.
    manhattan_score = result['manhattan_similarity']
    pearson_score = result['pearson_correlation']

    # Look up this document's feature row by its file name.
    suspicious_features_index = suspicious_filenames.index(suspicious)
    suspicious_features = features_suspicious[suspicious_features_index].reshape(1, -1)  # Ensure a (1, n_features) shape

    # Use this if the plagiarism type should be predicted with a trained model
    # predicted_type = plagiarism_type_model.predict(suspicious_features)
    # predicted_type = label_map[np.argmax(predicted_type)]

    print(f"Documento Sospechoso: {suspicious}")
    print(f"Documento Original: {original}")
    print(f"Porcentaje de Similitud Cosine: {cos_score:.2f}%")
    # print(f"Porcentaje de Similitud Manhattan: {manhattan_score:.2f}%")
    # print(f"Porcentaje de Correlación de Pearson: {pearson_score:.2f}%")
    # print(f"Tipo de Plagio: {predicted_type}\n")
    print()


Documento Sospechoso: FID-002.txt
Documento Original: org-063.txt
Porcentaje de Similitud Cosine: 18.47%

Documento Sospechoso: FID-020.txt
Documento Original: org-064.txt
Porcentaje de Similitud Cosine: 7.96%

Documento Sospechoso: FID-026.txt
Documento Original: org-101.txt
Porcentaje de Similitud Cosine: 98.72%

Documento Sospechoso: FID-007.txt
Documento Original: org-061.txt
Porcentaje de Similitud Cosine: 20.43%

Documento Sospechoso: FID-009.txt
Documento Original: org-083.txt
Porcentaje de Similitud Cosine: 13.20%

Documento Sospechoso: FID-024.txt
Documento Original: org-063.txt
Porcentaje de Similitud Cosine: 11.05%

Documento Sospechoso: FID-019.txt
Documento Original: org-078.txt
Porcentaje de Similitud Cosine: 100.00%

Documento Sospechoso: FID-016.txt
Documento Original: org-057.txt
Porcentaje de Similitud Cosine: 98.46%

Documento Sospechoso: FID-018.txt
Documento Original: org-014.txt
Porcentaje de Similitud Cosine: 94.51%

Documento Sospechoso: FID-030.txt
Documento Or