In [None]:
import joblib
from pygments.lexers import JavaLexer
from pygments import lex

In [30]:
def extract_tokens(code):
    """
    Tokenize Java source with Pygments and flatten the result into one string.

    Args:
        code (str): Java source text.

    Returns:
        str: Space-separated "<token type>:<token text>" entries. Whitespace
        tokens, and tokens whose text is empty after stripping, are omitted.
    """
    def _entries():
        for kind, text in lex(code, JavaLexer()):
            # Skip pure-whitespace token types outright.
            if str(kind).startswith('Token.Text.Whitespace'):
                continue
            trimmed = text.strip()
            if trimmed:
                yield f"{kind}:{trimmed}"

    return " ".join(_entries())

In [None]:
def predict_similarity(file1, file2, model_path, vectorizer_path, threshold = 0.50):
    """
    Predict the similarity between two Java source files.

    Both files are tokenized with extract_tokens, the two token strings are
    joined with a space, vectorized, and scored with the model's predict_proba.

    Args:
        file1 (str): Path to the first Java file.
        file2 (str): Path to the second Java file.
        model_path (str): Path to the trained model (loaded with joblib).
        vectorizer_path (str): Path to the fitted vectorizer (loaded with joblib).
        threshold (float): Minimum score (inclusive) for the pair to be
            flagged as similar. Defaults to 0.50.

    Returns:
        tuple: (score, is_similar), where score is a built-in float and
        is_similar a built-in bool.
        Returns (0.0, False) when either file has fewer than 10 characters
        after stripping surrounding whitespace.
        Returns (None, False) when any step raises; the error is printed.
    """
    try:
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only load model/vectorizer artifacts you trust.
        model = joblib.load(model_path)
        vectorizer = joblib.load(vectorizer_path)

        with open(file1, 'r', encoding='utf-8') as f:
            code1 = f.read()

        with open(file2, 'r', encoding='utf-8') as f:
            code2 = f.read()

        # Files that are (almost) empty are reported as not similar without scoring.
        if len(code1.strip()) < 10 or len(code2.strip()) < 10:
            return 0.0, False

        t1 = extract_tokens(code1)
        t2 = extract_tokens(code2)
        token_pair = f"{t1} {t2}"

        X = vectorizer.transform([token_pair])
        X_features = X.toarray()

        proba = model.predict_proba(X_features)[0]
        # NOTE(review): with a single-column output, proba[0] is the probability
        # of the model's only class, which may not be the "similar" class -- confirm.
        raw_score = proba[1] if len(proba) > 1 else proba[0]

        # Convert to built-in types so callers get a real float/bool rather than
        # NumPy scalars (np.bool_ is not a bool and is not JSON-serializable).
        similarity_score = float(raw_score)
        is_similar = bool(similarity_score >= threshold)

        return similarity_score, is_similar

    except Exception as e:
        print(f"Error en la predicción: {str(e)}")
        return None, False

In [None]:
   
# Input files to compare and the trained artifacts used to score them.
file1 = "original.java"
file2 = "plagiarized.java"

model_path = "mlp_model.pkl"
vectorizer_path = "mlp_model_vectorizer.pkl"

score, is_similar = predict_similarity(file1, file2, model_path, vectorizer_path)

print("\nRESULTS")
print(f"FILE 1: {file1}")
print(f"FILE 2: {file2}")
if score is None:
    # predict_similarity returns (None, False) when it fails; formatting None
    # with ':.4f' would raise TypeError, so report the failure explicitly.
    print("SIMILARITY: N/A (prediction failed)")
    print("PLAGIARISM DETECTED: UNKNOWN")
else:
    print(f"SIMILARITY: {score:.4f}")
    print(f"PLAGIARISM DETECTED: {'YES' if is_similar else 'NO'}")


=== RESULTADOS DE LA DETECCIÓN DE PLAGIO ===
Archivo 1: original.java
Archivo 2: plagiarized.java
Puntaje de similitud: 0.9999 (0-1)
Plagio detectado: SÍ

INFORMACIÓN ADICIONAL:
- Tamaño del archivo 1: 128 bytes
- Tamaño del archivo 2: 128 bytes
- Modelo utilizado: mlp_model.pkl

RESULTADO_JSON: {'score': 0.9999446264537352, 'is_plagiarism': True}
