In [277]:
import joblib
from pygments.lexers import JavaLexer
from pygments import lex

In [278]:
def extract_tokens(code):
    """
    Tokenize Java source with Pygments and flatten the result into one string.

    Each kept token is rendered as "<token type>:<stripped text>". A token is
    dropped when its type name starts with 'Token.Text.Whitespace' or when its
    text is empty after stripping surrounding whitespace.

    Args:
        code (str): Java source code.

    Returns:
        str: The rendered tokens, in lexing order, separated by single spaces.
    """
    # Stage 1: discard whitespace-typed tokens and trim the text of the rest.
    trimmed = (
        (kind, text.strip())
        for kind, text in lex(code, JavaLexer())
        if not str(kind).startswith('Token.Text.Whitespace')
    )
    # Stage 2: skip tokens that became empty, then render and join.
    return " ".join(f"{kind}:{text}" for kind, text in trimmed if text)

In [279]:
def predict_similarity(file1, file2, model_path, vectorizer_path, threshold = 0.50):
    """
    Predicts the similarity between two Java code files using a pre-trained model.

    Both files are read as UTF-8, tokenized with ``extract_tokens``, concatenated
    into a single "pair" document, vectorized, and scored with the model's
    ``predict_proba``.

    Args:
        file1 (str): Path to the first Java code file.
        file2 (str): Path to the second Java code file.
        model_path (str): Path to the serialized model (loaded with joblib).
        vectorizer_path (str): Path to the serialized vectorizer (loaded with joblib).
        threshold (float): Minimum score at which the files count as similar.

    Returns:
        tuple: ``(similarity_score, is_similar)`` as a plain Python ``float`` and
        ``bool``. ``(0.0, False)`` is returned without scoring when either file
        has fewer than 10 characters after stripping surrounding whitespace.

    Raises:
        OSError: If either source file cannot be opened.
        UnicodeDecodeError: If either source file is not valid UTF-8.
    """
    # SECURITY: joblib.load unpickles its input, which can execute arbitrary
    # code. Only pass paths to artifacts that come from a trusted source.
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)

    with open(file1, 'r', encoding='utf-8') as f:
        code1 = f.read()

    with open(file2, 'r', encoding='utf-8') as f:
        code2 = f.read()

    # Inputs this short are treated as "not similar" without consulting the model.
    if len(code1.strip()) < 10 or len(code2.strip()) < 10:
        return 0.0, False

    t1 = extract_tokens(code1)
    t2 = extract_tokens(code2)
    token_pair = f"{t1} {t2}"

    X = vectorizer.transform([token_pair])

    # Densified before scoring, as in the original pipeline.
    X_features = X.toarray()

    proba = model.predict_proba(X_features)[0]
    if len(proba) > 1:
        # Second column is taken as the "similar" class.
        # NOTE(review): assumes the positive label sorts second in
        # model.classes_ (e.g. labels 0/1) - confirm against training code.
        raw_score = proba[1]
    else:
        # A single column means the model only knows one class. If that class
        # is the negative label, its probability must not be reported as the
        # similarity score.
        classes = getattr(model, "classes_", None)
        only_negative = (
            classes is not None
            and len(classes) == 1
            and classes[0] in (0, False)
        )
        raw_score = 0.0 if only_negative else proba[0]

    # Coerce to built-in types so both return paths yield (float, bool);
    # the early exit above already returns plain Python values.
    similarity_score = float(raw_score)
    is_similar = bool(similarity_score >= threshold)

    return similarity_score, is_similar
    

In [280]:
   
# The two Java sources to compare.
file1, file2 = "original.java", "plagiarized.java"

# Serialized model and vectorizer consumed by predict_similarity.
model_path, vectorizer_path = "rf_model.pkl", "vectorizer.pkl"

score, is_similar = predict_similarity(
    file1,
    file2,
    model_path,
    vectorizer_path,
)

# Three-line report; a leading blank line separates it from earlier output.
print(
    "\nRESULTS",
    f"SIMILARITY: {score:.4f}",
    f"PLAGIARISM DETECTED: {'YES' if is_similar else 'NO'}",
    sep="\n",
)


RESULTS
SIMILARITY: 0.5087
PLAGIARISM DETECTED: YES
