# Code plagiarism detector

Fermin Mendez A01703366

Adrian Matute A01703889

Alan Razo A01703350

## Import libraries

In [50]:
import pandas as pd
import os
import glob
import string
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import javalang
from difflib import SequenceMatcher


## Load example data set

In [51]:
# Sample Java program #1: a palindrome checker. Kept as a raw string so the
# Java backslash escapes (e.g. "\\s+") reach the Java tokenizer unchanged.
code_1 = r"""
public class PalindromeChecker {
    public static boolean isPalindrome(String input) {
        String clean = input.replaceAll("\\s+", "").toLowerCase();
        String reversed = new StringBuilder(clean).reverse().toString();
        return clean.equals(reversed);
    }

    public static void main(String[] args) {
        String test = "A man a plan a canal Panama";
        if (isPalindrome(test)) {
            System.out.println("\"" + test + "\" is a palindrome.");
        } else {
            System.out.println("\"" + test + "\" is not a palindrome.");
        }
    }
}
"""

# Sample Java program #2: the same palindrome logic as code_1 with renamed
# identifiers and slightly restructured control flow, used as the comparison
# target for the similarity measures below.
code_2 = r"""
public class MirrorStringValidator {
    public static boolean checkPalindrome(String phrase) {
        String formatted = phrase.replaceAll("\\s+", "").toLowerCase();
        String backwards = new StringBuilder(formatted).reverse().toString();

        if (formatted.equals(backwards)) {
            return true;
        }
        return false;
    }

    public static void main(String[] args) {
        String sample = "A man a plan a canal Panama";
        boolean result = checkPalindrome(sample);

        if (result == true) {
            System.out.println("Palindrome confirmed.");
        } else {
            System.out.println("This is not a palindrome.");
        }
    }
}
"""

In [52]:
# Java keywords that are filtered out of the token stream before TF-IDF
# vectorization, since they carry no information about authorship.
# The list also contains the literal "null", which is reserved but is not
# formally a keyword.
reserved_words_in_java = [
    "abstract", "assert", "boolean", "break", "byte", "case", "catch", "char",
    "class", "const", "continue", "default", "do", "double", "else", "enum",
    "extends", "final", "finally", "float", "for", "goto", "if", "implements",
    "import", "instanceof", "int", "interface", "long", "native", "new",
    "null", "package", "private", "protected", "public", "return",
    "short", "static", "strictfp", "super", "switch", "synchronized",
    "this", "throw", "throws", "transient", "try", "void",
    "volatile",
    "while",  # `while` IS a reserved keyword in Java and must be filtered too
]

## Auxiliary functions

In [53]:
# Matches, at each position, one of: a Java text block, a string literal, a
# char literal, a `//` line comment, or a terminated `/* ... */` block comment.
# Literals are matched so that comment markers inside them are skipped over
# instead of being mistaken for real comments.
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'"""[\s\S]*?"""'
    r'|"(?:\\.|[^"\\\n])*"'
    r"|'(?:\\.|[^'\\\n])*'"
    r'|//[^\n]*'
    r'|/\*[\s\S]*?\*/'
)


def clearCommentsinJavaCode(code):
    """
    Remove comments from Java source code.

    Line comments (``//`` up to, but not including, the newline) and
    terminated block comments (``/* ... */``) are deleted. Comment markers
    that appear inside string, char or text-block literals are left alone,
    so e.g. ``"http://example.com"`` is preserved intact.

    Args:
        code: Java source code as a string.

    Returns:
        The source with comments removed. An unterminated ``/*`` is left
        in place rather than raising.
    """
    def _drop_comment(match):
        token = match.group(0)
        # Only the two comment alternatives start with '/'; literals start
        # with a quote character and are emitted unchanged.
        return '' if token.startswith('/') else token

    return _JAVA_LITERAL_OR_COMMENT.sub(_drop_comment, code)

In [54]:
def vectorizeJavaCode(code, reserved_words_in_java):
    """
    Normalize Java source into a space-separated token string.

    Comments are stripped, the text is lower-cased, every character that is
    neither a word character nor whitespace is deleted, and tokens found in
    ``reserved_words_in_java`` are dropped.

    Args:
        code: Java source code as a string.
        reserved_words_in_java: Collection of tokens to exclude.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = clearCommentsinJavaCode(code).lower()
    # NOTE: punctuation is deleted, not replaced by a space, so `a.b()`
    # collapses into the single token `ab`.
    bare_text = re.sub(r'[^\w\s]', '', lowered)

    kept = []
    for word in bare_text.split():
        if word in reserved_words_in_java:
            continue
        kept.append(word)
    return ' '.join(kept)

# Approach 1: lexical similarity.
# Normalize both example Java snippets into keyword-free token strings.
code_1_vectorized = vectorizeJavaCode(code_1, reserved_words_in_java)
code_2_vectorized = vectorizeJavaCode(code_2, reserved_words_in_java)

# Fit a TF-IDF model on the two normalized snippets; row 0 of the resulting
# matrix corresponds to code_1 and row 1 to code_2.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([code_1_vectorized, code_2_vectorized])
# Slices (0:1, 1:2) keep each row two-dimensional, as cosine_similarity
# is given one single-row matrix per snippet; the result is a 1x1 array.
similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(f"Cosine Similarity between code_1 and code_2: {similarity[0][0]:.4f}")
    
    

Cosine Similarity between code_1 and code_2: 0.3212


## Approach 2. AST 

In [55]:
def parse_java_code(code):
    """
    Parse Java source into a javalang syntax tree.

    Args:
        code: Java source code as a string.

    Returns:
        The tree produced by ``javalang.parser.Parser.parse()``, or ``None``
        if tokenizing or parsing raised; in that case the error is printed.
    """
    try:
        token_stream = javalang.tokenizer.tokenize(code)
        tree = javalang.parser.Parser(token_stream).parse()
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error parsing code: {e}")
        return None
    return tree

In [None]:
def extract_ast_features(tree):
    """
    Flatten a syntax tree into the sequence of its node type names.

    Args:
        tree: An iterable yielding ``(path, node)`` pairs.

    Returns:
        A list with the class name of every node, in iteration order.
    """
    return [type(node).__name__ for _path, node in tree]

In [57]:
def ast_similarity(code1, code2):
    """
    Compute a structural similarity score between two Java sources.

    Both sources are parsed, each tree is flattened into its sequence of
    node type names, and the two sequences are compared with
    ``difflib.SequenceMatcher``.

    Args:
        code1: First Java source as a string.
        code2: Second Java source as a string.

    Returns:
        The ``SequenceMatcher`` ratio in ``[0.0, 1.0]``, or ``0.0`` if either
        source could not be parsed.
    """
    tree1 = parse_java_code(code1)
    tree2 = parse_java_code(code2)

    # parse_java_code signals failure with None.
    if tree1 is None or tree2 is None:
        return 0.0

    features1 = extract_ast_features(tree1)
    features2 = extract_ast_features(tree2)

    # autojunk=False: the default heuristic treats frequently occurring
    # elements as junk once a sequence reaches 200 items, which would skew
    # the ratio for AST sequences where a few node types dominate.
    matcher = SequenceMatcher(None, features1, features2, autojunk=False)
    return matcher.ratio()

In [None]:
def ast_similarity_analysis(code1, code2):
    """
    Compute and print the AST similarity between two Java sources.

    Args:
        code1: First Java source as a string.
        code2: Second Java source as a string.

    Returns:
        The ``SequenceMatcher`` ratio of the two node-type sequences, or
        ``0.0`` (after printing an error message) if either source could
        not be parsed.
    """
    # Parse both sources; parse_java_code signals failure with None.
    tree1 = parse_java_code(code1)
    tree2 = parse_java_code(code2)

    if tree1 is None or tree2 is None:
        print("Error: Could not parse one or both code snippets.")
        return 0.0

    # Flatten each tree into its sequence of node type names.
    features1 = extract_ast_features(tree1)
    features2 = extract_ast_features(tree2)

    # Compare the sequences. autojunk=False disables the heuristic that
    # treats frequent elements as junk in sequences of 200+ items, which
    # would otherwise skew the score for larger source files.
    matcher = SequenceMatcher(None, features1, features2, autojunk=False)
    similarity = matcher.ratio()
    print(f"AST Similarity between code_1 and code_2: {similarity:.4f}")
    return similarity

# Run the AST-based analysis on the two example snippets.
ast_similarity_analysis(code_1, code_2)

AST Similarity between code_1 and code_2: 0.7767


0.7766990291262136