In [49]:
import os
from pathlib import Path

from pygments import lex
from pygments.lexers import JavaLexer
from pygments.token import Token

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Parent of the current working directory.
ROOT = Path.cwd().parent
# Directory dataset/versions/bplag_version_2 underneath ROOT.
BASE_PATH = ROOT.joinpath("dataset", "versions", "bplag_version_2")

In [51]:
def read_java_files(base_path):
    """
    Reads the .java files stored two directory levels below the base path.

    The layout walked is ``base_path/<submission_pair>/<submission_id>/<file>.java``.
    Only that exact depth is inspected; deeper directories are not searched.
    Every directory listing is sorted, so the order of the returned list is
    the same on every run and every filesystem (``os.listdir`` on its own
    returns entries in arbitrary order).

    Args:
        base_path (str or os.PathLike): Path to the base directory containing
            submission pairs.

    Returns:
        data (list): List of tuples (submission_id, code_content), one per
            .java file, ordered by pair directory name, then submission
            directory name, then file name.

    Raises:
        OSError: If base_path cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a .java file is not valid UTF-8.
    """
    data = []

    # Iterate over all submission pairs
    for submission_pair in sorted(os.listdir(base_path)):
        pair_path = os.path.join(base_path, submission_pair)
        if not os.path.isdir(pair_path):
            continue

        # Iterate over each submission inside the pair
        for submission_id in sorted(os.listdir(pair_path)):
            submission_path = os.path.join(pair_path, submission_id)
            if not os.path.isdir(submission_path):
                continue

            # Look for .java files inside the submission directory
            for file in sorted(os.listdir(submission_path)):
                if not file.endswith('.java'):
                    continue
                file_path = os.path.join(submission_path, file)
                # A directory whose name happens to end in ".java" cannot be
                # opened as a file, so skip anything that is not a regular file.
                if not os.path.isfile(file_path):
                    continue
                with open(file_path, 'r', encoding='utf-8') as f:
                    code = f.read()
                data.append((submission_id, code))

    return data

In [52]:
# Load the (submission_id, code) entries found under the dataset directory.
java_files_data = read_java_files(BASE_PATH)

# Show how many entries were loaded, then the first 300 characters of the
# first two so the content can be eyeballed without dumping whole files.
print(f"Total submissions loaded: {len(java_files_data)}")
print("\nFirst 2 submissions loaded:")
preview = java_files_data[:2]
for submission_id, code in preview:
    print(f"Submission ID: {submission_id}\nCode snippet:\n{code[:300]}...\n")

Total submissions loaded: 1822

First 2 submissions loaded:
Submission ID: 0017d438
Code snippet:
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
public class Main {
    static int modulo=998244353;
    public static void main(String[] args) {
       
        FastScanner in = new FastScanner();
     ...

Submission ID: 9852706b
Code snippet:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.*;

public class A {
    static List<Integer> [] adj;
    static ArrayList<Integer> temp;
    static int mod = (int) 1e9+7;
    static boolean[] vis = new boolean...



## Extracción de tokens

Para esta sección se utiliza la librería Pygments como analizador léxico. Esta librería permite extraer los tokens de un código fuente y clasificarlos en diferentes categorías. En este caso, se utilizará para extraer los tokens de código fuente en Java.

In [53]:
def extract_tokens(code):
    """
    Extracts tokens from the given Java code using Pygments.

    Only tokens whose type falls under Token.Name, Token.Keyword or
    Token.Operator are kept; everything else the lexer emits is dropped, as
    are tokens whose text is empty after stripping whitespace. Each kept
    token is rendered as ``<token type>:<text>``, so a token of type
    Token.Keyword with text ``return`` becomes ``Token.Keyword:return``.

    Args:
        code (str): Java code as a string.

    Returns:
        tokens (str): The kept tokens as one string, separated by single
            spaces. Empty string when no token is kept.
    """
    kept_categories = (Token.Name, Token.Keyword, Token.Operator)
    lexer = JavaLexer()
    tokens = []
    for ttype, value in lex(code, lexer):
        # `ttype in category` is true for the category and its subtypes.
        if not any(ttype in category for category in kept_categories):
            continue
        val = value.strip()
        if val:
            # str(ttype) is the dotted type name. The class name of a token
            # type is the same for every token, so it cannot tell types apart.
            tokens.append(f"{str(ttype)}:{val}")
    return " ".join(tokens)

In [54]:
# Build one text document and one label per pair of consecutive entries in
# java_files_data: entries (0, 1) form the first pair, (2, 3) the second, ...
token_pairs = []
labels = []

# zip stops at the shorter slice, so a trailing entry without a partner is
# left out.
first_of_pair = java_files_data[0::2]
second_of_pair = java_files_data[1::2]
for (id1, code1), (id2, code2) in zip(first_of_pair, second_of_pair):
    t1 = extract_tokens(code1)
    t2 = extract_tokens(code2)
    # The pair is represented by both token strings joined into one document.
    token_pairs.append(f"{t1} {t2}")
    # NOTE(review): the label is 1 only when both entries carry the same
    # submission id - confirm this matches the dataset's ground truth.
    labels.append(1 if id1 == id2 else 0)

## Vectorización de tokens

Para la vectorización de los tokens, se utilizará la librería Scikit-learn. Esta librería permite transformar los tokens extraídos en vectores numéricos que pueden ser utilizados como entrada para el modelo. En este caso, se utilizará la clase `TfidfVectorizer` para transformar los tokens en vectores numéricos. Esta clase asigna un peso a cada token en función de su frecuencia en el documento y de su frecuencia en el corpus, de modo que los tokens más relevantes (frecuentes en un documento pero poco comunes en el resto del corpus) tengan un mayor peso en el vector resultante.

In [55]:
def vectorize(token_pairs, pair_labels=None, return_vectorizer=False):
    """
    Vectorizes the given token pairs using TF-IDF.

    Each input string is split on whitespace only and case is preserved.
    The TfidfVectorizer defaults would lowercase the text, treat punctuation
    as a separator and drop tokens shorter than two word characters, which
    discards operator tokens and one-letter identifiers.

    Args:
        token_pairs (list): List of strings, one per pair, each holding
            whitespace-separated tokens.
        pair_labels (list, optional): Labels aligned with token_pairs. When
            omitted, the module-level ``labels`` list is used.
        return_vectorizer (bool, optional): When True, the fitted vectorizer
            is returned as well so it can transform further data.
            Defaults to False.

    Returns:
        X (sparse matrix): TF-IDF matrix of the token pairs, one row per
            entry of token_pairs.
        Y (list): Labels corresponding to the token pairs.

        When return_vectorizer is True the result is instead the tuple
        (vectorizer, X, Y), where vectorizer is the fitted TfidfVectorizer.

    Raises:
        ValueError: If the number of labels differs from the number of
            token pairs.
    """
    vectorizer = TfidfVectorizer(token_pattern=r"\S+", lowercase=False)
    X = vectorizer.fit_transform(token_pairs)

    # Fall back to the notebook-level `labels` list when none are passed in.
    Y = labels if pair_labels is None else pair_labels
    if len(Y) != X.shape[0]:
        raise ValueError(
            f"Got {len(Y)} labels for {X.shape[0]} token pairs; they must be aligned."
        )

    if return_vectorizer:
        return vectorizer, X, Y
    return X, Y

In [56]:
# X is the TF-IDF matrix built from token_pairs; Y is the label list that
# vectorize returns alongside it.
X, Y = vectorize(token_pairs)