In [1]:
import os
from pathlib import Path
import pandas as pd

from pygments import lex
from pygments.lexers import JavaLexer
from pygments.token import Token

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
ROOT = Path().cwd().parent
BASE_PATH = ROOT / "dataset" / "versions" / "bplag_version_2"
LABELS_PATH = ROOT / "dataset" / "versions" / "labels.csv"

In [3]:
def read_java_files(base_path):
    """
    Recursively reads all .java files from the given base path.
    
    Args:
        base_path (str): Path to the base directory containing submission pairs.
    
    Returns:
        data (list): List of tuples (submission_id, code_content).
    """
    data = []
    
    # Iterate over all submission pairs
    for submission_pair in os.listdir(base_path):
        pair_path = os.path.join(base_path, submission_pair)
        
        if os.path.isdir(pair_path):
            # Iterate over each submission inside the pair
            for submission_id in os.listdir(pair_path):
                submission_path = os.path.join(pair_path, submission_id)
                
                if os.path.isdir(submission_path):
                    # Look for .java files inside the submission directory
                    for file in os.listdir(submission_path):
                        if file.endswith('.java'):
                            file_path = os.path.join(submission_path, file)
                            with open(file_path, 'r', encoding='utf-8') as f:
                                code = f.read()
                                data.append((submission_id, code))
    
    return data

In [4]:
# Read the Java files
java_files_data = read_java_files(BASE_PATH)

# Print the number of submissions loaded and the first few entries
print(f"Total submissions loaded: {len(java_files_data)}")
print("\nFirst 2 submissions loaded:")
for submission_id, code in java_files_data[:2]:
    print(f"Submission ID: {submission_id}\nCode snippet:\n{code[:300]}...\n")

Total submissions loaded: 1822

First 2 submissions loaded:
Submission ID: 5756162d
Code snippet:
import java.util.*;
import java.io.*;
public class EdD {
	public static void main(String[] args) throws Exception{
		int num = 998244353;

		// TODO Auto-generated method stub
 		BufferedReader bf = new BufferedReader(new InputStreamReader(System.in));
 		PrintWriter out = new PrintWriter(System.out...

Submission ID: 808f7516
Code snippet:
import java.io.*;
import java.math.BigInteger;
import java.util.*;

public class Main {
    static int MOD = 1000000007;

    // After writing solution, quick scan for:
    //   array out of bounds
    //   special cases e.g. n=1?
    //
    // Big numbers arithmetic bugs:
    //   int overflow
    ...



## Extracción de tokens

Para esta sección se utiliza la librería Pygments como analizador léxico. Esta librería permite extraer los tokens de un código fuente y clasificarlos en diferentes categorías. En este caso, se utilizará para extraer los tokens de código fuente en Java.

In [5]:
def extract_tokens(code):
    """
    Extracts tokens from the given Java code using Pygments.
    
    Args:
        code (str): Java code as a string.
        
    Returns:
        tokens (list): List of tokens extracted from the code.
    """
    lexer = JavaLexer()
    tokens = []
    for ttype, value in lex(code, lexer):
        if ttype in Token.Name or ttype in Token.Keyword or ttype in Token.Operator:
            val = value.strip()
            if val:
                tokens.append(f"{ttype.__class__.__name__}:{val}")
    return " ".join(tokens)

In [None]:
labels_df = pd.read_csv(LABELS_PATH)

labels_dict = {}
for _, row in labels_df.iterrows():
    key = (row['sub1'], row['sub2'])
    labels_dict[key] = row['verdict']

token_pairs = []
labels = []

for i in range(0, len(java_files_data), 2):
    try:
        id1, code1 = java_files_data[i]
        id2, code2 = java_files_data[i+1]
    except IndexError:
        break
        
    t1 = extract_tokens(code1)
    t2 = extract_tokens(code2)
    token_pairs.append(f"{t1} {t2}")
    
    if (id1, id2) in labels_dict:
        labels.append(labels_dict[(id1, id2)])
    elif (id2, id1) in labels_dict:
        labels.append(labels_dict[(id2, id1)])
    else:
        print(f"Warning: No label found for pair ({id1}, {id2})")
        labels.append(0) 
        

## Vectorización de tokens

Para la vectorización de los tokens, se utilizará la librería Scikit-learn. Esta librería permite transformar los tokens extraídos en vectores numéricos que pueden ser utilizados como entrada para el modelo. En este caso, se utilizará el método `TfidfVectorizer` para transformar los tokens en vectores numéricos. Este método asigna un peso a cada token en función de su frecuencia en el documento y su frecuencia en el corpus. Esto permite que los tokens más relevantes tengan un mayor peso en el vector resultante.

In [7]:
def vectorize(token_pairs):
    """
    Vectorizes the given token pairs using TF-IDF.
    
    Args:
        token_pairs (list): List of tokens to be vectorized.
    
    Returns:
        vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer.
        X (sparse matrix): TF-IDF matrix of the token pairs.
        Y (list): Labels corresponding to the token pairs.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(token_pairs)
    Y = labels
    return X, Y

In [None]:
X, Y = vectorize(token_pairs)

Vectorized data shape: (911, 2456)
First 2 labels: [1, 0]


In [9]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 56328 stored elements and shape (911, 2456)>
  Coords	Values
  (0, 17)	0.9806028213497687
  (0, 940)	0.004035402556994933
  (0, 1053)	0.004035402556994933
  (0, 2342)	0.001615931907257326
  (0, 1019)	0.0017183365880872636
  (0, 1792)	0.0032283220455959463
  (0, 345)	0.0016141610227979732
  (0, 644)	0.005421182152332093
  (0, 2087)	0.00242124153419696
  (0, 2373)	0.0032283220455959463
  (0, 1214)	0.004035402556994933
  (0, 2109)	0.0016141610227979732
  (0, 116)	0.0016194795153288493
  (0, 2243)	0.0067821059137036024
  (0, 703)	0.001612973670251707
  (0, 993)	0.04761775017254021
  (0, 1501)	0.0029900445395058374
  (0, 228)	0.005501618137342144
  (0, 179)	0.012713190242901008
  (0, 1413)	0.021791173807772637
  (0, 990)	0.002750809068671072
  (0, 2168)	0.0032283220455959463
  (0, 941)	0.001960253080026935
  (0, 1772)	0.004926175442304638
  (0, 1620)	0.0040398297681433154
  :	:
  (910, 1956)	0.006072492475820603
  (910, 1952)	0.0

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [24]:
def calcular_matriz_confusion(y_true, y_pred):
    TP = TN = FP = FN = 0

    for i in range(len(y_pred)):
        if y_pred[i] == 1:
            if y_true[i] == 1:
                TP += 1
            else:
                FP += 1
        else:
            if y_true[i] == 0:
                TN += 1
            else:
                FN += 1

    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0

    print("\nTP:", TP)
    print("TN:", TN)
    print("FP:", FP)
    print("FN:", FN)
    print(f"Precisión: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    return precision, recall, f1

In [12]:
def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)

    print(f"\nMatriz de confusión para {name}:")
    precision, recall, f1 = calcular_matriz_confusion(y_test, y_pred)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")

    return {
        'name': name,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [13]:
X_dense = X.toarray()

X_train, X_test, y_train, y_test = train_test_split(X_dense, Y, test_size=0.2, random_state=42)

In [14]:
unique, counts = np.unique(y_train, return_counts=True)
print(f"% class: {np.min(counts)/sum(counts)*100:.2f}%")

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

unique, counts = np.unique(y_train_resampled, return_counts=True)
print(f"% class: {np.min(counts)/sum(counts)*100:.2f}%")

% class: 27.75%
% class: 50.00%


In [30]:
dt_model = DecisionTreeClassifier()

svm_model = SVC()

nb_model = GaussianNB()

lr_model = LogisticRegression()

rf_model = RandomForestClassifier()

In [31]:
base_models = [
    ('DT', dt_model),
    ('SVM', svm_model),
    ('NB', nb_model),
    ('LR', lr_model),
    ('RF', rf_model)
]

In [32]:
for name, model in base_models:
    cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=10, scoring='accuracy')
    print(f"{name}: Accuracy = {cv_scores.mean():.4f}, Desviación =  {cv_scores.std():.4f}")
    model.fit(X_train_resampled, y_train_resampled)

DT: Accuracy = 0.8205, Desviación =  0.0819
SVM: Accuracy = 0.6207, Desviación =  0.0364
NB: Accuracy = 0.8764, Desviación =  0.0208
LR: Accuracy = 0.7872, Desviación =  0.0699
RF: Accuracy = 0.8927, Desviación =  0.0681


In [33]:
results = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1-Score': [],
    'AUC-ROC': []
}

roc_curves = []

In [34]:
for name, model in base_models:
    print(f"\nEvaluación de {name}")
    model.fit(X_train_resampled, y_train_resampled)
    result = evaluate_model(name, model, X_test, y_test)
    results['Model'].append(name)
    results['Accuracy'].append(result['accuracy'])
    results['Precision'].append(result['precision'])
    results['Recall'].append(result['recall'])
    results['F1-Score'].append(result['f1'])


Evaluación de DT

Matriz de confusión para DT:

TP: 28
TN: 101
FP: 33
FN: 21
Precisión: 0.4590
Recall: 0.5714
F1-Score: 0.5091
Accuracy: 0.7049

Evaluación de SVM

Matriz de confusión para SVM:

TP: 8
TN: 130
FP: 4
FN: 41
Precisión: 0.6667
Recall: 0.1633
F1-Score: 0.2623
Accuracy: 0.7541

Evaluación de NB

Matriz de confusión para NB:

TP: 24
TN: 107
FP: 27
FN: 25
Precisión: 0.4706
Recall: 0.4898
F1-Score: 0.4800
Accuracy: 0.7158

Evaluación de LR

Matriz de confusión para LR:

TP: 21
TN: 109
FP: 25
FN: 28
Precisión: 0.4565
Recall: 0.4286
F1-Score: 0.4421
Accuracy: 0.7104

Evaluación de RF

Matriz de confusión para RF:

TP: 21
TN: 122
FP: 12
FN: 28
Precisión: 0.6364
Recall: 0.4286
F1-Score: 0.5122
Accuracy: 0.7814


In [35]:
print(f"{'Model':<20}{'Accuracy':<10}{'Precision':<10}{'Recall':<10}{'F1-Score':<10}")
for i in range(len(results['Model'])):
    print(f"{results['Model'][i]:<20}{results['Accuracy'][i]:<10.4f}{results['Precision'][i]:<10.4f}{results['Recall'][i]:<10.4f}{results['F1-Score'][i]:<10.4f}")

Model               Accuracy  Precision Recall    F1-Score  
DT                  0.7049    0.4590    0.5714    0.5091    
SVM                 0.7541    0.6667    0.1633    0.2623    
NB                  0.7158    0.4706    0.4898    0.4800    
LR                  0.7104    0.4565    0.4286    0.4421    
RF                  0.7814    0.6364    0.4286    0.5122    


In [27]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, KFold

In [28]:
voting_model_final = VotingClassifier(estimators=base_models, voting='hard')

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
}

cv_results = cross_validate(voting_model_final, X_train_resampled, y_train_resampled, cv=kfold, scoring=scoring, return_estimator=True)

print("\nResultados de la validación cruzada (k=5) para el modelo propuesto:")
print(f"Accuracy: {cv_results['test_accuracy'].mean()*100:.2f}%")
print(f"Precision: {cv_results['test_precision'].mean()*100:.2f}%")
print(f"Sensitivity: {cv_results['test_recall'].mean()*100:.2f}%")
print(f"F1-Score: {cv_results['test_f1'].mean()*100:.2f}%")


Resultados de la validación cruzada (k=5) para el modelo propuesto:
Accuracy: 84.51%
Precision: 92.23%
Sensitivity: 75.46%
F1-Score: 82.96%


In [29]:
voting_model_final.fit(X_train_resampled, y_train_resampled)
final_result = evaluate_model('Ensemble Final', voting_model_final, X_test, y_test)

print(f"Accuracy: {final_result['accuracy']:.2f}%")
print(f"Precision: {final_result['precision']:.2f}%")
print(f"Recall: {final_result['recall']:.2f}%")
print(f"F1-Score: {final_result['f1']:.2f}%")


Matriz de confusión para Ensemble Final:

TP: 15
TN: 123
FP: 11
FN: 34
Precisión: 0.5769
Recall: 0.3061
F1-Score: 0.4000
Accuracy: 0.7541
Accuracy: 0.75%
Precision: 0.58%
Recall: 0.31%
F1-Score: 0.40%
