In [39]:
import os
import pandas as pd
import javalang
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

In [26]:
def read_java_file(file_path):
    """Return the entire text of the file at ``file_path``, decoded as UTF-8.

    Args:
        file_path: Path of the file to open for reading.

    Returns:
        The file's full content as a single string.
    """
    # The context manager closes the handle even though we return from inside it.
    with open(file_path, mode='r', encoding='utf-8') as source_file:
        return source_file.read()

In [27]:
def extract_ast_nodes(code):
    """Parse Java source and return the class names of its AST nodes.

    The code is tokenized and parsed with ``javalang``; the resulting tree is
    walked and the class name of every node is collected in traversal order.

    Args:
        code: Java source code as a string.

    Returns:
        A single space-separated string of node class names, or ``''`` when
        the source cannot be tokenized or parsed (a diagnostic is printed in
        that case).
    """
    try:
        tokens = list(javalang.tokenizer.tokenize(code))
        tree = javalang.parser.Parser(tokens).parse()
    except (javalang.parser.JavaSyntaxError, javalang.tokenizer.LexerError) as e:
        # str(e) can be empty for syntax errors (the message then printed as
        # nothing after the colon), so fall back to the exception's own
        # description / location attributes, and finally to its class name.
        detail = str(e) or getattr(e, 'description', '') or e.__class__.__name__
        location = getattr(e, 'at', None)
        if location is not None:
            detail = f"{detail} (at {location})"
        print(f"Error al analizar el código Java: {detail}")
        return ''

    # Iterating the tree yields (path, node) pairs; only the node type is kept.
    return ' '.join(
        node.__class__.__name__
        for _, node in tree
        if isinstance(node, javalang.tree.Node)
    )

In [28]:
def preprocess_data_with_ast(directory):
    """Build a DataFrame of AST-node text and labels for every labelled file pair.

    Reads ``<directory>/versions/labels.csv`` and, for each row, looks for the
    two Java files ``<sub1>.java`` and ``<sub2>.java`` inside
    ``<directory>/versions/version_2/<sub1>_<sub2>/``. Rows whose files are
    missing, or for which either file yields no AST nodes, are skipped.

    Args:
        directory: Root folder that contains the ``versions`` sub-folder.

    Returns:
        A DataFrame with columns ``code`` (AST node names of both files joined
        by a space) and ``label`` (the row's value from the label column).

    Raises:
        KeyError: If the label column is absent from labels.csv.
    """
    versions_dir = os.path.join(directory, 'versions')
    labels = pd.read_csv(os.path.join(versions_dir, 'labels.csv'))

    # Show which columns the CSV actually provides.
    print("Columnas disponibles en labels.csv:", labels.columns)

    # Fail early when the expected label column is missing
    # (change the name here if the CSV uses a different one).
    label_column = 'veredict'
    if label_column not in labels.columns:
        raise KeyError(f"La columna '{label_column}' no se encuentra en labels.csv")

    rows = []
    for _, pair in labels.iterrows():
        first = pair['sub1']
        second = pair['sub2']
        verdict = pair[label_column]

        # Each pair lives in a folder named "<sub1>_<sub2>".
        pair_dir = os.path.join(versions_dir, 'version_2', f"{first}_{second}")
        first_path = os.path.join(pair_dir, f"{first}.java")
        second_path = os.path.join(pair_dir, f"{second}.java")

        # Skip pairs for which either source file is not on disk.
        if not (os.path.exists(first_path) and os.path.exists(second_path)):
            continue

        first_code = read_java_file(first_path)
        second_code = read_java_file(second_path)
        first_nodes = extract_ast_nodes(first_code)
        second_nodes = extract_ast_nodes(second_code)

        # Skip pairs where either file produced no AST nodes.
        if not first_nodes or not second_nodes:
            continue

        rows.append([first_nodes + ' ' + second_nodes, verdict])

    return pd.DataFrame(rows, columns=['code', 'label'])

In [29]:
# Usage
# The dataset location can be overridden with the CONPLAG_DATA_DIR environment
# variable; when it is unset, the original local path below is used unchanged.
data_directory = os.environ.get(
    'CONPLAG_DATA_DIR',
    r'C:\Users\droid\Documents\Aplicaciones_Avanzadas\Proyecto\PlagiarismDetector\data\conplag_version_2',
)
# Build the (code, label) table from the labelled file pairs and preview it.
processed_data = preprocess_data_with_ast(data_directory)
print(processed_data.head())

Columnas disponibles en labels.csv: Index(['sub1', 'sub2', 'problem', 'veredict'], dtype='object')
Error al analizar el código Java: 
                                                code  label
0  CompilationUnit Import Import ClassDeclaration...      0
1  CompilationUnit Import ClassDeclaration Method...      0
2  CompilationUnit Import Import ClassDeclaration...      1
3  CompilationUnit Import Import Import Import Cl...      0
4  CompilationUnit Import Import Import ClassDecl...      0


In [30]:
# Turn the space-separated AST node names into TF-IDF feature vectors.
# max_features caps the vocabulary size and can be tuned.
# NOTE(review): the vectorizer is fitted on every row of processed_data, i.e.
# before any rows are held out for testing, so the IDF weights also reflect
# rows that may later be used for evaluation — consider fitting on the
# training rows only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
y = processed_data['label']
X = tfidf_vectorizer.fit_transform(processed_data['code'])

In [31]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Wrap each split in a DMatrix, the container XGBoost's native API consumes.
# The training matrix is built first, then the test matrix.
dtrain, dtest = (
    xgb.DMatrix(features, label=targets)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

In [62]:
# Booster configuration passed to xgb.train.
params = dict(
    objective='binary:logistic',  # binary classification
    max_depth=8,                  # tree depth; tunable
    eta=0.3,                      # learning rate
    eval_metric='logloss',        # evaluation metric
)

In [63]:
# Fit the booster on the training matrix.
num_rounds = 100  # how many boosting rounds to run
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

In [64]:
# Score the held-out matrix, then round each score to the nearest integer
# to obtain a hard class prediction.
preds = bst.predict(dtest)
predictions = list(map(round, preds))

In [65]:
# Accuracy = share of test rows whose rounded prediction equals the true label.
accuracy = sum(
    int(pred == label) for pred, label in zip(predictions, y_test)
) / len(y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 82.97%
