In [28]:
import os
import javalang
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [29]:
def extract_ast_features(file_content):
    tree = javalang.parse.parse(file_content)
    token_list = []
    for path, node in tree:
        token_list.append(type(node).__name__)
    return token_list

In [30]:
def read_and_tokenize(directory):
    token_lists = []
    for filename in os.listdir(directory):
        if filename.endswith('.java'):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r') as f:
                content = f.read()
                tokens = extract_ast_features(content)
                token_lists.append(tokens)
    return token_lists

In [31]:
# Paths to your training and test directories
train_noplag_dir = r'C:\Users\droid\Documents\Aplicaciones_Avanzadas\Proyecto\PlagiarismDetector\finalDataset\split\train\noplag'
train_plagio_dir = r'C:\Users\droid\Documents\Aplicaciones_Avanzadas\Proyecto\PlagiarismDetector\finalDataset\split\train\plagio'
test_noplag_dir = r'C:\Users\droid\Documents\Aplicaciones_Avanzadas\Proyecto\PlagiarismDetector\finalDataset\split\test\noplag'
test_plagio_dir = r'C:\Users\droid\Documents\Aplicaciones_Avanzadas\Proyecto\PlagiarismDetector\finalDataset\split\test\plagio'

In [32]:
# Read and tokenize all .java files in the directories
train_noplag_tokens = read_and_tokenize(train_noplag_dir)
train_plagio_tokens = read_and_tokenize(train_plagio_dir)
test_noplag_tokens = read_and_tokenize(test_noplag_dir)
test_plagio_tokens = read_and_tokenize(test_plagio_dir)

In [33]:
# Combine the tokens and create labels
train_tokens = train_noplag_tokens + train_plagio_tokens
train_labels = [0] * len(train_noplag_tokens) + [1] * len(train_plagio_tokens)

test_tokens = test_noplag_tokens + test_plagio_tokens
test_labels = [0] * len(test_noplag_tokens) + [1] * len(test_plagio_tokens)

In [34]:
# Vectorize the training tokens
vectorizer = TfidfVectorizer(analyzer=lambda x: x)  # Pass the tokens directly
X_train = vectorizer.fit_transform(train_tokens)

# Use the fitted vectorizer to transform the test tokens
X_test = vectorizer.transform(test_tokens)

In [35]:
# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, train_labels)

In [36]:
# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_res, label=y_train_res)
dtest = xgb.DMatrix(X_test, label=test_labels)

In [37]:
# Train the XGBoost model
params = {
    'max_depth': 15,
    'eta': 0.3,
    'objective': 'binary:logistic',
    'scale_pos_weight': len(train_noplag_tokens) / len(train_plagio_tokens)  # Adjusting for imbalance
}
num_round = 100

In [38]:
bst = xgb.train(params, dtrain, num_round)

In [39]:
# Predict and evaluate
preds = bst.predict(dtest)
predictions = [1 if pred > 0.5 else 0 for pred in preds]

In [40]:
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Accuracy: 0.6682692307692307
Precision: 0.7538461538461538
Recall: 0.725925925925926
