In [1]:
import os

import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

Model training complete.


In [None]:
# Define paths.
# The defaults are the original author's local checkout. Set the
# PLAGIARISM_DATA_PATH / PLAGIARISM_LABELS_PATH environment variables to run
# the notebook on another machine without editing this cell.
data_path = os.environ.get(
    'PLAGIARISM_DATA_PATH',
    r'C:\Users\droid\Documents\Aplicaciones_Avanzadas\Proyecto\PlagiarismDetector\finalDataset\javafiles',
)
labels_path = os.environ.get(
    'PLAGIARISM_LABELS_PATH',
    r'C:\Users\droid\Documents\Aplicaciones_Avanzadas\Proyecto\PlagiarismDetector\finalDataset\unify_labels\javafiles_labels.csv',
)

In [None]:
# Load the labels
df_labels = pd.read_csv(labels_path)

# Fail fast if the CSV lacks a column the later cells rely on:
# `veredict` is used for class balancing, `sub1`/`sub2` to locate the Java files.
_missing_columns = {'veredict', 'sub1', 'sub2'} - set(df_labels.columns)
if _missing_columns:
    raise ValueError(
        f"{labels_path} is missing required column(s): {sorted(_missing_columns)}"
    )

In [None]:
# Partition the labelled pairs by class so the two groups can be balanced.
# Rows labelled 0 are treated as the majority class, rows labelled 1 as the minority.
df_majority = df_labels.loc[df_labels['veredict'] == 0]
df_minority = df_labels.loc[df_labels['veredict'] == 1]

In [None]:
# Draw minority rows with replacement until the minority class has as many
# rows as the majority class; the fixed seed makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    n_samples=len(df_majority),
    replace=True,
    random_state=123,
)

In [None]:
# Stack the majority rows and the upsampled minority rows into a single frame
df_balanced = pd.concat([df_majority, df_minority_upsampled], axis=0)

In [None]:
# Shuffle the rows: sampling the whole frame (frac=1) returns it in a seeded
# random order, and the index is then rebuilt as 0..n-1.
df_balanced = (
    df_balanced
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Read the two Java submissions of a pair and join them into one text
def read_java_files(sub1, sub2, base_dir=None):
    """Return the source of two Java submissions joined by a single space.

    Args:
        sub1: Identifier of the first submission; the file read is
            ``<base_dir>/<sub1>.java``.
        sub2: Identifier of the second submission, resolved the same way.
        base_dir: Directory that holds the ``.java`` files. Defaults to the
            notebook-level ``data_path``.

    Returns:
        ``content1 + " " + content2``. If either file does not exist, both
        contents are treated as empty and the result is ``" "``.
    """
    if base_dir is None:
        base_dir = data_path

    contents = []
    try:
        for submission in (sub1, sub2):
            java_path = os.path.join(base_dir, f"{submission}.java")
            # errors='replace': a file containing bytes that are not valid
            # UTF-8 is still read (bad bytes become U+FFFD) instead of
            # raising UnicodeDecodeError and aborting the whole dataset load.
            with open(java_path, 'r', encoding='utf-8', errors='replace') as file:
                contents.append(file.read())
    except FileNotFoundError:
        # A pair with a missing file contributes no text at all.
        contents = ["", ""]

    return " ".join(contents)

In [None]:
# Build the combined source text for every labelled pair.
# The two id columns are iterated directly instead of using
# DataFrame.apply(axis=1): row-wise apply packs each row into a single Series,
# which can upcast an integer id to float (12 -> 12.0, i.e. "12.0.java") when
# the frame also holds float columns, and it is considerably slower.
df_balanced['text'] = [
    read_java_files(sub1, sub2)
    for sub1, sub2 in zip(df_balanced['sub1'], df_balanced['sub2'])
]

In [None]:
# Feature extraction: learn a vocabulary capped at 1000 terms from the pair
# texts, then encode every text as a dense row of term counts.
vectorizer = CountVectorizer(max_features=1000)  # Adjust number of features
vectorizer.fit(df_balanced['text'])
X = vectorizer.transform(df_balanced['text']).toarray()

# Target vector: the label column as a plain array
y = df_balanced['veredict'].values

In [None]:
# Split the data into training and testing sets.
# df_balanced holds minority pairs duplicated by the upsampling step, so a
# plain random row split can put copies of the same (sub1, sub2) pair on both
# sides and inflate the test metrics. Splitting by pair keeps every copy of a
# pair on one side. Note that test_size is now the fraction of distinct pairs,
# not of rows.
pair_ids = df_balanced['sub1'].astype(str) + '|' + df_balanced['sub2'].astype(str)
splitter = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=pair_ids))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [None]:
# Create the (not yet fitted) XGBoost classifier, evaluated with log-loss
xgb_model = xgb.XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)

In [None]:
# Fit the classifier on the training split
xgb_model.fit(X=X_train, y=y_train)

In [None]:
# Model is now trained and can be used for predictions or evaluation.
# The message is printed unconditionally: it reports that this cell ran, not
# any measure of model quality.
print("Model training complete.")

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
import os

# Reuse the model and vectorizer fitted by the training cells above.
# Creating fresh `xgb.XGBClassifier()` / `CountVectorizer()` objects here would
# rebind both names to unfitted instances, and every predict/transform call in
# the functions below would then fail.
try:
    xgb_model, vectorizer
except NameError as exc:
    raise RuntimeError(
        "xgb_model / vectorizer are not defined: run the training cells first."
    ) from exc

# Define function to evaluate the model
def evaluate_model(X_train, y_train, X_test, y_test, model=None):
    """Print accuracy and log-loss for both splits plus the test confusion matrix.

    Args:
        X_train: Feature matrix of the training split.
        y_train: True labels of the training split.
        X_test: Feature matrix of the test split.
        y_test: True labels of the test split.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            Defaults to the notebook-level ``xgb_model``.

    Returns:
        None. All results are printed.
    """
    if model is None:
        model = xgb_model

    # Pass the class list explicitly when the model exposes it. Without it,
    # log_loss raises if a split happens to contain a single class, and the
    # confusion matrix changes shape with the classes present in the split.
    labels = getattr(model, "classes_", None)

    def _score(features, targets):
        """Return (predictions, accuracy, log-loss) for one split."""
        predictions = model.predict(features)
        # Column 1 holds the probability of the positive class.
        positive_proba = model.predict_proba(features)[:, 1]
        accuracy = accuracy_score(targets, predictions)
        loss = log_loss(targets, positive_proba, labels=labels)
        return predictions, accuracy, loss

    _, train_accuracy, train_loss = _score(X_train, y_train)
    test_preds, test_accuracy, test_loss = _score(X_test, y_test)

    # Confusion matrix on the held-out split only
    cm = confusion_matrix(y_test, test_preds, labels=labels)

    print("Training Accuracy:", train_accuracy)
    print("Training Loss:", train_loss)
    print("Test Accuracy:", test_accuracy)
    print("Test Loss:", test_loss)
    print("Confusion Matrix:\n", cm)

# Example of using evaluate_model function
# evaluate_model(X_train, y_train, X_test, y_test)  # Uncomment and use actual data

# Function to test the model with new Java files
def test_new_java_files(file_path1, file_path2):
    # Read the Java files
    try:
        with open(file_path1, 'r', encoding='utf-8') as file:
            content1 = file.read()
        with open(file_path2, 'r', encoding='utf-8') as file:
            content2 = file.read()
        content = content1 + " " + content2
    except FileNotFoundError:
        return "File not found."
    
    # Process the content through the vectorizer
    content_vector = vectorizer.transform([content]).toarray()
    
    # Make prediction
    prediction = xgb_model.predict(content_vector)
    prediction_proba = xgb_model.predict_proba(content_vector)[0, 1]
    
    return f"Prediction: {'Plagiarism' if prediction[0] == 1 else 'No Plagiarism'}, Probability: {prediction_proba:.4f}"

# Example of testing new Java files
# result = test_new_java_files('path_to_file1.java', 'path_to_file2.java')
# print(result)