In [44]:
import os
import pandas as pd
import javalang
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [None]:
data = []

base_dir = "C:/Users/adria/OneDrive - Instituto Tecnologico y de Estudios Superiores de Monterrey/AppAvanzadas/IR-Plag-Dataset/IR-Plag-Dataset"

for case_name in os.listdir(base_dir):
    case_path = os.path.join(base_dir, case_name)
    if not os.path.isdir(case_path):
        continue

    # Original
    orig_path = os.path.join(case_path, "original")
    for f in os.listdir(orig_path):
        if f.endswith(".java"):
            data.append({
                "file_path": os.path.join(orig_path, f),
                "label": "original",
                "case_id": case_name
            })

    # Non-plagiarized
    non_plag_path = os.path.join(case_path, "non-plagiarized")
    for folder in os.listdir(non_plag_path):
        subfolder = os.path.join(non_plag_path, folder)
        for f in os.listdir(subfolder):
            if f.endswith(".java"):
                data.append({
                    "file_path": os.path.join(subfolder, f),
                    "label": "non-plagiarized",
                    "case_id": case_name
                })

    # Plagiarized
    plag_path = os.path.join(case_path, "plagiarized")
    for level in os.listdir(plag_path):  # L1 to L6
        level_path = os.path.join(plag_path, level)
        for folder in os.listdir(level_path):
            file_folder = os.path.join(level_path, folder)
            for f in os.listdir(file_folder):
                if f.endswith(".java"):
                    data.append({
                        "file_path": os.path.join(file_folder, f),
                        "label": "plagiarized",
                        "case_id": case_name
                    })

df = pd.DataFrame(data)
print(df.head())
print(f"Total archivos: {len(df)}")


                                           file_path            label  case_id
0  C:/Users/adria/OneDrive - Instituto Tecnologic...         original  case-01
1  C:/Users/adria/OneDrive - Instituto Tecnologic...  non-plagiarized  case-01
2  C:/Users/adria/OneDrive - Instituto Tecnologic...  non-plagiarized  case-01
3  C:/Users/adria/OneDrive - Instituto Tecnologic...  non-plagiarized  case-01
4  C:/Users/adria/OneDrive - Instituto Tecnologic...  non-plagiarized  case-01
Total archivos: 467


In [23]:
def print_detailed_counts(df):
    # Conteo total
    total_files = len(df)
    print(f"\nTOTAL ARCHIVOS: {total_files}")
    
    # Conteo por categoría principal
    print("\nCATEGORÍA PRINCIPAL:")
    main_counts = df['label'].value_counts()
    print(main_counts)
    
    # Detalle para plagiados (por nivel)
    if 'plagiarism_level' in df.columns:
        print("\nNIVELES DE PLAGIO:")
        plag_counts = df[df['label'] == 'plagiarized']['plagiarism_level'].value_counts().sort_index()
        print(plag_counts)
    
    # Detalle por caso (case-01, case-02, etc.)
    print("\nPOR CASO:")
    case_counts = df.groupby('case_id')['label'].value_counts().unstack(fill_value=0)
    print(case_counts)

# Ejecutar la función
print_detailed_counts(df)


TOTAL ARCHIVOS: 467

CATEGORÍA PRINCIPAL:
label
plagiarized        355
non-plagiarized    105
original             7
Name: count, dtype: int64

POR CASO:
label    non-plagiarized  original  plagiarized
case_id                                        
case-01               15         1           40
case-02               15         1           54
case-03               15         1           52
case-04               15         1           54
case-05               15         1           53
case-06               15         1           51
case-07               15         1           51


In [24]:
# Entrenamiento con case-01 a case-05
train_cases = ['case-01', 'case-02', 'case-03', 'case-04', 'case-05']
# Prueba con case-06 y case-07
test_cases = ['case-06', 'case-07']

train_df = df[df['case_id'].isin(train_cases)].reset_index(drop=True)
test_df = df[df['case_id'].isin(test_cases)].reset_index(drop=True)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")
print(train_df['label'].value_counts())
print(test_df['label'].value_counts())


Train size: 333
Test size: 134
label
plagiarized        253
non-plagiarized     75
original             5
Name: count, dtype: int64
label
plagiarized        102
non-plagiarized     30
original             2
Name: count, dtype: int64


In [25]:
import os

split_dir = "data"
os.makedirs(split_dir, exist_ok=True)

# Guarda los archivos
train_df.to_csv(os.path.join(split_dir, "train.csv"), index=False)
test_df.to_csv(os.path.join(split_dir, "test.csv"), index=False)


In [30]:
train_df = pd.read_csv("data/train.csv")


In [34]:
# Ejemplo de cómo leer un archivo Java y extraer tokens AST
def read_java_code(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

def get_ast_tokens(code):
    try:
        tree = javalang.parse.parse(code)
        tokens = []
        for path, node in tree:
            tokens.append(type(node).__name__)  # e.g., MethodDeclaration, IfStatement
        return tokens
    except:
        return []  # fallback in caso de error de parseo


In [35]:
def ast_token_string(file_path):
    code = read_java_code(file_path)
    tokens = get_ast_tokens(code)
    return ' '.join(tokens)

# Aplica a todos los archivos del train
train_df['ast_tokens'] = train_df['file_path'].apply(ast_token_string)

# Vectoriza
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['ast_tokens'])

# Etiquetas
y_train = train_df['label'].apply(lambda x: 1 if x == 'plagiarized' else 0)


In [36]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [38]:
# Evaluate my model in train set

y_train_pred = model.predict(X_train)
print("Train Classification Report")
print(classification_report(y_train, y_train_pred))
print("Train Confusion Matrix")
print(confusion_matrix(y_train, y_train_pred))


Train Classification Report
              precision    recall  f1-score   support

           0       0.99      0.88      0.93        80
           1       0.96      1.00      0.98       253

    accuracy                           0.97       333
   macro avg       0.97      0.94      0.95       333
weighted avg       0.97      0.97      0.97       333

Train Confusion Matrix
[[ 70  10]
 [  1 252]]


In [39]:
test_df = pd.read_csv("data/test.csv")


In [40]:
# Generamos los tokens AST para cada archivo
test_df['ast_tokens'] = test_df['file_path'].apply(ast_token_string)


In [41]:
# IMPORTANTE: solo transform, no fit_transform
X_test = vectorizer.transform(test_df['ast_tokens'])
y_test = test_df['label'].apply(lambda x: 1 if x == 'plagiarized' else 0)


In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Predicciones
y_pred = model.predict(X_test)

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["non-plagiarized", "plagiarized"]))


Accuracy: 0.2537313432835821
Precision: 1.0
Recall: 0.0196078431372549
F1 Score: 0.038461538461538464

Confusion Matrix:
 [[ 32   0]
 [100   2]]

Classification Report:
                  precision    recall  f1-score   support

non-plagiarized       0.24      1.00      0.39        32
    plagiarized       1.00      0.02      0.04       102

       accuracy                           0.25       134
      macro avg       0.62      0.51      0.21       134
   weighted avg       0.82      0.25      0.12       134

