In [2]:
import os
import pandas as pd
import re

In [3]:
# Inputs: the directory holding one sub-folder per submission pair, and the
# CSV of labels describing those pairs.
DATA_DIR = "./raw_dataset"
LABELS_PATH = "./raw_labels.csv"

# Output: the CSV of (code1, code2, label) rows written at the end of the notebook.
OUTPUT_CSV = "plagiarism_pairs.csv"

# Load the raw labels table; the following cells clean it up.
df = pd.read_csv(LABELS_PATH)

In [4]:
# Normalise the header: drop whitespace surrounding each column name.
# (The message below is printed after stripping, so it shows the cleaned names.)
df.columns = df.columns.str.strip()
print(f"Columnas originales: {df.columns.tolist()}")

# Normalise the cell values: the verdict becomes an integer and both
# submission identifiers become whitespace-trimmed strings.
df['verdict'] = df['verdict'].astype(int)
for sub_col in ('sub1', 'sub2'):
    df[sub_col] = df[sub_col].astype(str).str.strip()

# The verdict column is used directly as the label (presumably already 0/1 —
# verify against the data), and the two identifiers are joined with an
# underscore to form the pair id that later names the pair's folder.
df['plagiarized'] = df['verdict']
df['id_pair'] = df['sub1'] + '_' + df['sub2']

Columnas originales: ['sub1', 'sub2', 'problem', 'verdict']


In [5]:
# Output records: one dict per pair whose two files were read successfully.
rows = []

# Matches, tried in this order at each scan position: a double-quoted string
# literal, a single-quoted character literal, a // line comment, or a
# /* ... */ block comment (which also covers /** ... */). Literals are matched
# only so that comment markers inside them are skipped instead of being
# mistaken for the start of a comment.
_COMMENT_OR_LITERAL_RE = re.compile(
    r'"(?:\\.|[^"\\\n])*"'
    r"|'(?:\\.|[^'\\\n])*'"
    r'|//[^\n]*'
    r'|/\*[\s\S]*?\*/'
)


def remove_comments(code):
    """Strip C-style comments from source text and flatten it to one line.

    The text is scanned once from left to right, so whichever construct
    starts first wins: a ``//`` inside a string literal or inside a block
    comment is not treated as a line comment, and a ``/*`` inside a line
    comment does not open a block comment.

    Args:
        code: Source text as a single string (may be empty).

    Returns:
        The text with every ``//`` and ``/* ... */`` comment replaced by a
        single space, all whitespace runs (including newlines, and including
        whitespace inside string literals) collapsed to one space, and
        leading/trailing whitespace removed.
    """
    def _keep_literal_drop_comment(match):
        token = match.group(0)
        # Only comments start with '/'; literals start with a quote character.
        # A comment becomes one space so adjacent tokens are not glued together
        # (e.g. "int/**/x" must not turn into "intx").
        return ' ' if token.startswith('/') else token

    without_comments = _COMMENT_OR_LITERAL_RE.sub(_keep_literal_drop_comment, code)
    # Collapse all remaining whitespace so the snippet fits on a single line.
    return re.sub(r'\s+', ' ', without_comments).strip()

def _read_clean_source(path):
    """Read one source file and return its text with comments removed.

    Bytes that are not valid UTF-8 are dropped (``errors='ignore'``) rather
    than raising. Raises ``OSError`` if the file cannot be opened or read.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as fh:
        return remove_comments(fh.read())


# Build one output record per labelled pair. Each pair is expected to live in
# DATA_DIR/<id_pair>/ and to contain exactly two source files.
for pair_id, label in zip(df['id_pair'], df['plagiarized']):
    folder_path = os.path.join(DATA_DIR, pair_id)

    if not os.path.isdir(folder_path):
        print(f"⚠️ Carpeta no encontrada: {folder_path}")
        continue

    # Count only regular, non-hidden files: entries such as .DS_Store or a
    # .ipynb_checkpoints directory would otherwise make a valid pair look
    # incomplete, or be opened as if they were source code.
    # Sorting makes the code1/code2 assignment deterministic across runs.
    files = sorted(
        name
        for name in os.listdir(folder_path)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(folder_path, name))
    )
    if len(files) != 2:
        print(f"⚠️ Par incompleto en: {folder_path}")
        continue

    try:
        code1 = _read_clean_source(os.path.join(folder_path, files[0]))
        code2 = _read_clean_source(os.path.join(folder_path, files[1]))
    except OSError as e:
        # Best effort: report the unreadable pair and move on to the next one.
        print(f"❌ Error leyendo {pair_id}: {e}")
        continue

    rows.append({
        'code1': code1,
        'code2': code2,
        'label': int(label)
    })


In [6]:
# Naming the columns explicitly keeps the CSV schema stable: if no pair could
# be read (`rows` is empty) the file still gets its header line instead of
# being written with no columns at all. For non-empty `rows` the result is
# unchanged, since every record has exactly these keys in this order.
df_out = pd.DataFrame(rows, columns=['code1', 'code2', 'label'])
df_out.to_csv(OUTPUT_CSV, index=False)

print(f"✅ CSV generado con {len(df_out)} pares → {OUTPUT_CSV}")

✅ CSV generado con 911 pares → plagiarism_pairs.csv
