In [100]:
# Todos los imports
import os
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns




In [107]:
# ConPlag dataset import

def loadConPlag(train_path, test_path, labels_path):
    """Load the ConPlag train/test pair lists and attach each pair's verdict.

    Parameters
    ----------
    train_path, test_path : str or path-like
        Header-less CSV files whose single column holds pair identifiers of
        the form ``<sub1>_<sub2>``.
    labels_path : str or path-like
        CSV file with (at least) the columns ``sub1``, ``sub2`` and
        ``verdict``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The train and test frames, each with the columns ``pair`` and
        ``verdict``. The merge is a left join, so a pair that has no entry in
        the labels file is kept with a missing verdict.
    """
    # Read every identifier as text. Without an explicit dtype pandas infers
    # the type per column, so a column made only of all-digit ids would be
    # parsed as integers: leading zeros would be lost and the string
    # concatenation below would raise.
    train_pairs = pd.read_csv(train_path, header=None, names=['pair'], dtype={'pair': str})
    test_pairs = pd.read_csv(test_path, header=None, names=['pair'], dtype={'pair': str})

    # Load the labels and rebuild the same "<sub1>_<sub2>" key used by the pair files.
    labels = pd.read_csv(labels_path, dtype={'sub1': str, 'sub2': str})
    labels['pair'] = labels['sub1'] + '_' + labels['sub2']

    # Attach the verdict to each pair; how='left' keeps unlabeled pairs.
    verdicts = labels[['pair', 'verdict']]
    train_pairs = train_pairs.merge(verdicts, on='pair', how='left')
    test_pairs = test_pairs.merge(verdicts, on='pair', how='left')

    return train_pairs, test_pairs

# Load the data with the loader defined above. The earlier call used the name
# `load_pairs_and_labels`, which is not defined in the visible notebook and
# therefore raises NameError on a fresh kernel (Restart & Run All).
train_pairs, test_pairs = loadConPlag(
    'datasets/ConPlag/versions/train_pairs.csv',
    'datasets/ConPlag/versions/test_pairs.csv',
    'datasets/ConPlag/versions/labels.csv'
)

# Show the first records and confirm the columns produced by the merge
print(train_pairs.head())
print("Columnas del DataFrame después de la unión:", train_pairs.columns)
print(train_pairs[['pair', 'verdict']].head())

# Helper that loads the source code of both submissions in a pair
def load_code(submission_id, base_path='datasets/ConPlag/versions/version_2'):
    """Read the two Java sources that make up one ConPlag pair.

    Parameters
    ----------
    submission_id : str
        Pair identifier of the form ``<id1>_<id2>``. It is also used as the
        name of the pair's folder under ``base_path``.
    base_path : str, optional
        Directory that contains one folder per pair. The default is the
        location that used to be hard-coded, so existing one-argument calls
        read exactly the same files as before.

    Returns
    -------
    tuple of (str, str)
        Contents of ``<id1>.java`` and ``<id2>.java``, in that order.

    Raises
    ------
    ValueError
        If ``submission_id`` does not split into exactly two parts on ``_``.
    OSError
        If either source file cannot be opened.

    Notes
    -----
    A second function named ``load_code`` (taking a file path) is defined in
    the IR-Plag cell of this notebook; once that cell runs, the name refers
    to that function instead of this one.
    """
    code1_id, code2_id = submission_id.split('_')

    # Both files live in the pair's folder and are named after their own id.
    sources = []
    for code_id in (code1_id, code2_id):
        code_path = f'{base_path}/{submission_id}/{code_id}.java'
        with open(code_path, 'r', encoding='utf-8') as file:
            sources.append(file.read())

    return tuple(sources)

# Split each pair id into the ids of its two submissions
train_pairs[['sub1', 'sub2']] = train_pairs['pair'].str.split('_', expand=True)

# Load both sources for every training pair. The (code1, code2) tuples are
# turned into columns from a plain list instead of `.apply(pd.Series)`, which
# builds one Series object per row and fails when the frame is empty.
code_pairs = train_pairs['pair'].apply(load_code).tolist()
train_pairs[['code1', 'code2']] = pd.DataFrame(
    code_pairs, index=train_pairs.index, columns=['code1', 'code2']
)

print(train_pairs[['pair', 'verdict', 'code1', 'code2']].head())


                pair  verdict
0  2470b521_f6ca6fc8        0
1  a8e2cefc_ee270b2a        0
2  90f01508_e00b1794        0
3  16857116_f3d7ce08        1
4  51151974_c23278ec        0
Columnas del DataFrame después de la unión: Index(['pair', 'verdict'], dtype='object')
                pair  verdict
0  2470b521_f6ca6fc8        0
1  a8e2cefc_ee270b2a        0
2  90f01508_e00b1794        0
3  16857116_f3d7ce08        1
4  51151974_c23278ec        0
                pair  verdict  \
0  2470b521_f6ca6fc8        0   
1  a8e2cefc_ee270b2a        0   
2  90f01508_e00b1794        0   
3  16857116_f3d7ce08        1   
4  51151974_c23278ec        0   

                                               code1  \
0  import com.sun.security.jgss.GSSUtil;\n\nimpor...   
1  import java.io.*;\nimport java.sql.SQLOutput;\...   
2  import java.io.*; \nimport java.util.*;\n\npub...   
3  import javax.swing.plaf.IconUIResource;\nimpor...   
4  import java.io.*;\nimport java.util.*;\n publi...   

                 

In [102]:
# IR-Plag-dataset import
def _visible_entries(folder):
    """Return the names in ``folder`` that do not start with '.', sorted."""
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting dataset reproducible across machines and runs.
    return sorted(name for name in os.listdir(folder) if not name.startswith('.'))


def _read_source(file_path):
    """Return the text of ``file_path`` decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def _load_submissions(group_folder, label):
    """Read every file found one folder level below ``group_folder`` and tag it with ``label``."""
    records = []
    for folder in _visible_entries(group_folder):
        folder_path = os.path.join(group_folder, folder)
        # A stray file next to the submission folders would make listdir raise.
        if not os.path.isdir(folder_path):
            continue
        for file_name in _visible_entries(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                records.append({'type': label, 'code': _read_source(file_path)})
    return records


def load_ir_plag_dataset(base_path, case_numbers=range(1, 8)):
    """Load the IR-Plag dataset into a DataFrame with one row per source file.

    For every case folder ``case-NN`` under ``base_path`` this reads, in order:
    the first file of ``Original``, then every file one folder level below
    ``non-plagiarized``, then every file one folder level below
    ``plagiarized``. Hidden entries (names starting with '.') are ignored and
    directory listings are sorted so the row order is deterministic.

    Files are read with a private helper rather than the global ``load_code``,
    because this notebook defines two different functions under that name.

    Parameters
    ----------
    base_path : str
        Root folder of the dataset.
    case_numbers : iterable of int, optional
        Case numbers to load; defaults to 1 through 7, the range that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``type`` ('original', 'non-plagiarized' or 'plagiarized') and
        ``code`` (the file contents).

    Raises
    ------
    IndexError
        If a case's ``Original`` folder contains no regular file.
    OSError
        If an expected folder is missing or a file cannot be read.
    """
    data = []

    for case_num in case_numbers:
        case_folder = os.path.join(base_path, f'case-{case_num:02d}')

        # The original program: a single file is expected in this folder.
        original_folder = os.path.join(case_folder, 'Original')
        original_files = [
            name for name in _visible_entries(original_folder)
            if os.path.isfile(os.path.join(original_folder, name))
        ]
        original_code_path = os.path.join(original_folder, original_files[0])
        data.append({'type': 'original', 'code': _read_source(original_code_path)})

        # Labelled submissions, non-plagiarized first to keep the original row grouping.
        for label in ('non-plagiarized', 'plagiarized'):
            data.extend(_load_submissions(os.path.join(case_folder, label), label))

    return pd.DataFrame(data)

def load_code(file_path):
    """Return the complete text of ``file_path``, decoded as UTF-8.

    Note: the ConPlag cell of this notebook defines another ``load_code``
    that takes a pair id, so whichever of the two cells ran last decides
    what the name refers to.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# Build the IR-Plag DataFrame from the dataset folder
ir_plag_dataset = load_ir_plag_dataset(base_path='datasets/IR-Plag-Dataset')

# Preview the first five rows of the dataset
print(ir_plag_dataset.head(n=5))

              type                                               code
0         original  \npublic class T1 {\n\tpublic static void main...
1  non-plagiarized  /*\n * To change this license header, choose L...
2  non-plagiarized  \n/**\n *\n * @author 65FBEF05E01FAC390CB3FA07...
3  non-plagiarized  \n\n/**\n *\n * @author CB6AB3315634A1E4D11B09...
4  non-plagiarized  /*\n * To change this license header, choose L...
