In [13]:
import os
import csv
import re
from pathlib import Path

# Matches either a quoted literal (kept) or a comment (dropped). Literals are
# listed first so that comment markers inside them are consumed as part of
# the literal and never seen as the start of a comment.
_STRING_OR_COMMENT = re.compile(
    r'(?P<string>"(?:\\.|[^"\\\n])*"'   # double-quoted literal on one line
    r"|'(?:\\.|[^'\\\n])*')"            # single-quoted literal on one line
    r'|/\*.*?\*/'                       # /* block comment */
    r'|//[^\n]*'                        # // line comment
    r'|#[^\n]*'                         # # line comment
    r'|<!--.*?-->',                     # <!-- HTML comment -->
    re.DOTALL,
)


def clean_code(code):
    """
    Strip comments from source text and normalise its whitespace.

    Removed:
    1. Comments: ``/* ... */``, ``// ...``, ``# ...`` and ``<!-- ... -->``
    2. Runs of whitespace (including newlines), each collapsed to one space
    3. Leading/trailing whitespace

    The result is therefore always a single line. Comment markers that appear
    inside a single- or double-quoted literal (for example ``"#"`` or
    ``"http://host"``) are left untouched instead of truncating the literal.
    Detection of literals is a language-agnostic heuristic: a literal must open
    and close on the same line, otherwise the quote is treated as plain text.

    Args:
        code: Source text to clean.

    Returns:
        The cleaned text as a single-line string; ``""`` when nothing but
        comments and whitespace was present.
    """
    def _keep_literal(match):
        # The 'string' group is None when the match was a comment.
        return match.group('string') or ''

    without_comments = _STRING_OR_COMMENT.sub(_keep_literal, code)

    # split() with no arguments splits on any whitespace run and drops the
    # leading/trailing ones, so joining with one space collapses and trims.
    return ' '.join(without_comments.split())

def read_file_content(file_path):
    """
    Read a UTF-8 text file and return its contents passed through clean_code.

    This is best-effort: any exception raised while opening, decoding or
    cleaning the file is reported on stdout and an empty string is returned
    instead of propagating the error.

    Args:
        file_path: Path of the file to read.

    Returns:
        The cleaned file contents, or "" if the file could not be processed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        cleaned = clean_code(raw_text)
    except Exception as err:
        print(f"Error reading file {file_path}: {err}")
        return ""
    return cleaned

def find_code_files(directory):
    """
    Recursively find all non-hidden files in a directory.

    A file is skipped when its own name, or the name of any directory between
    it and ``directory``, starts with a dot. Only the part of the path below
    ``directory`` is inspected, so a search root that itself lives under a
    hidden directory is still searched normally.

    Args:
        directory: Root directory to search (str or Path).

    Returns:
        A sorted list of Path objects, so the result does not depend on the
        order in which the filesystem happens to enumerate entries.
    """
    root = Path(directory)
    visible_files = []
    for path in root.rglob('*'):
        if not path.is_file():
            continue
        relative_parts = path.relative_to(root).parts
        if any(part.startswith('.') for part in relative_parts):
            continue
        visible_files.append(path)
    return sorted(visible_files)

def _read_non_empty(files):
    """Read each file once and return the cleaned contents that are non-empty."""
    contents = (read_file_content(path) for path in files)
    return [text for text in contents if text]


def generate_pairs(base_dir):
    """
    Build labelled code pairs from every ``case-*`` directory under base_dir.

    For each case, every file under ``original`` is paired with every file
    under ``plagiarized`` (label 1) and with every file under
    ``non-plagiarized`` (label 0). Files whose cleaned content is empty, or
    that could not be read, take part in no pair. Cases without an
    ``original`` directory are skipped. Progress is printed per case.

    Args:
        base_dir: Directory containing the ``case-*`` sub-directories.

    Returns:
        A list of dicts with keys ``code1`` (original), ``code2`` (other file)
        and ``label``; plagiarized pairs of a case precede its
        non-plagiarized pairs.
    """
    pairs = []

    for case_dir in sorted(Path(base_dir).glob('case-*')):
        print(f"Processing {case_dir.name}...")

        original_dir = case_dir / 'original'
        if not original_dir.exists():
            continue

        # Read and clean every file a single time instead of once per pair.
        original_codes = _read_non_empty(find_code_files(original_dir))

        for subdir_name, label in (('plagiarized', 1), ('non-plagiarized', 0)):
            other_dir = case_dir / subdir_name
            if not other_dir.exists():
                continue
            other_codes = _read_non_empty(find_code_files(other_dir))
            pairs.extend(
                {'code1': original, 'code2': other, 'label': label}
                for original in original_codes
                for other in other_codes
            )

    return pairs

def main():
    """
    Generate ``code_pairs.csv`` from the IR-Plag-Dataset directory.

    The base directory is the current working directory, or its parent when
    the working directory is itself named ``scripts``. The dataset is expected
    at ``<base>/IR-Plag-Dataset`` and the CSV (columns ``code1``, ``code2``,
    ``label``) is written to ``<base>/code_pairs.csv``.

    Raises:
        FileNotFoundError: If the IR-Plag-Dataset directory does not exist.
    """
    current_dir = Path.cwd()
    # Compare only the last path component: a substring test on the whole
    # path would also fire for ancestors or names such as 'scripts-archive'.
    base_dir = current_dir.parent if current_dir.name == 'scripts' else current_dir

    dataset_dir = base_dir / 'IR-Plag-Dataset'
    if not dataset_dir.exists():
        raise FileNotFoundError(f"Could not find IR-Plag-Dataset directory at {dataset_dir}")

    pairs = generate_pairs(dataset_dir)

    output_file = base_dir / 'code_pairs.csv'
    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['code1', 'code2', 'label'])
        writer.writeheader()
        writer.writerows(pairs)

    print(f"Generated {len(pairs)} pairs in {output_file}")


if __name__ == '__main__':
    main()

Processing case-01...
Processing case-02...
Processing case-03...
Processing case-04...
Processing case-05...
Processing case-06...
Processing case-07...
Generated 460 pairs in /Users/danielhurtado/Library/CloudStorage/GoogleDrive-a01707774@tec.mx/My Drive/Tec/FJ25/IA/reto-tc3002b/code_pairs.csv
