In [68]:
import os
import shutil
import re
import json
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from pygments import lex
from pygments.lexers import CppLexer
from pygments.token import Token

In [69]:
# Function for extracting c++ files
def filter_cpp_files(input_folder, output_folder):
    """Copy every C++ source/header file under ``input_folder`` into ``output_folder``.

    The tree below ``input_folder`` is walked recursively and each file whose
    name ends in a known C++ extension is copied into one flat folder, so the
    sub-directory structure is not preserved. If two files in different
    sub-directories share a name, the one visited later overwrites the
    earlier copy.

    Args:
        input_folder: Root directory to search.
        output_folder: Destination directory; created if it does not exist.
            It may be ``input_folder`` itself or live inside it.

    Returns:
        None. A ``Copied: <src> -> <dst>`` line is printed for each file.
    """
    # str.endswith accepts a tuple, so a single call tests every extension
    cpp_extensions = ('.cpp', '.hpp', '.h', '.cc', '.cxx')

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_folder, exist_ok=True)

    for root, _, files in os.walk(input_folder):
        for file in files:
            if not file.endswith(cpp_extensions):
                continue

            input_file_path = os.path.join(root, file)
            output_file_path = os.path.join(output_folder, file)

            # When the output folder is inside (or equal to) input_folder the
            # walk also visits it; copying a file onto itself would make
            # shutil.copy raise SameFileError, so such files are skipped.
            if os.path.exists(output_file_path) and os.path.samefile(
                input_file_path, output_file_path
            ):
                continue

            shutil.copy(input_file_path, output_file_path)
            print(f"Copied: {input_file_path} -> {output_file_path}")

In [70]:
# Gather the C++ sources found under the input tree into one flat folder
input_folder, output_folder = "data/leveldb-main", "data/cpp_files"
filter_cpp_files(input_folder=input_folder, output_folder=output_folder)

Copied: data/leveldb-main\benchmarks\db_bench.cc -> data/cpp_files\db_bench.cc
Copied: data/leveldb-main\benchmarks\db_bench_log.cc -> data/cpp_files\db_bench_log.cc
Copied: data/leveldb-main\benchmarks\db_bench_sqlite3.cc -> data/cpp_files\db_bench_sqlite3.cc
Copied: data/leveldb-main\benchmarks\db_bench_tree_db.cc -> data/cpp_files\db_bench_tree_db.cc
Copied: data/leveldb-main\db\autocompact_test.cc -> data/cpp_files\autocompact_test.cc
Copied: data/leveldb-main\db\builder.cc -> data/cpp_files\builder.cc
Copied: data/leveldb-main\db\builder.h -> data/cpp_files\builder.h
Copied: data/leveldb-main\db\c.cc -> data/cpp_files\c.cc
Copied: data/leveldb-main\db\corruption_test.cc -> data/cpp_files\corruption_test.cc
Copied: data/leveldb-main\db\dbformat.cc -> data/cpp_files\dbformat.cc
Copied: data/leveldb-main\db\dbformat.h -> data/cpp_files\dbformat.h
Copied: data/leveldb-main\db\dbformat_test.cc -> data/cpp_files\dbformat_test.cc
Copied: data/leveldb-main\db\db_impl.cc -> data/cpp_files\

In [71]:
import re

def tokenize_cpp_code(code):
    """Split C++ source text into a flat token list that also records layout.

    Each line is scanned left to right. Identifiers, runs of digits,
    double-quoted string literals (backslash escapes included) and single
    punctuation characters become tokens; any other non-whitespace character
    (``#``, ``'``, ``\\`` ...) is kept as a one-character token rather than
    being dropped. Every space or tab that precedes a token on its line is
    emitted as ``"<space>"`` and every line feed as ``"<newline>"``.

    Whitespace after the last token of a line (including whitespace-only
    lines) is not recorded, and tabs are not distinguished from spaces.

    Args:
        code: The C++ source as a single string.

    Returns:
        list[str]: Tokens and ``<space>`` / ``<newline>`` markers in source
        order. An empty string yields an empty list.
    """
    # Alternatives are tried in order. '-' is escaped inside the character
    # class so it is a literal rather than forming the range '+'..'/'. The
    # trailing \S is a catch-all so no visible character is silently lost.
    token_pattern = re.compile(
        r'[a-zA-Z_][a-zA-Z0-9_]*'           # identifier / keyword
        r'|[{}()\[\];,.<>:=+\-/*&|^%!~?]'   # one punctuation character
        r'|"(?:\\.|[^"\\])*"'               # string literal, honouring \" escapes
        r'|\d+'                             # run of digits
        r'|\S'                              # anything else that is not whitespace
    )

    tokens_with_spacing = []

    # keepends=True so each line still carries its terminator
    for line in code.splitlines(keepends=True):
        position = 0  # end of the previous token within this line

        # finditer supplies exact offsets, so no second text search is needed
        for match in token_pattern.finditer(line):
            gap = line[position:match.start()]
            # One marker per space/tab; other whitespace in the gap is ignored
            tokens_with_spacing.extend("<space>" for char in gap if char in ' \t')

            tokens_with_spacing.append(match.group())
            position = match.end()

        # A line produced by splitlines holds at most one line feed
        if line.endswith("\n"):
            tokens_with_spacing.append("<newline>")

    return tokens_with_spacing

In [72]:
# Function to process a folder of C++ files
def preprocess_cpp_folder(folder_path, output_file):
    """Tokenize every C++ file under ``folder_path`` and write one line per file.

    The folder is walked recursively in sorted order so the output is
    reproducible. Each matching file is read as UTF-8, passed through
    ``tokenize_cpp_code``, and its tokens are joined with single spaces into
    one line of ``output_file``.

    Args:
        folder_path: Directory containing the C++ files to process.
        output_file: Path of the text file to write (UTF-8). Its parent
            directory is created if it does not exist.

    Returns:
        None. If no matching file is found an empty output file is written.

    Example:
        preprocess_cpp_folder('path/to/cpp/files', 'output.txt')
    """
    # Same extension set that filter_cpp_files copies. Checking only '.cpp'
    # and '.h' silently skipped every '.cc', '.cxx' and '.hpp' file.
    cpp_extensions = ('.cpp', '.hpp', '.h', '.cc', '.cxx')

    all_files_data = []

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # os.walk honours in-place edits: fixes the traversal order
        for file in sorted(files):
            if not file.endswith(cpp_extensions):
                continue

            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                code = f.read()

            # Keep the file name next to its tokens for traceability
            all_files_data.append({
                'file_name': file,
                'tokens': tokenize_cpp_code(code)
            })

    # Make sure the destination directory exists before opening the file
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Written only after every file was tokenized, so a failure above does
    # not leave a partial output file behind
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for file_data in all_files_data:
            # One line per source file, tokens separated by a single space
            out_f.write(' '.join(file_data['tokens']) + '\n')


In [73]:
# Tokenize the collected C++ files into a single text file
folder_path, output_file = "data/cpp_files", "data/preprocessed_cpp_data.txt"
preprocess_cpp_folder(folder_path=folder_path, output_file=output_file)

print(f"Preprocessed C++ files saved to {output_file}")

Preprocessed C++ files saved to data/preprocessed_cpp_data.txt
