In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from typing import List, Dict

class CodeSplitter:
    """Splitter specialized for source code documents.

    Thin wrapper around ``RecursiveCharacterTextSplitter.from_language`` that
    remembers the language and chunking settings it was built with.

    Args:
        language: Language identifier forwarded to ``from_language``
            (this file uses "python", "js" and "java").
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
    """

    def __init__(
        self,
        language: str = "python",
        *,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        self.language = language
        # Keyword-only so existing positional callers (language only) are
        # unaffected; defaults reproduce the previously hard-coded values.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = self._create_language_splitter()

    def _create_language_splitter(self):
        """Create a splitter for the configured programming language."""
        return RecursiveCharacterTextSplitter.from_language(
            language=self.language,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

    def split_code(self, code: str) -> List[str]:
        """Split ``code`` into chunks using the language-aware splitter.

        Args:
            code: Source text to split.

        Returns:
            Whatever ``self.splitter.split_text`` returns for ``code``
            (a list of chunk strings).
        """
        return self.splitter.split_text(code)

In [2]:
def test_code_splitting():
    """Split a small Python class with ``CodeSplitter`` and print every chunk."""
    import textwrap

    # Python code example.
    # The literal is indented to line up with this function body, so each
    # line starts with 4 spaces. Dedent it before splitting: the language
    # separators are anchored at the start of a line (e.g. "\nclass ",
    # "\ndef "), and would not match indented top-level definitions.
    python_code = textwrap.dedent('''
    class DataProcessor:
        """Handle data processing operations."""

        def __init__(self, data_path: str):
            self.data_path = data_path
            self.processed = False

        def process(self):
            """Process the data."""
            print(f"Processing data from {self.data_path}")
            self.processed = True
    ''')

    python_splitter = CodeSplitter(language="python")
    chunks = python_splitter.split_code(python_code)

    for i, chunk in enumerate(chunks, 1):
        print(f"\nChunk {i}:")
        print(chunk)

**Code Text Splitting: Maintaining Programming Logic**

In [None]:
def test_multilanguage_support():
    """Split Python, JavaScript and Java samples and print the chunks.

    Each sample is split by a ``CodeSplitter`` created for its language key
    ("python", "js", "java").
    """
    import textwrap

    # Python Example
    python_code = '''
    @dataclass
    class DataProcessor:
        """Process data with validation."""
        data_path: str

        def process(self) -> None:
            """Process the data with logging."""
            logger.info(f"Processing {self.data_path}")
            self._validate()
            self._transform()
    '''

    # JavaScript Example (using 'js' instead of 'javascript')
    js_code = '''
    class ApiClient {
        constructor(baseUrl) {
            this.baseUrl = baseUrl;
            this.headers = new Headers();
        }

        async fetchData() {
            const response = await fetch(this.baseUrl);
            return response.json();
        }
    }
    '''

    # Java Example
    java_code = '''
    public class DataService {
        private final String connectionUrl;

        public DataService(String connectionUrl) {
            this.connectionUrl = connectionUrl;
        }

        public void processData() throws Exception {
            try (Connection conn = getConnection()) {
                // Process data
            }
        }
    }
    '''

    # Process each language
    languages = {
        "python": python_code,
        "js": js_code,
        "java": java_code,
    }

    for lang, code in languages.items():
        print(f"\nTesting {lang.upper()} splitting:")
        splitter = CodeSplitter(language=lang)
        # The literals above are indented to match this function body.
        # Dedent first so top-level definitions start at column 0, where the
        # line-anchored language separators can match them.
        chunks = splitter.split_code(textwrap.dedent(code))
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(chunk)

def test_complex_structures():
    """Split Python samples with nested functions and decorators.

    Each sample is dedented, split with a Python ``CodeSplitter`` and its
    chunks are printed. Previously the samples were defined but never used.
    """
    import textwrap

    # Nested functions and closures
    nested_code = '''
    def outer_function(x):
        """Handle outer logic."""

        def inner_function(y):
            """Process inner calculations."""
            return x + y

        return lambda z: inner_function(z)

    # Usage example
    calculator = outer_function(10)
    result = calculator(5)
    '''

    # Decorators
    decorator_code = '''
    def log_execution(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            print(f"Executing {func.__name__}")
            result = func(*args, **kwargs)
            print(f"Completed {func.__name__}")
            return result
        return wrapper

    @log_execution
    def process_data(data):
        """Process the input data."""
        return data.transform()
    '''

    samples = {
        "nested functions": nested_code,
        "decorators": decorator_code,
    }

    splitter = CodeSplitter(language="python")
    for label, code in samples.items():
        print(f"\nTesting {label.upper()} splitting:")
        # Literals are indented to match this function body; dedent so the
        # line-anchored separators see definitions at column 0.
        chunks = splitter.split_code(textwrap.dedent(code))
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(chunk)

def test_documentation_handling():
    """Split a heavily documented Python function and print the chunks.

    The sample combines a multi-line signature with type hints and a
    docstring with Args/Returns/Raises sections. Previously the sample was
    defined but never passed to the splitter.
    """
    import textwrap

    # Docstrings and type hints
    documented_code = '''
    def calculate_metrics(
        data: pd.DataFrame,
        metrics: List[str],
        groupby: Optional[str] = None
    ) -> Dict[str, float]:
        """
        Calculate specified metrics from the dataset.

        Args:
            data: Input DataFrame
            metrics: List of metrics to calculate
            groupby: Optional grouping column

        Returns:
            Dictionary of metric names and values

        Raises:
            ValueError: If invalid metric specified
        """
        results = {}
        # Implementation
        return results
    '''

    print("\nTesting DOCUMENTATION splitting:")
    splitter = CodeSplitter(language="python")
    # The literal is indented to match this function body; dedent so the
    # line-anchored separators see the definition at column 0.
    chunks = splitter.split_code(textwrap.dedent(documented_code))
    for i, chunk in enumerate(chunks, 1):
        print(f"\nChunk {i}:")
        print(chunk)

def test_special_cases():
    """Split template, embedded-DSL and C++ preprocessor samples.

    Each sample is dedented, split with a ``CodeSplitter`` for its language
    ("python" for the first two, "cpp" for the header) and its chunks are
    printed. Previously the samples were defined but never used.
    """
    import textwrap

    # Template code
    template_code = '''
    def generate_query(table_name: str) -> str:
        return f"""
            SELECT *
            FROM {table_name}
            WHERE created_at >= '{{start_date}}'
            AND status = '{{status}}'
        """
    '''

    # Embedded DSL
    dsl_code = '''
    from sqlalchemy import select

    query = (
        select(User.id, User.name)
        .where(User.active == True)
        .order_by(User.created_at.desc())
    )
    '''

    # Preprocessor directives
    cpp_code = '''
    #ifndef DATA_PROCESSOR_H
    #define DATA_PROCESSOR_H

    class DataProcessor {
    public:
        DataProcessor();
        void process();
    private:
        void validate();
    };

    #endif
    '''

    # (label, language passed to CodeSplitter, sample source)
    samples = [
        ("template", "python", template_code),
        ("embedded dsl", "python", dsl_code),
        ("preprocessor", "cpp", cpp_code),
    ]

    for label, lang, code in samples:
        print(f"\nTesting {label.upper()} splitting:")
        splitter = CodeSplitter(language=lang)
        # Literals are indented to match this function body; dedent so the
        # line-anchored separators see definitions at column 0.
        chunks = splitter.split_code(textwrap.dedent(code))
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(chunk)

def run_all_examples():
    """Print a banner, then invoke each example function in order."""
    print("Testing Code Splitting Examples...")
    examples = (
        test_multilanguage_support,
        test_complex_structures,
        test_documentation_handling,
        test_special_cases,
    )
    for example in examples:
        example()

# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_all_examples()