# Manual chunking

In [5]:
import re

def divide_code_into_functions(code):
    """Split Python source text into chunks that each begin at a function definition.

    A new chunk starts at every line whose first non-blank token is ``def``
    (optionally preceded by ``async``). Anchoring the split to the start of
    a line prevents false splits on the text "def " appearing inside an
    identifier, string literal or comment (for example ``'undef '``), and
    keeps ``async def`` together in one chunk.

    Any text that follows a definition and contains no further definition
    (such as a trailing ``if __name__ == "__main__":`` block) stays attached
    to the preceding chunk.

    Args:
        code: Python source code as a single string.

    Returns:
        A list of non-empty chunks in source order. Pieces that consist only
        of whitespace are dropped, so an empty input yields an empty list.
    """
    # Zero-width match at the start of each definition line; the lookahead keeps
    # the matched text (indentation, optional "async", "def") in the chunk it opens.
    boundary = re.compile(r'^(?=[ \t]*(?:async[ \t]+)?def[ \t])', re.MULTILINE)

    # Discard whitespace-only pieces, e.g. the blank text before the first definition.
    return [chunk for chunk in boundary.split(code) if chunk.strip()]

# Sample Python-like source used as the chunking input; a later cell passes the
# same `code` string to the LangChain splitter.
# NOTE: the description line inside each function is bare text rather than a
# quoted docstring, so this sample is not syntactically valid Python. In the
# visible cells it is only ever processed as text, never executed.
code = """
def example_function():
    This function adds two numbers.
    a = 10
    b = 20
    result = a + b
    return result

def another_function(x, y):
    This function multiplies two numbers.
    return x * y

def main():
    Main function to run the example code.
    # Calling the example function
    sum_result = example_function()
    print(f"The sum is: {sum_result}")

    # Calling the another function
    product_result = another_function(5, 7)
    print(f"The product is: {product_result}")

if __name__ == "__main__":
    main()
"""

# Break the sample source into one chunk per function definition.
function_chunks = divide_code_into_functions(code)

# Emit every chunk under a numbered heading, followed by a separator rule.
separator = "\n" + "=" * 60 + "\n"
for number, chunk in enumerate(function_chunks, start=1):
    print(f"Chunk {number}:\n")
    print(chunk)
    print(separator)


Chunk 1:

def example_function():
    This function adds two numbers.
    a = 10
    b = 20
    result = a + b
    return result




Chunk 2:

def another_function(x, y):
    This function multiplies two numbers.
    return x * y




Chunk 3:

def main():
    Main function to run the example code.
    # Calling the example function
    sum_result = example_function()
    print(f"The sum is: {sum_result}")

    # Calling the another function
    product_result = another_function(5, 7)
    print(f"The product is: {product_result}")

if __name__ == "__main__":
    main()





In [1]:
!pip install llama_index



In [2]:
!pip install tree-sitter==0.21.3 tree-sitter-languages==1.10.2



In [4]:
!pip install tree-sitter-python

Collecting tree-sitter-python
  Downloading tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (112 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.3/112.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree-sitter-python
Successfully installed tree-sitter-python-0.23.6


In [3]:
from llama_index.core.node_parser import SimpleFileNodeParser
from llama_index.readers.file import FlatReader
from pathlib import Path

# Read the sample script from disk with FlatReader.
# NOTE(review): "/content/..." is an absolute, Colab-style path — adjust when running elsewhere.
sample_code_path = Path("/content/sample_code.py")
md_docs = FlatReader().load_data(sample_code_path)

# Turn the loaded document(s) into nodes; these feed the CodeSplitter cell below.
parser = SimpleFileNodeParser()
md_nodes = parser.get_nodes_from_documents(md_docs)

In [8]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

# Build a tree-sitter Language object from the grammar exposed by tree_sitter_python.
# NOTE(review): PY_LANGUAGE is not referenced by any other cell visible in this
# notebook (CodeSplitter is configured with the string "python") — confirm it is
# still needed. Also note that `Language` is re-imported from
# langchain_text_splitters further down, so re-running this cell after that
# import would pick up a different `Language`.
PY_LANGUAGE = Language(tspython.language())

In [11]:
from llama_index.core.node_parser import CodeSplitter

# Tunable chunking limits for the code splitter.
CHUNK_LINES = 10          # lines per chunk
CHUNK_LINES_OVERLAP = 5   # lines shared between consecutive chunks
MAX_CHARS = 150           # upper bound on characters per chunk

splitter = CodeSplitter(
    language="python",
    chunk_lines=CHUNK_LINES,
    chunk_lines_overlap=CHUNK_LINES_OVERLAP,
    max_chars=MAX_CHARS,
)

# Split the previously parsed nodes into code chunks.
nodes = splitter.get_nodes_from_documents(md_nodes)

In [12]:
# Display the raw node objects returned by the splitter (verbose repr).
nodes

[TextNode(id_='bf25afd5-0b48-486c-ad4f-a23f3ee03634', embedding=None, metadata={'filename': 'sample_code.py', 'extension': '.py'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ad49ea9f-54f7-45fe-b0c2-4c7802995183', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'sample_code.py', 'extension': '.py'}, hash='0a69d853687afcc156c55cbe549ac62e32e7286a07266f41e39b505ed3e35646')}, metadata_template='{key}: {value}', metadata_separator='\n', text='# sample_code.py\n\ndef example_function():\n    """This function adds two numbers."""\n    a = 10\n    b = 20\n    result = a + b\n    return result', mimetype='text/plain', start_char_idx=0, end_char_idx=142, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'),
 TextNode(id_='f9ac9da6-f72c-4981-a5a0-522104093ac8', embedding=None, metadata={'filename': 'sample_code.py', 'extension': '.py'}, excluded_embed_metadata_keys=[], exclude

In [13]:
# Pretty-print the output
from pprint import pprint

# Summary line first, then one formatted section per chunk.
print(f"Total Chunks Created: {len(nodes)}\n")
for position, node in enumerate(nodes, start=1):
    metadata = node.metadata
    filename = metadata.get('filename', 'N/A')
    extension = metadata.get('extension', 'N/A')

    print(f"Chunk {position}:")
    print("-" * 40)
    print(f"ID: {node.id_}")
    print(f"Filename: {filename}")
    print(f"Extension: {extension}")
    print(f"Start Index: {node.start_char_idx}, End Index: {node.end_char_idx}")
    print(f"Text Content:\n{node.text}")
    print("=" * 40, "\n")

Total Chunks Created: 7

Chunk 1:
----------------------------------------
ID: bf25afd5-0b48-486c-ad4f-a23f3ee03634
Filename: sample_code.py
Extension: .py
Start Index: 0, End Index: 142
Text Content:
# sample_code.py

def example_function():
    """This function adds two numbers."""
    a = 10
    b = 20
    result = a + b
    return result

Chunk 2:
----------------------------------------
ID: f9ac9da6-f72c-4981-a5a0-522104093ac8
Filename: sample_code.py
Extension: .py
Start Index: 144, End Index: 236
Text Content:
def another_function(x, y):
    """This function multiplies two numbers."""
    return x * y

Chunk 3:
----------------------------------------
ID: f17a077c-4d57-407a-a69b-5b9dae322695
Filename: sample_code.py
Extension: .py
Start Index: 238, End Index: 249
Text Content:
def main():

Chunk 4:
----------------------------------------
ID: 252aef18-e951-4ce9-9b49-bd5da5c72ed2
Filename: sample_code.py
Extension: .py
Start Index: 254, End Index: 369
Text Content:
"""Main functi

# LangChain

In [1]:
# Install (or upgrade, quietly) the LangChain text-splitters package into the kernel's environment.
%pip install -qU langchain-text-splitters

In [2]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [3]:
# List the string value of every member of the Language enum.
[member.value for member in Language]


['cpp',
 'go',
 'java',
 'kotlin',
 'js',
 'ts',
 'php',
 'proto',
 'python',
 'rst',
 'ruby',
 'rust',
 'scala',
 'swift',
 'markdown',
 'latex',
 'html',
 'sol',
 'csharp',
 'cobol',
 'c',
 'lua',
 'perl',
 'haskell',
 'elixir',
 'powershell']

In [4]:
# Show the separator list RecursiveCharacterTextSplitter returns for Python.
RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)

['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']

In [6]:
# Build a Python-aware recursive splitter with small chunks and no overlap.
PY_CHUNK_SIZE = 50
PY_CHUNK_OVERLAP = 0
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=PY_CHUNK_SIZE,
    chunk_overlap=PY_CHUNK_OVERLAP,
)

# Split the sample source defined in the first cell and display the resulting documents.
python_docs = python_splitter.create_documents([code])
python_docs

[Document(metadata={}, page_content='def example_function():'),
 Document(metadata={}, page_content='This function adds two numbers.\n    a = 10'),
 Document(metadata={}, page_content='b = 20\n    result = a + b\n    return result'),
 Document(metadata={}, page_content='def another_function(x, y):'),
 Document(metadata={}, page_content='This function multiplies two numbers.'),
 Document(metadata={}, page_content='return x * y'),
 Document(metadata={}, page_content='def main():'),
 Document(metadata={}, page_content='Main function to run the example code.'),
 Document(metadata={}, page_content='# Calling the example function'),
 Document(metadata={}, page_content='sum_result = example_function()'),
 Document(metadata={}, page_content='print(f"The sum is: {sum_result}")'),
 Document(metadata={}, page_content='# Calling the another function'),
 Document(metadata={}, page_content='product_result = another_function(5, 7)'),
 Document(metadata={}, page_content='print(f"The product is: {produ

Here, the same sample Python code that we initialised in the first cell is used.