Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions services/chunking_service.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,6 +20,10 @@


def _estimate_tokens_from_text(text: str) -> int:
"""
Estimate the number of tokens in the given text.
Uses tiktoken if available, otherwise falls back to a naive estimator.
"""
if tiktoken is not None:
try:
enc = tiktoken.get_encoding('gpt2')
Expand All @@ -31,6 +35,10 @@ def _estimate_tokens_from_text(text: str) -> int:


def _join_sentences(sentences: List[str], max_chars: int) -> List[str]:
"""
Join sentences into chunks, each not exceeding max_chars in length.
If a sentence is longer than max_chars, split it by character window.
"""
chunks = []
cur = []
cur_len = 0
Expand All @@ -57,6 +65,9 @@ def _join_sentences(sentences: List[str], max_chars: int) -> List[str]:


def _make_chunk_meta(text: str, offset: int, order: int) -> Dict[str, Any]:
"""
Create metadata dictionary for a chunk of text.
"""
return {
'id': uuid.uuid4().hex,
'order': order,
Expand Down