In [3]:
from typing import List
from dataclasses import dataclass
import tqdm as notebook_tqdm

In [4]:
@dataclass
class Chunk:
    """A piece of text tagged with a document id and a chunk id.

    Fields are positional in the generated ``__init__`` in the order
    ``doc_id``, ``chunk_id``, ``text``.

    Attributes:
        doc_id: String identifier (presumably of the source document --
            TODO confirm against the code that builds chunks).
        chunk_id: Integer identifier of the chunk (presumably its position
            within the document -- TODO confirm).
        text: The chunk's text.
    """

    doc_id: str
    chunk_id: int
    text: str


In [None]:
def sliding_window_chunk(text: str, chunk_size_words: int = 100, overlap_words: int = 40) -> List[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Windows of ``chunk_size_words`` words are taken every
    ``chunk_size_words - overlap_words`` words (at least 1), and each window
    is re-joined with single spaces, so the original whitespace and line
    breaks are not preserved.

    The first window is always kept, so a document shorter than one chunk
    still yields exactly one chunk. A later, trailing window is dropped when
    it is shorter than ``max(30, chunk_size_words // 4)`` words (capped at
    ``chunk_size_words`` so that full windows are never rejected when the
    chunk size is small).

    Args:
        text: Input text; split on any whitespace.
        chunk_size_words: Maximum number of words per chunk. Must be >= 1.
        overlap_words: Number of words shared by consecutive chunks. Must be
            >= 0. Values >= ``chunk_size_words`` are tolerated and make the
            window advance one word at a time.

    Returns:
        The chunks in document order; ``[]`` when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size_words < 1`` or ``overlap_words < 0``.
    """
    if chunk_size_words < 1:
        raise ValueError("chunk_size_words must be a positive integer")
    if overlap_words < 0:
        raise ValueError("overlap_words must be non-negative")

    words = text.split()
    if not words:
        return []

    # Clamp to 1 so an overlap >= chunk size cannot produce a zero/negative step.
    step = max(1, chunk_size_words - overlap_words)
    # Minimum size for any window after the first. Capped at the chunk size:
    # otherwise every full window would be rejected when chunk_size_words < 30.
    min_tail_words = min(chunk_size_words, max(30, chunk_size_words // 4))

    chunks: List[str] = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size_words]
        # Only trailing windows are filtered; the first one is always emitted
        # so short documents are not silently discarded.
        if chunks and len(window) < min_tail_words:
            break
        chunks.append(" ".join(window))
        # This window already reached the end of the text; any further window
        # would contain nothing new.
        if start + chunk_size_words >= len(words):
            break
    return chunks

In [15]:
from naive_rag import load_text_files

# Load the corpus from the "data" path. The result is used below as a sized
# container with a .values() view of document strings (presumably a dict of
# document id -> text; confirm against naive_rag.load_text_files).
docs = load_text_files("data")
print(f"Loaded {len(docs)} documents")

if docs:
    # Peek at the first document without copying every value into a list.
    first_doc = next(iter(docs.values()))
    print("\nFirst document sample:")
    print(f"Length: {len(first_doc.split())} words")
    print(f"Text: {first_doc[:200]}...")
else:
    # Previously this cell died with a bare IndexError when nothing was loaded.
    print("\nload_text_files('data') returned no documents; nothing to preview.")

Loaded 1 documents

First document sample:
Length: 213 words
Text: ICC Men’s T20 World Cup 2026 Overview

The ICC Men’s T20 World Cup 2026 is scheduled to begin on February 7, 2026.
The tournament will be co-hosted by India and Sri Lanka.
A total of 20 teams will par...


In [25]:
# NOTE: importing `sliding_window_chunk` here rebinds the name to the
# implementation in naive_rag, shadowing any function of the same name that
# was defined earlier in this notebook.
from naive_rag import load_text_files, sliding_window_chunk

docs = load_text_files("data")
if docs:
    # Chunk only the first loaded document.
    text = next(iter(docs.values()))
    chunks = sliding_window_chunk(text, chunk_size_words=100, overlap_words=40)
else:
    # Keep `text` and `chunks` defined so the loop below is simply a no-op
    # instead of the cell failing with a bare IndexError.
    text, chunks = "", []
    print("load_text_files('data') returned no documents; nothing to chunk.")

# Show each chunk's index, word count and full text, separated by a rule.
for idx, chunk in enumerate(chunks):
    print(idx, "words=", len(chunk.split()))
    print(chunk)        # full chunk
    print("-" * 60)

0 words= 100
ICC Men’s T20 World Cup 2026 Overview The ICC Men’s T20 World Cup 2026 is scheduled to begin on February 7, 2026. The tournament will be co-hosted by India and Sri Lanka. A total of 20 teams will participate in the competition. Team Changes and Controversies Bangladesh was replaced by Scotland in the tournament following Bangladesh’s refusal to play matches in India due to security concerns. Scotland was added to Group C, which includes England, Nepal, Italy, and the West Indies. Team Preparations The Oman men’s cricket team has expressed confidence ahead of the tournament. The team’s captain stated that
------------------------------------------------------------
1 words= 100
in India due to security concerns. Scotland was added to Group C, which includes England, Nepal, Italy, and the West Indies. Team Preparations The Oman men’s cricket team has expressed confidence ahead of the tournament. The team’s captain stated that a balanced mix of youth and experience will be ke