In [1]:
def sliding_window_chunking(text, window_size, overlap_size):
    """
    Segments the input text into overlapping, fixed-size character chunks.

    Consecutive chunks start ``window_size - overlap_size`` characters apart,
    so a chunk begins with up to ``overlap_size`` characters that also close
    the previous chunk. Chunks near the end of the text may be shorter than
    ``window_size``, and the final chunk can lie entirely inside the span
    already covered by the chunk before it.

    Args:
        text (str): The input text to chunk.
        window_size (int): The size of each chunk, in characters. Must be > 0.
        overlap_size (int): The size of overlap between chunks, in characters.
            Must satisfy ``0 <= overlap_size < window_size``.

    Returns:
        List[str]: List of overlapping text chunks (empty for empty text).

    Raises:
        ValueError: If ``window_size`` is not positive, or ``overlap_size`` is
            negative or not smaller than ``window_size``.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")
    # An overlap >= window would make the step zero or negative, so the window
    # would never advance through the text (previously an endless loop).
    if not 0 <= overlap_size < window_size:
        raise ValueError(
            "overlap_size must satisfy 0 <= overlap_size < window_size"
        )

    step = window_size - overlap_size
    # Slicing past the end of the string is safe: the tail chunks are simply
    # shorter than window_size.
    return [
        text[start:start + window_size]
        for start in range(0, len(text), step)
    ]


In [2]:
# Short two-sentence sample used to show the chunker on a small input.
sample_text = "".join((
    "Data chunking is essential for handling large text streams. ",
    "Sliding window techniques preserve crucial context.",
))

# 50-character windows that share 20 characters with their neighbour.
window_size = 50
overlap_size = 20
chunks = sliding_window_chunking(sample_text, window_size, overlap_size)

# Print each chunk on its own line with a 1-based label.
for chunk_number, chunk in enumerate(chunks, start=1):
    print(f"Chunk {chunk_number}: {chunk}")

Chunk 1: Data chunking is essential for handling large text
Chunk 2:  handling large text streams. Sliding window techn
Chunk 3: Sliding window techniques preserve crucial context
Chunk 4: erve crucial context.


In [3]:
# Longer multi-sentence sample so the overlap between chunks is easy to see.
sample_document = "".join((
    "Natural Language Processing (NLP) involves understanding and generating human language. ",
    "It has applications in sentiment analysis, machine translation, and more. ",
    "One challenge in NLP is segmenting long text streams into manageable pieces. ",
    "Sliding window chunking helps in preserving the context by overlapping chunks. ",
    "This approach is particularly useful in tasks like summarization and language modeling.",
))

# Chunking parameters: 70-character windows sharing 30 characters.
window_size = 70
overlap_size = 30

# Split the document with the hand-written sliding-window helper.
chunks = sliding_window_chunking(sample_document, window_size, overlap_size)

# Show every chunk under a 1-based heading, separated by a blank line.
for chunk_number, chunk in enumerate(chunks, start=1):
    print(f"Chunk {chunk_number}:\n{chunk}\n")

Chunk 1:
Natural Language Processing (NLP) involves understanding and generatin

Chunk 2:
es understanding and generating human language. It has applications in

Chunk 3:
nguage. It has applications in sentiment analysis, machine translation

Chunk 4:
 analysis, machine translation, and more. One challenge in NLP is segm

Chunk 5:
. One challenge in NLP is segmenting long text streams into manageable

Chunk 6:
g text streams into manageable pieces. Sliding window chunking helps i

Chunk 7:
liding window chunking helps in preserving the context by overlapping 

Chunk 8:
ng the context by overlapping chunks. This approach is particularly us

Chunk 9:
is approach is particularly useful in tasks like summarization and lan

Chunk 10:
sks like summarization and language modeling.

Chunk 11:
ling.



In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same sample document as in the hand-written sliding-window example, so the
# two chunking approaches can be compared side by side.
sample_document = "".join((
    "Natural Language Processing (NLP) involves understanding and generating human language. ",
    "It has applications in sentiment analysis, machine translation, and more. ",
    "One challenge in NLP is segmenting long text streams into manageable pieces. ",
    "Sliding window chunking helps in preserving the context by overlapping chunks. ",
    "This approach is particularly useful in tasks like summarization and language modeling.",
))

# Splitter settings mirroring the sliding-window parameters.
chunk_size = 70  # Window size
chunk_overlap = 30  # Overlap size

# Configure the LangChain splitter with those settings and split the text.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_text(sample_document)

# Show every chunk under a 1-based heading, separated by a blank line.
for chunk_number, chunk in enumerate(chunks, start=1):
    print(f"Chunk {chunk_number}:\n{chunk}\n")


Chunk 1:
Natural Language Processing (NLP) involves understanding and

Chunk 2:
involves understanding and generating human language. It has

Chunk 3:
human language. It has applications in sentiment analysis, machine

Chunk 4:
sentiment analysis, machine translation, and more. One challenge in

Chunk 5:
and more. One challenge in NLP is segmenting long text streams into

Chunk 6:
long text streams into manageable pieces. Sliding window chunking

Chunk 7:
Sliding window chunking helps in preserving the context by

Chunk 8:
in preserving the context by overlapping chunks. This approach is

Chunk 9:
chunks. This approach is particularly useful in tasks like

Chunk 10:
useful in tasks like summarization and language modeling.

