### Sentence Based Chunking

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the 'punkt' and 'punkt_tab' NLTK resources before tokenizing
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Sample passage reused by the chunking examples in the cells below
text = """Artificial Intelligence is transforming the world. 
Businesses are leveraging AI for better decision-making. 
The healthcare industry uses AI to diagnose diseases more efficiently. 
Meanwhile, in education, AI is personalizing learning experiences. 
Despite its benefits, ethical concerns about AI are on the rise."""

# Break the passage into a list of sentences
sentences = sent_tokenize(text)

### Paragraph Based Chunking

In [12]:
def paragraph_chunking(text):
    """
    Split text into paragraphs at blank lines.

    A paragraph break is a newline, optional whitespace (spaces, tabs,
    carriage returns, further newlines), then another newline. Single
    newlines inside a paragraph are left untouched.

    Args:
    text (str): The input text containing one or more paragraphs.

    Returns:
    list: The paragraphs in their original order, each stripped of
    leading/trailing whitespace, with empty pieces dropped. Text without
    any blank line comes back as a one-element list; empty or
    whitespace-only text gives an empty list.
    """
    import re  # local import so the function works even if the cell is run alone

    # A plain "\n\n" split misses Windows "\r\n\r\n" breaks and "blank" lines
    # that contain stray spaces or tabs; the regex treats all of these, and
    # runs of several blank lines, as a single paragraph break.
    pieces = re.split(r"\n\s*\n", text)
    return [piece.strip() for piece in pieces if piece.strip()]

# Chunk the sample text into paragraphs. `text` separates its lines with
# single newlines only, so it has no blank-line break and the whole passage
# comes back as one paragraph.
paragraphs = paragraph_chunking(text)

In [13]:
# Show every paragraph chunk with its 1-based position
print("Paragraph Chunks:")
for number, chunk_text in enumerate(paragraphs, 1):
    print(f"Paragraph {number}: {chunk_text}")


Paragraph Chunks:
Paragraph 1: Artificial Intelligence is transforming the world. Businesses are leveraging AI for better decision-making. 
The healthcare industry uses AI to diagnose diseases more efficiently. Meanwhile, in education, AI is personalizing learning experiences. 
Despite its benefits, ethical concerns about AI are on the rise.


### Sentence & Paragraph Based Chunking in LangChain

In [14]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

# Wrap the sample text in a LangChain Document; the chunking helpers below
# read the text back through `document.page_content`.
document = Document(page_content=text)

# 1. Sentence-Based Chunking
def sentence_based_chunking(document, chunk_size=500, chunk_overlap=0):
    """
    Split a document's text on ". " with LangChain's CharacterTextSplitter.

    CharacterTextSplitter splits on the separator and then merges
    neighbouring pieces back together until a chunk would exceed
    ``chunk_size`` characters. With the default of 500, the short sample
    text in this notebook comes back as a single chunk (see the recorded
    output); pass a smaller ``chunk_size`` to get smaller, roughly
    sentence-sized chunks.

    Args:
    document: Object exposing the text to split as ``page_content``
        (a LangChain ``Document`` in this notebook).
    chunk_size (int): Maximum number of characters per merged chunk.
        Defaults to 500.
    chunk_overlap (int): Number of characters shared between consecutive
        chunks. Defaults to 0.

    Returns:
    list: The chunks returned by the splitter's ``split_text``.
    """
    sentence_splitter = CharacterTextSplitter(
        separator=". ",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return sentence_splitter.split_text(document.page_content)

In [15]:
# 2. Paragraph-Based Chunking
def paragraph_based_chunking(document, chunk_size=500, chunk_overlap=0):
    """
    Split a document's text on blank lines ("\\n\\n") with LangChain's
    CharacterTextSplitter.

    CharacterTextSplitter splits on the separator and then merges
    neighbouring pieces back together until a chunk would exceed
    ``chunk_size`` characters, so several short paragraphs can end up in
    the same chunk. Lower ``chunk_size`` to keep paragraphs apart.

    Args:
    document: Object exposing the text to split as ``page_content``
        (a LangChain ``Document`` in this notebook).
    chunk_size (int): Maximum number of characters per merged chunk.
        Defaults to 500.
    chunk_overlap (int): Number of characters shared between consecutive
        chunks. Defaults to 0.

    Returns:
    list: The chunks returned by the splitter's ``split_text``.
    """
    paragraph_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return paragraph_splitter.split_text(document.page_content)


In [16]:
# Sentence-level pass: chunk the document, then list each chunk (1-based)
sentence_chunks = sentence_based_chunking(document)
print("Sentence-Based Chunking:")
for position, piece in enumerate(sentence_chunks, 1):
    print(f"Sentence {position}: {piece}")

print("\n")

# Paragraph-level pass: same document, blank-line separator
paragraph_chunks = paragraph_based_chunking(document)
print("Paragraph-Based Chunking:")
for position, piece in enumerate(paragraph_chunks, 1):
    print(f"Paragraph {position}: {piece}")

Sentence-Based Chunking:
Sentence 1: Artificial Intelligence is transforming the world. Businesses are leveraging AI for better decision-making. 
The healthcare industry uses AI to diagnose diseases more efficiently. Meanwhile, in education, AI is personalizing learning experiences. 
Despite its benefits, ethical concerns about AI are on the rise.


Paragraph-Based Chunking:
Paragraph 1: Artificial Intelligence is transforming the world. Businesses are leveraging AI for better decision-making. 
The healthcare industry uses AI to diagnose diseases more efficiently. Meanwhile, in education, AI is personalizing learning experiences. 
Despite its benefits, ethical concerns about AI are on the rise.


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

# Wrap the sample text in a LangChain Document. This is the same statement as
# in the earlier LangChain cell, so `document` is rebuilt from the same `text`;
# the chunking helpers below read it through `document.page_content`.
document = Document(page_content=text)

# 1. Sentence-Based Chunking
# NOTE(review): this redefines sentence_based_chunking from the earlier
# LangChain cell; once this cell runs, this definition is the one in effect.
def sentence_based_chunking(document, chunk_size=500, chunk_overlap=0):
    """
    Split a document's text on ". " with LangChain's CharacterTextSplitter.

    CharacterTextSplitter splits on the separator and then merges
    neighbouring pieces back together until a chunk would exceed
    ``chunk_size`` characters. With the default of 500, the short sample
    text in this notebook comes back as a single chunk (see the recorded
    output); pass a smaller ``chunk_size`` to get smaller, roughly
    sentence-sized chunks.

    Args:
    document: Object exposing the text to split as ``page_content``
        (a LangChain ``Document`` in this notebook).
    chunk_size (int): Maximum number of characters per merged chunk.
        Defaults to 500.
    chunk_overlap (int): Number of characters shared between consecutive
        chunks. Defaults to 0.

    Returns:
    list: The chunks returned by the splitter's ``split_text``.
    """
    sentence_splitter = CharacterTextSplitter(
        separator=". ",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return sentence_splitter.split_text(document.page_content)


# 2. Paragraph-Based Chunking
# NOTE(review): this redefines paragraph_based_chunking from the earlier
# LangChain cell; once this cell runs, this definition is the one in effect.
def paragraph_based_chunking(document, chunk_size=500, chunk_overlap=0):
    """
    Split a document's text on blank lines ("\\n\\n") with LangChain's
    CharacterTextSplitter.

    CharacterTextSplitter splits on the separator and then merges
    neighbouring pieces back together until a chunk would exceed
    ``chunk_size`` characters, so several short paragraphs can end up in
    the same chunk. Lower ``chunk_size`` to keep paragraphs apart.

    Args:
    document: Object exposing the text to split as ``page_content``
        (a LangChain ``Document`` in this notebook).
    chunk_size (int): Maximum number of characters per merged chunk.
        Defaults to 500.
    chunk_overlap (int): Number of characters shared between consecutive
        chunks. Defaults to 0.

    Returns:
    list: The chunks returned by the splitter's ``split_text``.
    """
    paragraph_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return paragraph_splitter.split_text(document.page_content)