<a href="https://colab.research.google.com/github/Dhananjay-97/notebooks/blob/main/Chunking_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
sample_text = """
Introduction

Data Science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from structured and unstructured data. It draws from statistics, computer science, machine learning, and various data analysis techniques to discover patterns, make predictions, and derive actionable insights.

Data Science can be applied across many industries, including healthcare, finance, marketing, and education, where it helps organizations make data-driven decisions, optimize processes, and understand customer behaviors.

Overview of Big Data

Big data refers to large, diverse sets of information that grow at ever-increasing rates. It encompasses the volume of information, the velocity or speed at which it is created and collected, and the variety or scope of the data points being covered.

Data Science Methods

There are several important methods used in Data Science:

1. Regression Analysis
2. Classification
3. Clustering
4. Neural Networks

Challenges in Data Science

- Data Quality: Poor data quality can lead to incorrect conclusions.
- Data Privacy: Ensuring the privacy of sensitive information.
- Scalability: Handling massive datasets efficiently.

Conclusion

Data Science continues to be a driving force in many industries, offering insights that can lead to better decisions and optimized outcomes. It remains an evolving field that incorporates the latest technological advancements.
"""


In [None]:
def fixed_size_chunk(text, max_words=100):
    """Split *text* into consecutive chunks of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (including newlines) separates words and is collapsed to a single
    space in the output.

    Args:
        text: The text to chunk.
        max_words: Maximum number of words per chunk.

    Returns:
        A list of chunk strings; only the last one may hold fewer than
        ``max_words`` words. Empty or whitespace-only text gives ``[]``.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(' '.join(window))
    return pieces

# Applying Fixed-Size Chunking
fixed_chunks = fixed_size_chunk(sample_text)
for chunk in fixed_chunks:
    print(chunk, '\n---\n')


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def sentence_chunk(text):
    """Split *text* into one chunk per sentence.

    Sentence boundaries come from the module-level spaCy pipeline ``nlp``.

    Args:
        text: The text to chunk.

    Returns:
        A list with the text of each sentence, in document order.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text)
    return sentences

# Applying Sentence-Based Chunking
sentence_chunks = sentence_chunk(sample_text)
for chunk in sentence_chunks:
    print(chunk, '\n---\n')


In [None]:
def paragraph_chunk(text):
    """Split *text* into paragraphs separated by one or more blank lines.

    A blank line is any line that is empty or holds only whitespace, so
    runs of several blank lines and Windows line endings count as a single
    separator. Each paragraph is stripped of surrounding whitespace and
    empty paragraphs are dropped, so no chunk is empty or padded.

    Args:
        text: The text to chunk.

    Returns:
        A list of non-empty paragraph strings in document order. Empty or
        whitespace-only text gives ``[]``.
    """
    import re

    # "\n\s*\n" swallows the whole gap between paragraphs in one match,
    # so no empty strings appear between consecutive separators.
    candidates = re.split(r'\n\s*\n', text)
    return [part.strip() for part in candidates if part.strip()]

# Applying Paragraph-Based Chunking
paragraph_chunks = paragraph_chunk(sample_text)
for chunk in paragraph_chunks:
    print(chunk, '\n---\n')


In [None]:
def semantic_chunk(text, max_len=200):
    """Group consecutive sentences into chunks of at most ``max_len`` characters.

    Sentences come from the module-level spaCy pipeline ``nlp`` and are
    joined with a single space. A chunk is closed *before* adding a
    sentence that would push it past ``max_len``, so the limit is a real
    upper bound. The one exception is a single sentence that is itself
    longer than ``max_len``: it is kept whole as its own chunk.

    Args:
        text: The text to chunk.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of chunk strings in document order.
    """
    doc = nlp(text)
    chunks = []
    current_chunk = []
    current_len = 0  # length of ' '.join(current_chunk), tracked incrementally
    for sent in doc.sents:
        sentence = sent.text
        # One extra character for the joining space when the chunk is non-empty.
        projected_len = current_len + len(sentence) + (1 if current_chunk else 0)
        if current_chunk and projected_len > max_len:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            projected_len = len(sentence)
        current_chunk.append(sentence)
        current_len = projected_len
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Applying Semantic-Based Chunking
semantic_chunks = semantic_chunk(sample_text)
for chunk in semantic_chunks:
    print(chunk, '\n---\n')


In [None]:
def modality_chunk(text, images=None, tables=None):
    """Bundle paragraph chunks of *text* with already-extracted images and tables.

    Only the text is chunked (via ``paragraph_chunk``); ``images`` and
    ``tables`` are passed through untouched, so the caller is expected to
    have extracted them beforehand.

    Args:
        text: The text to split into paragraph chunks.
        images: Pre-processed image items, stored as given.
        tables: Pre-processed table items, stored as given.

    Returns:
        A dict with the keys ``'text_chunks'``, ``'images'`` and ``'tables'``.
    """
    bundle = {}
    bundle['text_chunks'] = paragraph_chunk(text)
    bundle['images'] = images
    bundle['tables'] = tables
    return bundle

# Applying Modality-Specific Chunking
modality_chunks = modality_chunk(sample_text, images=['img1.png'], tables=['table1'])
print(modality_chunks)


In [None]:
def sliding_window_chunk(text, chunk_size=100, overlap=20):
    """Split *text* into overlapping windows of whitespace-separated tokens.

    Consecutive windows start ``chunk_size - overlap`` tokens apart, so
    each window repeats the last ``overlap`` tokens of the previous one.
    Windowing stops as soon as a window reaches the end of the text, which
    avoids trailing chunks that only repeat tokens already covered.

    Args:
        text: The text to chunk.
        chunk_size: Number of tokens per window; must be at least 1.
        overlap: Number of tokens shared by neighbouring windows; must be
            smaller than ``chunk_size``. A negative value leaves a gap of
            skipped tokens between windows.

    Returns:
        A list of chunk strings; empty or whitespace-only text gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is below 1 or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(' '.join(tokens[start:start + chunk_size]))
        # This window already includes the final token; any later window
        # would be a strict suffix of it.
        if start + chunk_size >= len(tokens):
            break
    return chunks

# Applying Sliding Window Chunking
sliding_chunks = sliding_window_chunk(sample_text)
for chunk in sliding_chunks:
    print(chunk, '\n---\n')


In [None]:
def hierarchical_chunk(text, section_keywords):
    """Split *text* into sections, starting a new one at each keyword line.

    A line opens a new section when it contains any of ``section_keywords``
    as a case-sensitive substring. Lines before the first such line form a
    preamble section. Sections holding only blank lines are dropped, so
    text that starts with empty lines does not produce an empty first
    section.

    Args:
        text: The text to chunk.
        section_keywords: Iterable of strings that mark the start of a section.

    Returns:
        A list of section strings, each with its lines joined by ``"\\n"``.
    """
    sections = []
    current_section = []
    for line in text.splitlines():
        if any(keyword in line for keyword in section_keywords):
            # Flush the previous section only if it has real content.
            if any(part.strip() for part in current_section):
                sections.append("\n".join(current_section))
            current_section = [line]
        else:
            current_section.append(line)
    if any(part.strip() for part in current_section):
        sections.append("\n".join(current_section))
    return sections

# Applying Hierarchical Chunking
section_keywords = ["Introduction", "Overview", "Methods", "Conclusion"]
hierarchical_chunks = hierarchical_chunk(sample_text, section_keywords)
for chunk in hierarchical_chunks:
    print(chunk, '\n---\n')


In [None]:
def content_aware_chunk(text):
    """Split *text* into chunks that each begin at a heading-like line.

    A line starts a new chunk when it begins with a Markdown heading marker
    of level two or deeper (``##``) or with the word ``Introduction`` or
    ``Conclusion``. Lines before the first such line form a preamble chunk.
    Chunks holding only blank lines are dropped, so text that starts with
    empty lines does not produce an empty first chunk.

    Args:
        text: The text to chunk.

    Returns:
        A list of chunk strings, each with its lines joined by ``"\\n"``.
    """
    # '##' also matches '###' and deeper levels, so no separate entry is needed.
    heading_prefixes = ('##', 'Introduction', 'Conclusion')

    chunks = []
    current_chunk = []
    for line in text.splitlines():
        if line.startswith(heading_prefixes):
            # Flush the previous chunk only if it has real content.
            if any(part.strip() for part in current_chunk):
                chunks.append('\n'.join(current_chunk))
            current_chunk = [line]
        else:
            current_chunk.append(line)
    if any(part.strip() for part in current_chunk):
        chunks.append('\n'.join(current_chunk))
    return chunks

# Applying Content-Aware Chunking
content_chunks = content_aware_chunk(sample_text)
for chunk in content_chunks:
    print(chunk, '\n---\n')


In [None]:
import pandas as pd

def table_aware_chunk(table):
    """Render *table* as a single Markdown chunk.

    Args:
        table: An object exposing ``to_markdown()``; the sample in this
            file passes a pandas ``DataFrame``.

    Returns:
        Whatever ``table.to_markdown()`` returns.
    """
    markdown = table.to_markdown()
    return markdown

# Sample table data
table = pd.DataFrame({
    "Name": ["John", "Alice", "Bob"],
    "Age": [25, 30, 22],
    "Occupation": ["Engineer", "Doctor", "Artist"]
})

# Applying Table-Aware Chunking
table_markdown = table_aware_chunk(table)
print(table_markdown)


In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def token_based_chunk(text, max_tokens=200):
    """Split *text* into chunks of at most ``max_tokens`` tokenizer tokens.

    The text is encoded with the module-level ``tokenizer``, the token ids
    are cut into consecutive windows, and each window is decoded back to a
    string.

    Args:
        text: The text to chunk.
        max_tokens: Maximum number of token ids per chunk.

    Returns:
        A list of decoded chunk strings in document order.
    """
    token_ids = tokenizer(text)["input_ids"]
    decoded_chunks = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        decoded_chunks.append(tokenizer.decode(window))
    return decoded_chunks

# Applying Token-Based Chunking
token_chunks = token_based_chunk(sample_text)
for chunk in token_chunks:
    print(chunk, '\n---\n')


In [None]:
def entity_based_chunk(text):
    """Extract the named entities found in *text*.

    Entities are taken from the ``ents`` of the document produced by the
    module-level spaCy pipeline ``nlp``.

    Args:
        text: The text to analyse.

    Returns:
        A list with the text of each entity, in document order.
    """
    found = []
    for entity in nlp(text).ents:
        found.append(entity.text)
    return found

# Applying Entity-Based Chunking
entity_chunks = entity_based_chunk(sample_text)
print(entity_chunks)


In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def topic_based_chunk(text, num_topics=2):
    """Fit an LDA topic model on *text* as a single document.

    The whole text is vectorized as one document with ``CountVectorizer``,
    an LDA model with ``num_topics`` components is fitted on it, and the
    same document is then transformed by the fitted model.

    NOTE: a later cell in this file defines another ``topic_based_chunk``,
    which replaces this one once that cell has run.

    Args:
        text: The text to model.
        num_topics: Number of LDA components.

    Returns:
        The result of ``transform`` on the single-document matrix
        (presumably one row of topic weights). No ``random_state`` is set,
        so values may differ between runs.
    """
    counts = CountVectorizer().fit_transform([text])
    model = LatentDirichletAllocation(n_components=num_topics)
    model.fit(counts)
    return model.transform(counts)

# Applying Topic-Based Chunking
topic_chunks = topic_based_chunk(sample_text)
print(topic_chunks)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

def topic_based_chunk(text, num_topics=3, num_top_words=5):
    """Label each sentence of *text* with its dominant LDA topic.

    The text is split into sentences on ``'. '``, the sentences are
    vectorized with ``CountVectorizer``, and an LDA model (fixed
    ``random_state=42``) is fitted on them. Each topic is described by its
    highest-weighted vocabulary words, and each sentence is paired with
    the description of the topic that scores highest for it.

    Args:
        text: The text to chunk.
        num_topics: Number of LDA components.
        num_top_words: How many top words describe each topic.

    Returns:
        A list of ``(topic_description, sentence)`` tuples in sentence
        order, where ``topic_description`` looks like
        ``"Topic 1: word, word, ..."``.
    """
    # Naive sentence split: only ". " counts as a boundary.
    sentences = text.split('. ')

    # Vectorize the sentences
    vectorizer = CountVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)

    # Apply LDA for topic modeling
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(sentence_vectors)

    vocabulary = vectorizer.get_feature_names_out()

    # Describe each topic by its highest-weighted words.
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[::-1][:num_top_words]
        topic_keywords = [vocabulary[i] for i in top_words_idx]
        topics.append("Topic {}: {}".format(topic_idx + 1, ', '.join(topic_keywords)))

    # One batched transform over the matrix built above replaces
    # re-vectorizing and transforming every sentence separately.
    assigned_topics = lda.transform(sentence_vectors).argmax(axis=1)

    return [
        (topics[topic_idx], sentence)
        for topic_idx, sentence in zip(assigned_topics, sentences)
    ]


# Get topic-based chunks
topic_chunks = topic_based_chunk(sample_text, num_topics=3)

# Display results
for topic, chunk in topic_chunks:
    print(f"{topic}: {chunk}\n")

In [None]:
def page_based_chunk(pages):
    """Return *pages* unchanged, so each element acts as one chunk.

    No splitting happens here: the caller supplies content that is already
    divided into pages (the sample below passes a list of page strings).
    Whatever object is passed in is returned as-is, including a plain
    string.
    """
    # Pass-through: the pre-processed page list stands in for per-page PDF text
    return pages

# Sample pages
pages = ["Page 1 content", "Page 2 content", "Page 3 content"]

# Applying Page-Based Chunking
page_chunks = page_based_chunk(pages)
for chunk in page_chunks:
    print(chunk, '\n---\n')


In [None]:
def keyword_based_chunk(text, keywords):
    """Split *text* into chunks, starting a new one at each keyword line.

    A line opens a new chunk when it contains any of ``keywords`` as a
    case-sensitive substring. Lines before the first such line form a
    preamble chunk. Chunks holding only blank lines are dropped, so text
    that starts with empty lines does not produce an empty first chunk.

    Args:
        text: The text to chunk.
        keywords: Iterable of strings that mark the start of a chunk.

    Returns:
        A list of chunk strings, each with its lines joined by ``"\\n"``.
    """
    chunks = []
    current_chunk = []
    for line in text.splitlines():
        if any(keyword in line for keyword in keywords):
            # Flush the previous chunk only if it has real content.
            if any(part.strip() for part in current_chunk):
                chunks.append('\n'.join(current_chunk))
            current_chunk = [line]
        else:
            current_chunk.append(line)
    if any(part.strip() for part in current_chunk):
        chunks.append('\n'.join(current_chunk))
    return chunks

# Applying Keyword-Based Chunking
keywords = ["Introduction", "Conclusion", "Methods"]
keyword_chunks = keyword_based_chunk(sample_text, keywords)
for chunk in keyword_chunks:
    print(chunk, '\n---\n')


In [None]:
def hybrid_chunk(text):
    """Chunk *text* by paragraph first, then by sentence within each paragraph.

    Paragraphs come from ``paragraph_chunk`` and each one is split further
    with ``sentence_chunk``; the sentence lists are concatenated in
    paragraph order.

    Args:
        text: The text to chunk.

    Returns:
        A flat list of sentence strings.
    """
    sentences = []
    for block in paragraph_chunk(text):
        sentences.extend(sentence_chunk(block))
    return sentences

# Applying Hybrid Chunking
hybrid_chunks = hybrid_chunk(sample_text)
for chunk in hybrid_chunks:
    print(chunk, '\n---\n')


In [None]:
section_keywords = ["Introduction", "Overview", "Conclusion"]

# Define keywords for keyword-based chunking
keywords = ["Introduction", "Overview", "Conclusion", "Methods", "Challenges"]

# Updated wrapper function to call and display chunking strategies
def apply_chunking_methods(text, strategies, section_keywords=None, keywords=None):
    """Run each chunking strategy on *text* and print the resulting chunks.

    Every strategy is called as ``strategy(text)``. Two are exceptions:
    the function named ``hierarchical_chunk`` also receives
    ``section_keywords`` and the one named ``keyword_based_chunk`` also
    receives ``keywords``, when those arguments are provided.

    Args:
        text: The text handed to every strategy.
        strategies: Iterable of chunking functions; each must have a
            ``__name__`` and return an iterable of chunks or one string.
        section_keywords: Extra argument for ``hierarchical_chunk``.
        keywords: Extra argument for ``keyword_based_chunk``.

    Returns:
        None. Results are written to standard output.
    """
    # Dispatch on the function name rather than object identity, so a
    # strategies list built before a notebook cell was re-run (and so
    # holding older function objects) still gets the right arguments.
    extra_argument = {
        "hierarchical_chunk": section_keywords,
        "keyword_based_chunk": keywords,
    }

    for strategy in strategies:
        print(f"\n--- Applying {strategy.__name__} ---\n")

        extra = extra_argument.get(strategy.__name__)
        if extra is not None:
            chunks = strategy(text, extra)
        else:
            chunks = strategy(text)

        # A strategy that returns a bare string (e.g. a pass-through given
        # plain text) would otherwise be printed one character per chunk.
        if isinstance(chunks, str):
            chunks = [chunks]

        for idx, chunk in enumerate(chunks, start=1):
            print(f"Chunk {idx}:\n{chunk}\n{'-'*50}")

# List of chunking strategies (functions) to apply
strategies = [
    fixed_size_chunk,
    sentence_chunk,
    paragraph_chunk,
    semantic_chunk,
    sliding_window_chunk,
    hierarchical_chunk,  # Now properly handled
    token_based_chunk,
    entity_based_chunk,
    topic_based_chunk,
    page_based_chunk,
    keyword_based_chunk,  # Now properly handled
    hybrid_chunk,
]


In [None]:
apply_chunking_methods(sample_text, strategies, section_keywords, keywords)