### Data Loading

In [14]:
# !pip install langchain
# !pip install pymupdf

### Text Splitter (chunking strategy)

In [15]:
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

def is_valid_chunk_for_bert(text):
    """
    Check if a chunk is valid for BERT pre-training.
    - Should have at least 2 complete sentences
    - Should not be a half-cut sentence
    - Should have minimum length for meaningful content
    """
    # Remove extra whitespace
    text = text.strip()
    
    # Check minimum length (at least 100 characters for meaningful content)
    if len(text) < 100:
        return False
    
    # Count sentences (look for sentence endings)
    sentence_endings = re.findall(r'[.!?]+', text)
    if len(sentence_endings) < 2:
        return False
    
    # Check if text ends with a complete sentence
    if not re.search(r'[.!?]\s*$', text):
        return False
    
    # Check if text starts properly (not a fragment)
    if text[0].islower():  # Likely a sentence fragment
        return False
    
    return True

file_path = "../harrypotter.pdf"
docs_rule = []

try:
    with fitz.open(file_path) as pdf_doc:
        for page_num, page in enumerate(pdf_doc):
            # Extract text from the current page
            page_text = page.get_text()

            # Initialize a text splitter for this page.
            # We will split the text from one page and add the page number as metadata to each chunk.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=200,
                separators=["\n\n", "\n", " ", ""]
            )
            
            # Split the text from the current page into chunks
            page_chunks = text_splitter.create_documents([page_text])

            # Filter and add metadata to each valid chunk
            for chunk in page_chunks:
                # Validate chunk for BERT pre-training
                if is_valid_chunk_for_bert(chunk.page_content):
                    chunk.metadata.update({
                        "source": file_path, 
                        "page_number": page_num + 1,
                        "c": "rule_based",  # Added metadata field 'c'
                        "ischunk": True  # Added ischunk field
                    })
                    docs_rule.append(chunk)

    print("Successfully loaded and chunked the book content from the PDF with page numbers.")
    print(f"Filtered chunks for BERT pre-training quality.")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please make sure the file exists.")
    exit()

# Let's print some information about the chunks to verify
print(f"Total number of valid chunks created: {len(docs_rule)}")
print("\nHere is the content of the first chunk:")
print("---------------------------------------")
print(docs_rule[0].page_content)
print("---------------------------------------")
print(f"First chunk metadata: {docs_rule[0].metadata}")

Successfully loaded and chunked the book content from the PDF with page numbers.
Filtered chunks for BERT pre-training quality.
Total number of valid chunks created: 178

Here is the content of the first chunk:
---------------------------------------
When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story
starts, there was nothing about the cloudy sky outside to suggest that strange
and mysterious things would soon be happening all over the country. Mr.
Dursley hummed as he picked out his most boring tie for work, and Mrs.
Dursley gossiped away happily as she wrestled a screaming Dudley into his
high chair.
None of them noticed a large, tawny owl flutter past the window.
---------------------------------------
First chunk metadata: {'source': '../harrypotter.pdf', 'page_number': 2, 'c': 'rule_based', 'ischunk': True}


### Semantic Aware Chunking

In [None]:
!pip install --quiet langchain_experimental

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import re

def is_valid_chunk_for_bert(text):
    """
    Check if a chunk is valid for BERT pre-training.
    - Should have at least 2 complete sentences
    - Should not be a half-cut sentence
    - Should have minimum length for meaningful content
    """
    # Remove extra whitespace
    text = text.strip()
    
    # Check minimum length (at least 100 characters for meaningful content)
    if len(text) < 100:
        return False
    
    # Count sentences (look for sentence endings)
    sentence_endings = re.findall(r'[.!?]+', text)
    if len(sentence_endings) < 2:
        return False
    
    # Check if text ends with a complete sentence
    if not re.search(r'[.!?]\s*$', text):
        return False
    
    # Check if text starts properly (not a fragment)
    if text[0].islower():  # Likely a sentence fragment
        return False
    
    return True

# File path to your PDF
file_path = "../harrypotter.pdf"

# A list to store chunks
docs_semantic = []

try:
    with fitz.open(file_path) as pdf_doc:
        # Initialize HuggingFace embeddings
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        
        # Initialize semantic chunker with embeddings
        text_splitter = SemanticChunker(embeddings)

        for page_num, page in enumerate(pdf_doc):
            # Extract text from the current page
            page_text = page.get_text()

            # Skip empty pages
            if not page_text.strip():
                continue

            # Split the text into semantic chunks
            page_chunks = text_splitter.create_documents([page_text])

            # Filter and add metadata to each valid chunk
            for chunk in page_chunks:
                # Validate chunk for BERT pre-training
                if is_valid_chunk_for_bert(chunk.page_content):
                    chunk.metadata.update({
                        "source": file_path, 
                        "page_number": page_num + 1,
                        "c": "semantic",  # Added metadata field 'c'
                        "ischunk": True  # Added ischunk field
                    })
                    docs_semantic.append(chunk)

    print("✅ Successfully loaded and chunked the book content from the PDF with semantic awareness + page numbers.")
    print(f"Filtered chunks for BERT pre-training quality.")
except FileNotFoundError:
    print(f"❌ Error: The file '{file_path}' was not found. Please make sure the file exists.")
    exit()

# Print some information about the chunks to verify
print(f"Total number of valid chunks created: {len(docs_semantic)}")
print("\nHere is the content of the first chunk:")
print("---------------------------------------")
print(docs_semantic[0].page_content)
print("---------------------------------------")
print(f"First chunk metadata: {docs_semantic[0].metadata}")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


### Hierarchical Chunking Strategy
This strategy creates chunks at multiple levels of granularity and maintains parent-child relationships between them.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re

def is_valid_chunk_for_bert(text):
    """
    Check if a chunk is valid for BERT pre-training.
    - Should have at least 2 complete sentences
    - Should not be a half-cut sentence
    - Should have minimum length for meaningful content
    """
    # Remove extra whitespace
    text = text.strip()
    
    # Check minimum length (at least 100 characters for meaningful content)
    if len(text) < 100:
        return False
    
    # Count sentences (look for sentence endings)
    sentence_endings = re.findall(r'[.!?]+', text)
    if len(sentence_endings) < 2:
        return False
    
    # Check if text ends with a complete sentence
    if not re.search(r'[.!?]\s*$', text):
        return False
    
    # Check if text starts properly (not a fragment)
    if text[0].islower():  # Likely a sentence fragment
        return False
    
    return True

def create_hierarchical_chunks(text, page_num, source_file):
    """
    Create hierarchical chunks with multiple levels of granularity.
    
    Args:
        text: The input text to chunk
        page_num: Page number for metadata
        source_file: Source file path for metadata
    
    Returns:
        List of Document objects with hierarchical metadata
    """
    chunks = []
    
    # Level 1: Large sections (based on multiple paragraphs)
    section_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=["\n\n\n", "\n\n", "\n", " ", ""]
    )
    
    # Level 2: Medium chunks (paragraphs/subsections)
    paragraph_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    # Level 3: Small chunks (sentences/phrases)
    sentence_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
        separators=[". ", "! ", "? ", "\n", " ", ""]
    )
    
    # Create level 1 chunks (sections)
    level1_chunks = section_splitter.split_text(text)
    
    for i, section_text in enumerate(level1_chunks):
        section_id = f"page_{page_num}_section_{i}"
        
        # Validate and create section-level chunk
        if is_valid_chunk_for_bert(section_text):
            section_chunk = Document(
                page_content=section_text,
                metadata={
                    "source": source_file,
                    "page_number": page_num,
                    "chunk_type": "section",
                    "chunk_level": 1,
                    "section_id": section_id,
                    "parent_id": f"page_{page_num}",
                    "chunk_index": i,
                    "c": "hierarchical_section",  # Added metadata field 'c'
                    "ischunk": True  # Added ischunk field
                }
            )
            chunks.append(section_chunk)
        
        # Create level 2 chunks (paragraphs) within this section
        level2_chunks = paragraph_splitter.split_text(section_text)
        
        for j, paragraph_text in enumerate(level2_chunks):
            paragraph_id = f"{section_id}_para_{j}"
            
            # Validate and create paragraph-level chunk
            if is_valid_chunk_for_bert(paragraph_text):
                paragraph_chunk = Document(
                    page_content=paragraph_text,
                    metadata={
                        "source": source_file,
                        "page_number": page_num,
                        "chunk_type": "paragraph",
                        "chunk_level": 2,
                        "paragraph_id": paragraph_id,
                        "parent_id": section_id,
                        "section_id": section_id,
                        "chunk_index": j,
                        "c": "hierarchical_paragraph",  # Added metadata field 'c'
                        "ischunk": True  # Added ischunk field
                    }
                )
                chunks.append(paragraph_chunk)
            
            # Create level 3 chunks (sentences) within this paragraph
            level3_chunks = sentence_splitter.split_text(paragraph_text)
            
            for k, sentence_text in enumerate(level3_chunks):
                sentence_id = f"{paragraph_id}_sent_{k}"
                
                # Validate and create sentence-level chunk
                if is_valid_chunk_for_bert(sentence_text):
                    sentence_chunk = Document(
                        page_content=sentence_text,
                        metadata={
                            "source": source_file,
                            "page_number": page_num,
                            "chunk_type": "sentence",
                            "chunk_level": 3,
                            "sentence_id": sentence_id,
                            "parent_id": paragraph_id,
                            "paragraph_id": paragraph_id,
                            "section_id": section_id,
                            "chunk_index": k,
                            "c": "hierarchical_sentence",  # Added metadata field 'c'
                            "ischunk": True  # Added ischunk field
                        }
                    )
                    chunks.append(sentence_chunk)
    
    return chunks

# Apply hierarchical chunking to the PDF
file_path = "../harrypotter.pdf"
docs_hierarchical = []

try:
    with fitz.open(file_path) as pdf_doc:
        for page_num, page in enumerate(pdf_doc):
            # Extract text from the current page
            page_text = page.get_text()
            
            # Skip empty pages
            if not page_text.strip():
                continue
            
            # Create hierarchical chunks for this page
            page_chunks = create_hierarchical_chunks(page_text, page_num + 1, file_path)
            docs_hierarchical.extend(page_chunks)

    print("✅ Successfully created hierarchical chunks from the PDF.")
    print(f"Filtered chunks for BERT pre-training quality.")
except FileNotFoundError:
    print(f"❌ Error: The file '{file_path}' was not found.")
    
# Print statistics about the hierarchical chunks
level_counts = {}
for chunk in docs_hierarchical:
    level = chunk.metadata.get("chunk_level", "unknown")
    chunk_type = chunk.metadata.get("chunk_type", "unknown")
    key = f"Level {level} ({chunk_type})"
    level_counts[key] = level_counts.get(key, 0) + 1

print(f"\\nTotal valid hierarchical chunks created: {len(docs_hierarchical)}")
print("\\nBreakdown by level:")
for level, count in sorted(level_counts.items()):
    print(f"  {level}: {count} chunks")

# Show example chunks from each level
print("\\n" + "="*50)
print("EXAMPLE CHUNKS FROM EACH LEVEL:")
print("="*50)

for level in [1, 2, 3]:
    example_chunk = next((chunk for chunk in docs_hierarchical 
                         if chunk.metadata.get("chunk_level") == level), None)
    if example_chunk:
        chunk_type = example_chunk.metadata.get("chunk_type", "unknown")
        print(f"\\nLevel {level} ({chunk_type}) Example:")
        print("-" * 30)
        print(f"Content: {example_chunk.page_content[:200]}...")
        print(f"Metadata: {example_chunk.metadata}")

✅ Successfully created hierarchical chunks from the PDF.
Filtered chunks for BERT pre-training quality.
\nTotal valid hierarchical chunks created: 427
\nBreakdown by level:
  Level 1 (section): 57 chunks
  Level 2 (paragraph): 99 chunks
  Level 3 (sentence): 271 chunks
EXAMPLE CHUNKS FROM EACH LEVEL:
\nLevel 1 (section) Example:
------------------------------
Content: M
 
CHAPTER  ONE
THE BOY WHO LIVED
r. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the
last people you’d expect to ...
Metadata: {'source': '../harrypotter.pdf', 'page_number': 1, 'chunk_type': 'section', 'chunk_level': 1, 'section_id': 'page_1_section_0', 'parent_id': 'page_1', 'chunk_index': 0, 'c': 'hierarchical_section', 'ischunk': True}
\nLevel 2 (paragraph) Example:
------------------------------
Content: M
 
CHAPTER  ONE
THE BOY WHO LIVED
r. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly 

### Saving the chunks

In [None]:
import pickle
from langchain.schema import Document
import os

# Create Chunk_files directory if it doesn't exist
os.makedirs("Chunk_files", exist_ok=True)

# Save rule-based chunks (if they exist)
if 'docs_rule' in locals() and docs_rule:
    file_path_rule = "Chunk_files/harry_potter_chunks_rule.pkl"
    try:
        with open(file_path_rule, "wb") as f:
            pickle.dump(docs_rule, f)
        print(f"Successfully saved {len(docs_rule)} rule-based chunks to '{file_path_rule}'.")
    except Exception as e:
        print(f"Error saving rule-based chunks: {e}")
else:
    print("No rule-based chunks to save (docs_rule not defined or empty).")

# Save semantic chunks (if they exist)
if 'docs_semantic' in locals() and docs_semantic:
    file_path_semantic = "Chunk_files/harry_potter_chunks_semantic.pkl"
    try:
        with open(file_path_semantic, "wb") as f:
            pickle.dump(docs_semantic, f)
        print(f"Successfully saved {len(docs_semantic)} semantic chunks to '{file_path_semantic}'.")
    except Exception as e:
        print(f"Error saving semantic chunks: {e}")
else:
    print("No semantic chunks to save (docs_semantic not defined or empty).")

# Save hierarchical chunks (if they exist)
if 'docs_hierarchical' in locals() and docs_hierarchical:
    file_path_hierarchical = "Chunk_files/harry_potter_chunks_hierarchical.pkl"
    try:
        with open(file_path_hierarchical, "wb") as f:
            pickle.dump(docs_hierarchical, f)
        print(f"Successfully saved {len(docs_hierarchical)} hierarchical chunks to '{file_path_hierarchical}'.")
    except Exception as e:
        print(f"Error saving hierarchical chunks: {e}")
else:
    print("No hierarchical chunks to save (docs_hierarchical not defined or empty).")

Successfully saved 1401 rule-based chunks to 'Chunk_files/harry_potter_chunks_rule.pkl'.
No semantic chunks to save (docs_semantic not defined or empty).
Successfully saved 3382 hierarchical chunks to 'Chunk_files/harry_potter_chunks_hierarchical.pkl'.
