### Data Loading

In [None]:
# Optional one-time setup: uncomment the lines below to install the packages
# used by the first chunking cell (`langchain`, and `pymupdf`, which provides `fitz`).
# Prefer `%pip install ...` over `!pip install ...` so the install targets the
# running kernel's environment, and consider pinning versions for reproducibility.
# NOTE(review): later cells also import `langchain_experimental` and
# `langchain_community`, which are not listed here — confirm they are installed.
# !pip install langchain
# !pip install pymupdf

### Text Splitter (rule-based chunking strategy)

In [16]:
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pathlib import Path  # cell-local import: only needed for the existence check below

# Location of the source PDF, relative to the notebook's working directory.
file_path = "../harrypotter.pdf"

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

# Collects every chunk produced from every page of the PDF.
docs_rule = []

# Fail fast with the built-in FileNotFoundError instead of relying on
# `fitz.open()` to raise it: PyMuPDF appears to report a missing file with its
# own exception class, which a plain `except FileNotFoundError` would not
# catch — TODO confirm against the installed PyMuPDF version.
# Raising (rather than calling `exit()`) stops "Run All" at this cell while
# keeping the kernel and its state alive for inspection.
if not Path(file_path).is_file():
    raise FileNotFoundError(
        f"Error: The file '{file_path}' was not found. Please make sure the file exists."
    )

# The splitter configuration is identical for every page, so build it once
# instead of re-creating it inside the page loop.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""],
)

with fitz.open(file_path) as pdf_doc:
    for page_num, page in enumerate(pdf_doc):
        # Extract text from the current page.
        page_text = page.get_text()

        # Split one page at a time so that every chunk can be tagged with the
        # page it came from.
        page_chunks = text_splitter.create_documents([page_text])

        # Tag each chunk with its source file and 1-based page number.
        for chunk in page_chunks:
            chunk.metadata.update({"source": file_path, "page_number": page_num + 1})
            docs_rule.append(chunk)

print("Successfully loaded and chunked the book content from the PDF with page numbers.")

# Print some information about the chunks to verify the result.
print(f"Total number of chunks created: {len(docs_rule)}")
if docs_rule:
    print("\nHere is the content of the first chunk:")
    print("---------------------------------------")
    print(docs_rule[0].page_content)
    print("---------------------------------------")
else:
    # Avoid an IndexError on `docs_rule[0]` when no page produced any chunk.
    print("\nNo chunks were created; no text could be extracted from the PDF.")

Successfully loaded and chunked the book content from the PDF with page numbers.
Total number of chunks created: 1401

Here is the content of the first chunk:
---------------------------------------
M
 
CHAPTER  ONE
THE BOY WHO LIVED
r. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the
last people you’d expect to be involved in anything strange or mysterious,
because they just didn’t hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made drills.
He was a big, beefy man with hardly any neck, although he did have a very
---------------------------------------


### Semantic-Aware Chunking

In [17]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

from pathlib import Path  # cell-local import: only needed for the existence check below

# Location of the source PDF, relative to the notebook's working directory.
file_path = "../harrypotter.pdf"

# Name of the sentence-embedding model handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Collects every semantic chunk produced from every page of the PDF.
docs_semantic = []

# Fail fast with the built-in FileNotFoundError instead of relying on
# `fitz.open()` to raise it: PyMuPDF appears to report a missing file with its
# own exception class, which a plain `except FileNotFoundError` would not
# catch — TODO confirm against the installed PyMuPDF version.
# Raising (rather than calling `exit()`) stops "Run All" at this cell while
# keeping the kernel and its state alive for inspection.
if not Path(file_path).is_file():
    raise FileNotFoundError(
        f"❌ Error: The file '{file_path}' was not found. Please make sure the file exists."
    )

# Build the embedding model and the chunker once, before the PDF is opened:
# neither depends on the document, so the file handle is not held open while
# the model is being set up.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
text_splitter = SemanticChunker(embeddings)

# NOTE: `fitz` is imported by the earlier rule-based chunking cell.
with fitz.open(file_path) as pdf_doc:
    for page_num, page in enumerate(pdf_doc):
        # Extract text from the current page.
        page_text = page.get_text()

        # Skip pages with no text (blank or whitespace-only).
        if not page_text.strip():
            continue

        # Split one page at a time so that every chunk can be tagged with the
        # page it came from.
        page_chunks = text_splitter.create_documents([page_text])

        # Tag each chunk with its source file and 1-based page number.
        for chunk in page_chunks:
            chunk.metadata.update({"source": file_path, "page_number": page_num + 1})
            docs_semantic.append(chunk)

print("✅ Successfully loaded and chunked the book content from the PDF with semantic awareness + page numbers.")

# Print some information about the chunks to verify the result.
print(f"Total number of chunks created: {len(docs_semantic)}")
if docs_semantic:
    print("\nHere is the content of the first chunk:")
    print("---------------------------------------")
    print(docs_semantic[0].page_content)
    print("---------------------------------------")
else:
    # Avoid an IndexError on `docs_semantic[0]` when no page produced any chunk.
    print("\nNo chunks were created; no text could be extracted from the PDF.")

✅ Successfully loaded and chunked the book content from the PDF with semantic awareness + page numbers.
Total number of chunks created: 657

Here is the content of the first chunk:
---------------------------------------
M
 
CHAPTER  ONE
THE BOY WHO LIVED
r.
---------------------------------------


### Saving the chunks

In [20]:
import pickle
from langchain.schema import Document

# Persist both chunk lists built by the chunking cells above (`docs_rule` and
# `docs_semantic`) as pickle files.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# files back from a location you trust.

file_path_rule = "../Chunking/harry_potter_chunks_rule.pkl"
file_path_semantic = "../Chunking/harry_potter_chunks_semantic.pkl"


def _save_chunks(chunks, target_path):
    """Pickle `chunks` to `target_path` and print the outcome.

    Args:
        chunks: The list of chunk objects to serialize.
        target_path: Destination path of the pickle file.

    Failures while opening or writing the file are reported with a printed
    message rather than raised, so one failed save does not prevent the
    other save from being attempted.
    """
    try:
        # Pickle data is binary, so the file must be opened in 'wb' mode.
        with open(target_path, "wb") as out_file:
            pickle.dump(chunks, out_file)
        print(f"Successfully saved {len(chunks)} chunks to '{target_path}'.")
    except Exception as err:
        print(f"Error saving file: {err}")


# The chunk lists are looked up here, outside the helper's try block, so a
# missing list (chunking cell not run) raises a NameError immediately instead
# of being reported as a save error after the output file was already opened.
_save_chunks(docs_rule, file_path_rule)
_save_chunks(docs_semantic, file_path_semantic)

Successfully saved 1401 chunks to '../Chunking/harry_potter_chunks_rule.pkl'.
Successfully saved 657 chunks to '../Chunking/harry_potter_chunks_semantic.pkl'.
