### Data Loading

In [None]:
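# Dependencies used by this notebook (uncomment to install into the kernel's environment):
# %pip install langchain langchain-community langchain-experimental
# %pip install pymupdf sentence-transformers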

In [4]:
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path to the source PDF, relative to the notebook's working directory.
file_path: str = "../harrypotter.pdf"
# Collects the chunk objects appended by the splitting cell that follows.
docs: list = list()

### Text Splitter (chunking strategy)

In [None]:
from pathlib import Path

# Chunking parameters, in characters. Consecutive chunks share CHUNK_OVERLAP
# characters so text that straddles a boundary appears in both neighbours.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

# Start from an empty list so that re-running this cell does not append a
# second copy of every chunk to a `docs` list left over from an earlier run.
docs = []

# The splitter's configuration does not depend on the page, so build it once
# rather than once per page.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""],
)

# Check for the file explicitly instead of relying on the exception type the
# PDF library raises for a missing path.
# NOTE(review): recent PyMuPDF releases appear to raise their own
# `fitz.FileNotFoundError` rather than the builtin one - confirm.
if not Path(file_path).is_file():
    print(f"Error: The file '{file_path}' was not found. Please make sure the file exists.")
    # Raise instead of calling exit(): an exception stops "Run All" at this
    # cell while leaving the kernel (and its state) alive for debugging.
    raise FileNotFoundError(file_path)

with fitz.open(file_path) as pdf_doc:
    # start=1 so page_num is already the 1-based page number stored in metadata.
    for page_num, page in enumerate(pdf_doc, start=1):
        # Extract text from the current page
        page_text = page.get_text()

        # Skip pages without any text (matches the semantic chunking cell).
        if not page_text.strip():
            continue

        # Split this page on its own so every chunk maps to exactly one page.
        for chunk in text_splitter.create_documents([page_text]):
            chunk.metadata.update({"source": file_path, "page_number": page_num})
            docs.append(chunk)

print("Successfully loaded and chunked the book content from the PDF with page numbers.")

# Let's print some information about the chunks to verify
print(f"Total number of chunks created: {len(docs)}")
if docs:
    print("\nHere is the content of the first chunk:")
    print("---------------------------------------")
    print(docs[0].page_content)
    print("---------------------------------------")
else:
    # Avoid an IndexError on docs[0] when no page yielded any text.
    print("\nNo text could be extracted from the PDF, so no chunks were created.")

### Semantic-Aware Chunking

In [6]:
from pathlib import Path

import fitz  # PyMuPDF
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

# File path to your PDF
file_path = "../harrypotter.pdf"

# Name of the sentence-embedding model the semantic chunker uses.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# A list to store chunks (reset here so re-running the cell never duplicates them)
docs = []

# Fail fast, before the embedding model is loaded, if the PDF is missing.
# The check is explicit rather than relying on the exception type the PDF
# library raises for a missing path.
# NOTE(review): recent PyMuPDF releases appear to raise their own
# `fitz.FileNotFoundError` rather than the builtin one - confirm.
if not Path(file_path).is_file():
    print(f"❌ Error: The file '{file_path}' was not found. Please make sure the file exists.")
    # Raise instead of calling exit(): an exception stops "Run All" at this
    # cell while leaving the kernel (and its state) alive for debugging.
    raise FileNotFoundError(file_path)

# Initialize HuggingFace embeddings and the semantic chunker built on them.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
text_splitter = SemanticChunker(embeddings)

with fitz.open(file_path) as pdf_doc:
    # start=1 so page_num is already the 1-based page number stored in metadata.
    for page_num, page in enumerate(pdf_doc, start=1):
        # Extract text from the current page
        page_text = page.get_text()

        # Skip empty pages
        if not page_text.strip():
            continue

        # Split this page on its own so every chunk maps to exactly one page,
        # then tag each chunk with its source file and page number.
        for chunk in text_splitter.create_documents([page_text]):
            chunk.metadata.update({"source": file_path, "page_number": page_num})
            docs.append(chunk)

print("✅ Successfully loaded and chunked the book content from the PDF with semantic awareness + page numbers.")

# Print some information about the chunks to verify
print(f"Total number of chunks created: {len(docs)}")
if docs:
    print("\nHere is the content of the first chunk:")
    print("---------------------------------------")
    print(docs[0].page_content)
    print("---------------------------------------")
else:
    # Avoid an IndexError on docs[0] when no page yielded any text.
    print("\nNo text could be extracted from the PDF, so no chunks were created.")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ Successfully loaded and chunked the book content from the PDF with semantic awareness + page numbers.
Total number of chunks created: 657

Here is the content of the first chunk:
---------------------------------------
M
 
CHAPTER  ONE
THE BOY WHO LIVED
r.
---------------------------------------


### Saving the chunks

In [7]:
import os
import pickle
from langchain.schema import Document  # not referenced in this cell; kept in case other cells rely on it

# Persist the chunk list to disk so later steps can reload it without
# re-running the chunking cells.
# Assumes the 'docs' list is already created from the previous step.

# NOTE(review): this rebinds `file_path`, which the chunking cells use for the
# PDF path. Re-running the first chunking cell after this one would pick up
# the pickle path instead - consider a dedicated name once callers are checked.
file_path = "harry_potter_chunks.pkl"

# Write to a temporary sibling file first and move it into place afterwards,
# so a failed dump never leaves a truncated pickle at `file_path`.
tmp_path = file_path + ".tmp"

try:
    with open(tmp_path, "wb") as f:  # 'wb' mode for writing in binary
        pickle.dump(docs, f)
    os.replace(tmp_path, file_path)
except Exception as e:
    print(f"Error saving file: {e}")
    # Remove the partial temp file, then re-raise so "Run All" stops here
    # instead of continuing with a missing or stale pickle.
    if os.path.exists(tmp_path):
        os.remove(tmp_path)
    raise

print(f"Successfully saved {len(docs)} chunks to '{file_path}'.")

# Security note: only unpickle this file from a trusted location; loading a
# pickle executes code embedded in it.

Successfully saved 657 chunks to 'harry_potter_chunks.pkl'.
