In [1]:
import logging
import sys
from llama_index.core.response.notebook_utils import display_response

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

Load the PDF

In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.readers.file import UnstructuredReader
import uuid
import os


input_dir = "./data"
reader = UnstructuredReader()
documents = []

for root, dirs, files in os.walk(input_dir):
    for file in files:
        if file.lower().endswith(".pdf"):
            file_path = os.path.join(root, file)
            # read the PDF text
            text = reader.load_data(file_path)  # returns a list of Documents
            for doc in text:
                doc.doc_id = str(uuid.uuid4())
                # Save only the PDF file name
                doc.extra_info = {"source_path": file}
                documents.append(doc)

documents

  from .autonotebook import tqdm as notebook_tqdm


INFO:pikepdf._core:pikepdf C++ to Python logger bridge initialized
pikepdf C++ to Python logger bridge initialized
'doc_id' is deprecated and 'id_' will be used instead
'doc_id' is deprecated and 'id_' will be used instead


[Document(id_='2bb98ea3-2eae-4385-8f2b-a0dfcc84a424', embedding=None, metadata={'source_path': 'New Thesis Manuscript Guidelines.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Guidelines Format of the Thesis Study\n\nGENERAL FORMAT\n\nPaper Size\n\nManuscripts shall be printed in “A4” size: 297mm x 210mm white bond paper and a weight not less than 80 grams per square meter (gsm).\n\nPage Margin\n\nThe measurement of margins shall be reckoned from the edge of the page. The following margins shall apply:\n\nTop: 1 inch (2.54 cm) Bottom: 1 inch (2.54 cm) Right: 1 inch (2.54 cm) Left: 1.5 inches (3.81 cm)\n\nFont Characteristics\n\nThe following font characteristics shall be applied to the whole manuscript (including page numbers):\n\nFace: Size: Line Spacing: Double Color: Alignment: Style: Scale: Spacing: Position: Page numbe

In [3]:
from sentence_transformers import SentenceTransformer
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core import StorageContext, Settings
from pydantic import PrivateAttr


class MyEmbeddings(BaseEmbedding):
    _model: any = PrivateAttr()

    def __init__(self, model):
        super().__init__()
        self._model = model  # HuggingFace model

    def _get_text_embedding(self, text: str):
        return self._model.encode(text).tolist()

    def _get_query_embedding(self, query: str):
        return self._model.encode(query).tolist()

    async def _aget_text_embedding(self, text: str):
        return self._get_text_embedding(text)

    async def _aget_query_embedding(self, query: str):
        return self._get_query_embedding(query)
    
hf_model = SentenceTransformer("all-MiniLM-L6-v2")
embed_model = MyEmbeddings(hf_model)

# register your embedding model globally
Settings.embed_model = embed_model
# Explicitly disable LLM usage
Settings.llm = None

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Load pretrained SentenceTransformer: all-MiniLM-L6-v2
LLM is explicitly disabled. Using MockLLM.


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

class PdfBasedCustomSplitter:
    """
    Custom text splitter for PDF documents that splits text recursively
    using a combination of separators and a maximum token limit with overlap.
    """
    def __init__(self, chunk_overlap_rate: float, max_tokens: int = 1000, separators: list[str] | None = None):
        """
        :param chunk_overlap_rate: Fraction of overlap between chunks (0.0 - 1.0)
        :param max_tokens: Approximate maximum number of tokens per chunk
        :param separators: List of separators for recursive splitting
        """
        self.chunk_overlap_rate = chunk_overlap_rate
        self.max_tokens = max_tokens
        self.separators = separators or ["\n\n", "\n", " ", ""]

    def split_text(self, text: str) -> list[str]:
        # Estimate number of splits based on tokens
        tokens = text.split()
        num_splits = (len(tokens) // self.max_tokens) + 1
        chunk_size = max(1, len(text) // num_splits)
        chunk_overlap = int(chunk_size * self.chunk_overlap_rate)

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=self.separators,
        )
        return splitter.split_text(text)


Context Augmented

In [5]:
from llama_index.core.schema import TextNode


class ContextAugmentNodeProcessor:
    def __init__(self, whole_text: str):
        self.whole_text = whole_text

    def process_node(self, node: TextNode) -> TextNode:
        # Store the local chunk in metadata
        node.metadata["chunk_text"] = node.text
        # Augment the node text with the whole PDF context
        node.text = self.whole_text + "\n\n" + node.text
        return node

In [6]:
splitter = PdfBasedCustomSplitter(chunk_overlap_rate=0.1, max_tokens=500)
nodes = []
for doc in documents:
    chunks = splitter.split_text(doc.text)
    
    postprocessor = ContextAugmentNodeProcessor(doc.text)
    nodes.extend([postprocessor.process_node(TextNode(text=chunk)) for chunk in chunks])


In [7]:
import weaviate
from llama_index.vector_stores.weaviate import WeaviateVectorStore


connection_config = {"port": 8080, "grpc_port": 50051, "skip_init_checks": True}
client = weaviate.connect_to_local(**connection_config)
client.collections.delete_all()
vector_store = WeaviateVectorStore(weaviate_client=client, index_name="DocumentChunk")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: DELETE http://localhost:8080/v1/schema/DocumentChunk "HTTP/1.1 200 OK"
HTTP Request: DELETE http://localhost:8080/v1/schema/DocumentChunk "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema/DocumentChunk "HTTP/1.1 404 Not Found"
HTTP Request: GET http://localhost:8080/v1/schema/DocumentChunk "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: POST http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:8080/v1/schema "HTTP/1.1 200 OK"


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.10it/s]

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"





INFO:httpx:HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"


In [8]:
from llama_index.llms.ollama import Ollama


# Example: using llama3 model locally
llm = Ollama(model="gemma3:1b", base_url="http://localhost:11434",request_timeout=300)



In [9]:
from llama_index.core.chat_engine import CondenseQuestionChatEngine

# First build a query engine
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=2
)

# Then wrap it with chat engine (streaming enabled)
chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine=query_engine,
    llm=llm,
    streaming=True
)

# Run streaming chat
response_stream = chat_engine.stream_chat("Tell me about Japan visa")

print("\n--- Streaming Answer ---\n")
for token in response_stream.response_gen:
    print(token, end="", flush=True)

print("\n\n✅ Done")


INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:llama_index.core.chat_engine.condense_question:Querying with: Tell me about Japan visa
Querying with: Tell me about Japan visa


Batches: 100%|██████████| 1/1 [00:00<00:00, 69.40it/s]


--- Streaming Answer ---






INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
The validity of Single Entry visa is 3 months, for Multiple Entry is either 5 years or 10 years. It is still be the discretion of the embassy. Your multiple entry may not be granted and can be Single Entry. Passport is a very important document that need to provide. You need to sign your passport when applying for visa. If your old passport that have Japan Visa is lost you need to still acknowledge it but it will be a lost passport in your application.

All the payments that have been done is not refundable even if you are not granted a Multiple Entry.

Results cannot be done over the phone, the results will only be known when the applicants pick up their passports.

If you want to apply for long term in Japan the person you know may help you process a Certificate of Eligibility in the Japan.

All Japanese documents can be a original or sc

In [10]:
import json


# Specify the class name
class_name = "DocumentChunk"

# Access the collection
collection = client.collections.use(class_name)

# Initialize a list to store all chunks
all_chunks = []

# Iterate over all objects in the collection
for obj in collection.iterator():
    all_chunks.append({
        "doc_id": str(obj.properties.get("doc_id")),
        "text": obj.properties.get("text"),
        "source": obj.properties.get("source"),
        "chunk_text": obj.properties.get("chunk_text")
    })

# Save the chunks to a JSON file
with open("weaviate_chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print(f"Saved {len(all_chunks)} chunks to weaviate_chunks.json")


Saved 10 chunks to weaviate_chunks.json


In [11]:
Settings.llm = None

query_engine = index.as_retriever(
    vector_store_query_mode="hybrid", similarity_top_k=2, llm=None
)
response = query_engine.retrieve(
    "fairview",
)
response

LLM is explicitly disabled. Using MockLLM.


Batches: 100%|██████████| 1/1 [00:00<00:00, 13.94it/s]


[NodeWithScore(node=TextNode(id_='7caaf97c-366f-4cd1-a3ed-1a4fcffd0dfb', embedding=[-0.009340844117105007, -0.04449395835399628, -0.006907307077199221, 0.00946510024368763, -0.030002763494849205, -0.01704428158700466, -0.029617222025990486, 0.013412820175290108, -0.033597636967897415, 0.038043685257434845, 0.02901093102991581, -0.011669104918837547, -0.08081034570932388, 0.0025478550232946873, 0.024813009425997734, 0.009965886361896992, 0.014109374023973942, -0.05593471601605415, -0.03745715692639351, 0.04233357682824135, 0.0657731145620346, -0.06729663908481598, -0.032790228724479675, 0.016060587018728256, -0.03519478812813759, -0.0814952626824379, -0.04516886547207832, 0.06121789291501045, -0.06065722182393074, -0.07139606028795242, 0.037777744233608246, 0.05656404048204422, -0.002618806902319193, 0.006189359817653894, 0.1027594730257988, -0.04652105271816254, -0.034342531114816666, -0.04316895455121994, 0.0032359270844608545, -0.11129771173000336, 0.0285821370780468, 0.0421647652983