# PDF RAG System with Vietnamese Language Support

This notebook implements a Retrieval-Augmented Generation (RAG) system for PDF documents with support for Vietnamese language using:
- LangChain for the RAG pipeline
- Qdrant for vector storage
- Jina AI for embeddings and Google Gemini for generation
- PyPDF and PyMuPDF for PDF processing

## Install Required Packages

First, let's install all necessary packages:

## Import Dependencies

In [None]:
# -----------------------------
# üîë  PLACEHOLDER CONFIG
# -----------------------------
import os

# Resolve each API key from the environment. When a variable is missing,
# `setdefault` stores the "your_api_key" placeholder in os.environ and returns
# it, so libraries that read these variables always find them set.
# -- Google Gemini (chat model)
GOOGLE_API_KEY = os.environ.setdefault("GOOGLE_API_KEY", "your_api_key")
# -- Jina AI embeddings
JINA_API_KEY = os.environ.setdefault("JINA_API_KEY", "your_api_key")
# -----------------------------

import tempfile
import fitz  # PyMuPDF
from typing import List
import google.generativeai as genai

# LangChain / LLM + Embeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.embeddings import JinaEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from langchain_qdrant import Qdrant, QdrantVectorStore, RetrievalMode
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# Hand the Gemini key to the google.generativeai SDK.
genai.configure(api_key=GOOGLE_API_KEY)

# Initialise Jina embeddings up-front so we can reuse them everywhere.
# `text_embeddings` is the module-level embedding object that
# `create_vector_store` passes to the Qdrant vector store.
text_embeddings = JinaEmbeddings(
    jina_api_key=JINA_API_KEY, model_name="jina-embeddings-v3"
)
#Qdrant


  from .autonotebook import tqdm as notebook_tqdm


## PDF Processing Functions

We'll implement enhanced PDF extraction with support for Vietnamese text using PyMuPDF (fitz):

In [3]:
def extract_text_from_pdf(pdf_path: str) -> List[Document]:
    """
    Extract the text layer of a PDF, producing one Document per non-empty page.

    Each page is read with PyMuPDF's plain-text extractor
    (``page.get_text("text")``). Pages whose text is empty or whitespace-only
    are skipped, so a PDF without any extractable text yields an empty list.

    Extraction is best-effort: any exception raised while opening or reading
    the file is printed and swallowed, and whatever pages were collected
    before the failure are still returned.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects. ``page_content`` is the page text and the
        metadata holds ``source`` (the path as passed in), ``page_number``
        (1-based) and ``total_pages``.
    """
    documents: List[Document] = []

    try:
        # Open through a context manager so the file handle is released even
        # when a page fails part-way through.
        with fitz.open(pdf_path) as pdf_document:
            # The page count is constant for the whole loop; read it once.
            total_pages = len(pdf_document)

            for page_number, page in enumerate(pdf_document, start=1):
                text = page.get_text("text")

                # Skip pages with no extractable text.
                if not text.strip():
                    continue

                documents.append(
                    Document(
                        page_content=text,
                        metadata={
                            "source": pdf_path,
                            "page_number": page_number,
                            "total_pages": total_pages,
                        },
                    )
                )
    except Exception as e:
        # Deliberately broad: callers treat an empty/partial result as
        # "no usable text layer" rather than as a fatal error.
        print(f"Error extracting text from PDF: {e}")

    return documents

def split_documents(
    documents: List[Document],
    chunk_size: int = 1024,
    chunk_overlap: int = 200,
) -> List[Document]:
    """
    Break documents into overlapping chunks with a recursive character splitter.

    The splitter tries the separators in order (paragraph break, line break,
    sentence punctuation, comma, space, then any character) and keeps the
    separator attached to the resulting text.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk size, in characters.
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The chunked documents, as returned by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        "keep_separator": True,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    chunks = splitter.split_documents(documents)
    return chunks

## Extract text from scanned PDF

Convert a scanned (image-only) PDF into LangChain documents using OCR.

In [4]:
import tqdm

In [5]:
from __future__ import annotations
from pathlib import Path
from typing import List, Literal, Optional
from pdf2image import convert_from_path
from langchain.docstore.document import Document
import pytesseract
from tqdm.auto import tqdm
import os

# poppler_path = r"D:\poppler\Library\bin"  # Make sure Poppler is installed at this path

def extract_scan_pdf(
    pdf_path: str | Path,
    *,
    lang: str = "vie",
    dpi: int = 300,
    poppler_path: Optional[str] = None,
    extra_tess_config: str = "--psm 6",
) -> List[Document]:
    """
    Convert a scanned PDF into a list of LangChain ``Document`` objects via OCR.

    The PDF is rendered to one image per page with ``pdf2image`` and each image
    is passed to Tesseract through ``pytesseract.image_to_string``.

    Parameters
    ----------
    pdf_path : str | Path
        Path to the PDF file.
    lang : str, default ``"vie"``
        Tesseract language code (for example ``"vie"`` or ``"eng+vie"``).
    dpi : int, default 300
        Rendering resolution for the page images. Higher values are expected
        to improve OCR accuracy at the cost of speed.
    poppler_path : str | None
        Directory containing the Poppler binaries. When ``None``, the legacy
        ``D:\\poppler\\Library\\bin`` directory is used if it exists;
        otherwise ``None`` is forwarded so Poppler is looked up on ``PATH``.
    extra_tess_config : str
        Extra Tesseract configuration flags (e.g. ``--oem 1``, ``--psm 4``).

    Returns
    -------
    List[Document]
        One ``Document`` per page, in page order. Metadata holds ``page``
        (1-based) and ``source`` (the file name). A page whose OCR fails is
        still included, with empty ``page_content``. An empty list is returned
        when the PDF cannot be rendered to images.

    Raises
    ------
    FileNotFoundError
        If ``pdf_path`` does not exist.
    """
    pdf_path = Path(pdf_path).expanduser().resolve()
    if not pdf_path.exists():
        raise FileNotFoundError(f"Kh√¥ng t√¨m th·∫•y file: {pdf_path}")

    # 1) PDF -> images
    print("Converting PDF to Image")

    # Honour the caller's poppler_path. Previously it was always overwritten
    # with a hard-coded Windows directory, which ignored the argument and broke
    # the conversion on any machine without that exact directory. The legacy
    # location is now only a fallback, and only when it is actually present.
    if poppler_path is None:
        legacy_poppler_dir = r"D:\poppler\Library\bin"
        if Path(legacy_poppler_dir).is_dir():
            poppler_path = legacy_poppler_dir

    try:
        images = convert_from_path(
            pdf_path.as_posix(),
            dpi=dpi,
            poppler_path=poppler_path,
        )
    except Exception as e:
        # Best-effort: report the failure and return no pages.
        print(f"Error converting PDF to images: {e}")
        return []

    # 2) OCR every page
    print("OCR each Pages")
    docs: List[Document] = []
    for idx, img in enumerate(tqdm(images, desc="üîç OCR pages", unit="page"), start=1):
        try:
            text = pytesseract.image_to_string(
                img, lang=lang, config=extra_tess_config
            )
        except Exception as e:
            print(f"Error OCR page {idx}: {e}")
            text = ""  # OCR failed: keep the page with empty content

        docs.append(
            Document(
                page_content=text,
                metadata={"page": idx, "source": pdf_path.name},
            )
        )

    return docs


  from .autonotebook import tqdm as notebook_tqdm


## Create Vector Store with Qdrant

In [6]:
from qdrant_client.http.exceptions import UnexpectedResponse
# Assumes `text_embeddings`, `Document`, and `Qdrant` are already defined/imported by earlier cells.

def create_vector_store(
    documents: List[Document],
    collection_name: str = "vietnamese_book_pdf_vectors",
    recreate: bool = False,
    *,
    url: str = "http://localhost:6333",
    vector_size: int = 1024,
) -> Qdrant:
    """
    Create (or reuse) a Qdrant collection and index the given documents in it.

    Args:
        documents: Documents to embed and add to the collection.
        collection_name: Name of the Qdrant collection.
        recreate: When True, drop the collection and create it again before
            indexing. When False, create it only if it does not exist yet and
            otherwise append to the existing collection.
        url: Address of the Qdrant server.
        vector_size: Dimension of the stored vectors; it has to match the
            output size of the module-level ``text_embeddings`` model.
            NOTE(review): 1024 is the value the original code hard-coded --
            confirm it against the embedding model in use.

    Returns:
        The ``Qdrant`` vector store wrapping the collection, after the
        documents have been added.

    Raises:
        UnexpectedResponse: Any Qdrant HTTP error other than 409 raised while
            preparing the collection.
    """
    client = QdrantClient(url=url)
    vectors_config = VectorParams(size=vector_size, distance=Distance.COSINE)

    # ------------------------------------------------------------------
    # 1) Prepare the collection
    # ------------------------------------------------------------------
    try:
        if recreate:
            # Explicit delete + create replaces the deprecated
            # `recreate_collection` helper.
            client.delete_collection(collection_name=collection_name)
            client.create_collection(
                collection_name=collection_name,
                vectors_config=vectors_config,
            )
        else:
            # Create only when the collection is not there yet.
            existing_names = {
                c.name for c in client.get_collections().collections
            }
            if collection_name not in existing_names:
                client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config,
                )
    except UnexpectedResponse as e:
        # 409 is treated as "collection already exists" and ignored;
        # every other error is propagated.
        if getattr(e, "status_code", None) != 409:
            raise

    # ------------------------------------------------------------------
    # 2) Build the vector store and add the documents
    # ------------------------------------------------------------------
    vector_store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=text_embeddings,
    )
    vector_store.add_documents(documents)
    return vector_store

## Configure RAG Chain with Gemini

In [12]:
# ‚îÄ‚îÄ‚îÄ imports ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_google_genai import ChatGoogleGenerativeAI


# ‚îÄ‚îÄ‚îÄ ONE-SHOT QA (vector search + Gemini) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def answer_question(
    vector_store,
    question: str,
    chat_history: list,
    k: int = 10,
):
    """
    Answer one question with vector search plus Gemini.

    Steps:
      1. Fetch ``k`` passages with ``vector_store.similarity_search``.
      2. Build a system prompt containing those passages, followed by the
         prior conversation and the new question.
      3. Invoke the Gemini chat model and record the exchange.

    Args:
        vector_store: Any object exposing ``similarity_search(query, k=...)``
            that returns items with a ``page_content`` attribute.
        question: The user's question.
        chat_history: List of earlier messages. It is mutated in place: the
            new human message and the model reply are appended to it.
        k: Number of passages to retrieve.

    Returns:
        The ``content`` of the model's reply.
    """
    # 1) Retrieve context ---------------------------------------------------
    retrieved = vector_store.similarity_search(question, k=k)
    passages = [hit.page_content for hit in retrieved]
    context = "\n\n".join(passages)
    if not context:
        # Placeholder used when nothing (or only empty text) was retrieved.
        context = "Kh√¥ng c√≥ ng·ªØ c·∫£nh."

    # 2) Build the prompt ---------------------------------------------------
    instructions = (
        "You are a helpful assistant answering from provided context. "
        "If the question is in Vietnamese, answer in Vietnamese with full "
        "diacritics. If the answer is not in the context, say you don't know."
    )
    system_prompt = f"{instructions}\n\nContext:\n{context}"

    # Message order: system prompt, prior conversation, new question.
    system_turn = [SystemMessage(system_prompt)]
    user_turn = [HumanMessage(content=question)]
    messages = system_turn + chat_history + user_turn

    # 3) Call Gemini --------------------------------------------------------
    sampling = {"top_p": 0.95, "top_k": 40}
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash-preview-04-17",
        temperature=0.5,
        additional_kwargs={"generation_config": sampling},
    )
    print(messages)
    reply = llm.invoke(messages)

    # 4) Record the exchange and return the answer text ----------------------
    chat_history.append(HumanMessage(content=question))
    chat_history.append(reply)
    return reply.content


## File Upload and Processing

In [16]:
# ‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
# ‚ïë  üöÄ  Build the index from a local PDF (no widget, no upload)   ‚ïë
# ‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
import os, tempfile, pathlib


def build_index_from_local(pdf_path: str):
    """
    Index a local PDF: extract its text, chunk it, and store it in Qdrant.

    Text is first read with ``extract_text_from_pdf``; when that yields no
    pages, the file is OCR'd with ``extract_scan_pdf`` instead. The pages are
    split with ``split_documents`` and passed to ``create_vector_store``,
    whose result is assigned to the module-level global ``vector_store``.

    Args:
        pdf_path: Path to the PDF (``~`` is expanded and the path resolved).

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    global vector_store

    resolved = pathlib.Path(pdf_path).expanduser().resolve()
    if not resolved.exists():
        raise FileNotFoundError(f"{resolved} not found")

    print(f"üìÑ  Processing: {resolved.name}")

    source = str(resolved)
    # An empty result from the text extractor triggers the OCR fallback.
    pages = extract_text_from_pdf(source) or extract_scan_pdf(source)
    print(f"- Extracted {len(pages)} pages")

    pieces = split_documents(pages)
    print(f"- Split into {len(pieces)} chunks")

    vector_store = create_vector_store(pieces)

    print("\n‚úÖ  Ready for questions!")


# ▶️  CHANGE THIS TO WHATEVER PDF YOU WANT
build_index_from_local("SGK_Toan9.pdf")


üìÑ  Processing: SGK_Toan9.pdf
Converting PDF to Image
OCR each Pages


üîç OCR pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 119/119 [04:34<00:00,  2.30s/page]


- Extracted 119 pages
- Split into 207 chunks

‚úÖ  Ready for questions!


## Question Answering Interface

In [18]:
# Start with an empty conversation. `answer_question` appends the question
# and the model's reply to this list in place, so reuse it for follow-ups.
chat_history = []
# Example usage: requires `vector_store` to have been built by
# `build_index_from_local` in the previous cell.
# Replace with your actual question
question = "H√£y cho t√¥i bi·∫øt h·ªá th·ª©c l∆∞·ª£ng trong tam gi√°c vu√¥ng?"  # Vietnamese, roughly: "Tell me the metric relations in a right triangle?"
answer = answer_question(vector_store, question, chat_history)
print(f"Question: {question}")
print(f"Answer: {answer}")

[SystemMessage(content='You are a helpful assistant answering from provided context. If the question is in Vietnamese, answer in Vietnamese with full diacritics. If the answer is not in the context, say you don\'t know.\n\nContext:\nkhi xu√¥ng d√¥c l√† 19 kmih.\nSau b√†i h·ªçc n√†y, em ƒë√£ l√†m ƒë∆∞·ª£c nh·ªØng g√¨?\n~ Gi·∫£i th√≠ch ƒë∆∞·ª£c m·ªôt s·ªë h·ªá th·ª©c v·ªÅ c·∫°nh v√† g√≥c trong tam gi√°c vu√¥ng (c·∫°nh g√≥c vu√¥ng\nb·∫±ng c·∫°nh huy·ªÅn nh√†n v·ªõi sin g√≥c ƒë·ªëi ho·∫∑c nh√†n v·ªõi c√¥sin g√≥c k·ªÉ; c·∫°nh g√≥c vu√¥ng b·∫±ng\nc·∫°nh g√≥c vu√¥ng c√≤n l·∫°i nh√¢n v·ªõi tang g√≥c ƒë·ªëi ho·∫∑c nh√¢n v·ªõi c√¥tang g√≥c k·ªÅ).\n~ Gi·∫£i quy·∫øt ƒë∆∞·ª£c m·ªôt s·ªë v·∫•n ƒë·ªÅ th·ª±c ti·ªÖn g·∫Øn v·ªõi t·ªâ s·ªë l∆∞·ª£ng gi√°c c·ªßa g√≥c nh·ªçn (t√≠nh ƒë·ªô d√†i\nƒëoan th·∫≥ng, ƒë·ªô l·ªõn g√≥c; √°p d·ª•ng gi·∫£i tam gi√°c vu√¥ng).\n\n` ma N HU·∫æ TU ch·∫ø ‚Äî‚Äú-.6·∫ø""\nPh·∫ßn HIVH H (V√Ä. | l) |\nPh·∫ßn HIYH H·ª§U V√Ä I) LUUNG\n|\n0h∆∞∆°ng H·ªÜ TH·ª® L∆Ø·ª¢NG TR≈®NG\nIRM B1