# 1 Load dữ liệu

In [1]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader 
from langchain_core.documents import Document

def load_file(path):
    """Load a document from disk with the loader that matches its extension.

    The extension check is case-insensitive, so ``report.PDF`` and
    ``report.pdf`` are handled the same way.

    Args:
        path: Location of the file, as a ``str`` or any ``os.PathLike``
            object. Supported extensions are ``.pdf``, ``.docx`` and ``.txt``.

    Returns:
        Whatever the selected loader's ``load()`` call returns.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    # Local import so the function does not depend on `os` being imported
    # by another notebook cell.
    import os

    file_path = os.fspath(path)
    # Compare against a lowercased copy only; the loaders still receive the
    # path with its original casing.
    lowered = file_path.lower()

    if lowered.endswith('.pdf'):  # PDF file
        return PyPDFLoader(file_path).load()

    if lowered.endswith('.docx'):  # DOCX file
        return Docx2txtLoader(file_path).load()

    if lowered.endswith('.txt'):  # TXT file, read as UTF-8
        return TextLoader(file_path, encoding='utf8').load()

    raise ValueError("Unsupported file format")

  from .autonotebook import tqdm as notebook_tqdm


# 2 Tạo db

In [2]:
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
import os
import torch

# Run the embedding model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)

# Pull variables from a .env file into the process environment.
load_dotenv()
# Assigning os.getenv(...) straight back into os.environ raises TypeError when
# the variable is missing (environment values must be str). The embeddings in
# this cell come from HuggingFaceEmbeddings, so a missing Google key should not
# stop the index from being built: warn instead of crashing.
if not os.getenv("GOOGLE_API_KEY"):
    print("Warning: GOOGLE_API_KEY is not set; cells that call the Google model will fail.")

# 1 Load documents
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
docs = load_file(r"D:\GithubRepositories\RAG-system\docs\doc.txt")

# 2 Split documents into chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

chunks = splitter.split_documents(docs)

# 3 Create embeddings and store in vector database
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': device},
)
db = FAISS.from_documents(chunks, embeddings)
# Persist the index so later cells can reload it from disk.
db.save_local("../db/faiss_index")

Device:  cuda


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load a PDF, split it into chunks, and append those chunks to the FAISS
# index stored on disk.
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
docs = load_file(r"D:\GithubRepositories\RAG-system\docs\Giai_thuat_va_Lap_trinh.pdf")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(docs)
print("Number of chunks: ", len(chunks))
# `embeddings` is not defined in this cell; it must already exist in the
# kernel from an earlier cell.
# NOTE(review): the flag name signals that loading involves unsafe
# deserialization; only load index files you created yourself.
db = FAISS.load_local("../db/faiss_index", embeddings, allow_dangerous_deserialization=True)
# NOTE(review): nothing here checks whether these chunks are already in the
# index, so re-running the cell presumably stores them again - confirm.
db.add_documents(chunks)
# Overwrite the saved index with the extended one.
db.save_local("../db/faiss_index")
print("New documents added to the vector database.")

Number of chunks:  887
New documents added to the vector database.


# 3 Khởi tạo RAG

In [23]:
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
import os
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Pull variables from a .env file into the process environment.
load_dotenv()
# The Google model created below needs an API key. Assigning os.getenv(...)
# straight back into os.environ raises an opaque TypeError when the variable is
# missing, so fail early with an actionable message instead.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Define it in the environment or in a .env file."
    )

# `embeddings` is not defined in this cell; it must already exist in the
# kernel from the cell that built the index.
# NOTE(review): the flag name signals that loading involves unsafe
# deserialization; only load index files you created yourself.
db = FAISS.load_local("../db/faiss_index", embeddings, allow_dangerous_deserialization=True)

# Vector-similarity retriever asking for 5 results per query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

model = GoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.7)

# System prompt (Vietnamese): answer only from the supplied context, reply with
# a fixed "not in the provided documents" sentence otherwise, and keep the
# answer to at most three sentences. {context} is the placeholder for the
# retrieved text.
system_prompt = (
    """
        Hãy sử dụng đúng ngữ cảnh được cung cấp để trả lời câu hỏi.
        Nếu không tìm thấy câu trả lời trong ngữ cảnh, hãy nói "Thông tin này không có trong tài liệu được cung cấp.".
        Trả lời ngắn gọn tối đa ba câu. 
        Ngữ cảnh: {context}
    """
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Combine the model and prompt into a question-answering chain, then put the
# retriever in front of it.
question_answer_chain = create_stuff_documents_chain(model, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)


In [19]:
query = "Merge sort là gì?"
# invoke() returns a dict of the form
# {"input": query, "answer": "...", "context": [...]}, where "context" is the
# list of passages retrieved from the documents and "answer" is the model's
# reply.
result = chain.invoke({"input": query})
print(result["answer"])

# Show which documents the retrieved passages came from.
for d in result["context"]:
    # Use .get() so a passage without a "page" (or "source") entry does not
    # abort the listing with a KeyError.
    # NOTE(review): chunks loaded from .txt/.docx files presumably carry no
    # page number - confirm against the loaders.
    print("Source: ", d.metadata.get("source", "unknown"), "Page: ", d.metadata.get("page", "n/a"))

Thông tin này không có trong tài liệu được cung cấp.
Source:  D:\GithubRepositories\RAG-system\docs\Giai_thuat_va_Lap_trinh.pdf Page:  124
Source:  D:\GithubRepositories\RAG-system\docs\Giai_thuat_va_Lap_trinh.pdf Page:  132
Source:  D:\GithubRepositories\RAG-system\docs\Giai_thuat_va_Lap_trinh.pdf Page:  124


In [24]:
from langchain_community.retrievers import BM25Retriever

# Build a keyword-based BM25 retriever over the PDF's chunks.
# The PDF is loaded and split again here with the same chunk_size and
# chunk_overlap values, rather than relying on `chunks` left over from an
# earlier cell.
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
docs = load_file(r"D:\GithubRepositories\RAG-system\docs\Giai_thuat_va_Lap_trinh.pdf")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(docs)
bm25_retriever = BM25Retriever.from_documents(chunks)
# Number of results to return per query.
bm25_retriever.k = 5

In [22]:
def rrf(docs_list, k=60):
    """Merge several ranked document lists with Reciprocal Rank Fusion.

    Each document earns ``1 / (k + rank + 1)`` for every list it appears in
    (``rank`` is zero-based), and documents are returned by total score,
    highest first. Ties keep the order in which documents were first seen.

    Documents are matched across lists by ``metadata["id"]`` when present.
    When it is absent, a key built from the ``source`` and ``page`` metadata
    plus the text is used instead. Without that fallback every document
    without an id would collapse onto the single key ``None`` and the final
    lookup would raise ``KeyError``.

    Args:
        docs_list: Iterable of ranked lists. Each item needs a ``metadata``
            dict and, for the fallback key, a ``page_content`` attribute.
        k: Constant added to the rank in the score denominator. Larger values
            reduce the score gap between top and lower ranks.

    Returns:
        A list with one document per distinct key, ordered by descending
        fused score.
    """

    def _doc_key(doc):
        # Prefer an explicit id; otherwise identify the chunk by where it came
        # from and what it says.
        doc_id = doc.metadata.get("id")
        if doc_id is not None:
            return doc_id
        return (
            doc.metadata.get("source"),
            doc.metadata.get("page"),
            getattr(doc, "page_content", None),
        )

    scores = {}
    doc_by_key = {}
    for docs in docs_list:
        for rank, doc in enumerate(docs):
            key = _doc_key(doc)
            scores[key] = scores.get(key, 0) + 1 / (k + rank + 1)
            doc_by_key[key] = doc

    # Sort by fused score, highest first. sorted() is stable, so equal scores
    # keep first-seen order.
    ranked_keys = sorted(scores, key=scores.get, reverse=True)
    return [doc_by_key[key] for key in ranked_keys]

In [None]:
def hybrid_retrieve(query, top_k=5):
    """Retrieve documents with both BM25 and vector search, fused by RRF.

    Uses the module-level ``bm25_retriever`` and ``retriever`` objects and the
    ``rrf`` function defined in other cells.

    Args:
        query: The search string passed to both retrievers.
        top_k: Maximum number of fused documents to return. Defaults to 5.

    Returns:
        The first ``top_k`` documents of the fused ranking.
    """
    # invoke() is the current retriever entry point; get_relevant_documents()
    # is deprecated in LangChain.
    bm25_docs = bm25_retriever.invoke(query)
    vector_docs = retriever.invoke(query)

    docs_rrf = rrf([bm25_docs, vector_docs])
    return docs_rrf[:top_k]