In [None]:
import faiss
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore

In [None]:
# Read everything under the data directory and report how many items were loaded.
# The printed label is Thai for "number of files".
DATA_DIR = "data"

reader = SimpleDirectoryReader(input_dir=DATA_DIR)
docs = reader.load_data()

doc_count = len(docs)
print(f"จำนวนไฟล์: {doc_count}")

In [None]:
# Two TokenTextSplitter configurations to compare: a fine-grained one and a coarser one.
SMALL_CHUNKING = {"chunk_size": 32, "chunk_overlap": 8}
LARGE_CHUNKING = {"chunk_size": 128, "chunk_overlap": 16}

splitter_small = TokenTextSplitter(**SMALL_CHUNKING)
splitter_large = TokenTextSplitter(**LARGE_CHUNKING)

# Split the same documents with each splitter (small first, then large).
nodes_small, nodes_large = (
    splitter.get_nodes_from_documents(docs)
    for splitter in (splitter_small, splitter_large)
)


In [None]:
# Single embedding model instance shared by every index built in this notebook.
EMBED_MODEL_NAME = "BAAI/bge-m3"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)


In [None]:
# Build the FAISS-backed index over the small (chunk_size=32) nodes.
#
# FaissVectorStore wraps an already-constructed faiss index; it does not accept a
# `dim` keyword. The embedding width is measured by embedding a short probe string
# instead of reading an `embed_dim` attribute, which the embedding model is not
# documented to expose.
embed_dim_small = len(embed_model.get_text_embedding("dimension probe"))
faiss_store_small = FaissVectorStore(faiss_index=faiss.IndexFlatL2(embed_dim_small))
storage_context_small = StorageContext.from_defaults(vector_store=faiss_store_small)
index_small = VectorStoreIndex(
    nodes_small, embed_model=embed_model, storage_context=storage_context_small
)

In [None]:
# Build the FAISS-backed index over the large (chunk_size=128) nodes.
#
# FaissVectorStore wraps an already-constructed faiss index; it does not accept a
# `dim` keyword. The embedding width is measured by embedding a short probe string
# instead of reading an `embed_dim` attribute, which the embedding model is not
# documented to expose.
embed_dim_large = len(embed_model.get_text_embedding("dimension probe"))
faiss_store_large = FaissVectorStore(faiss_index=faiss.IndexFlatL2(embed_dim_large))
storage_context_large = StorageContext.from_defaults(vector_store=faiss_store_large)
index_large = VectorStoreIndex(
    nodes_large, embed_model=embed_model, storage_context=storage_context_large
)

# SEARCHING


from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.prompts.default_prompts import DEFAULT_SIMPLE_INPUT_PROMPT

In [None]:
# Ask the same question against both indexes and print each answer with the
# source chunks it was retrieved from.
question = "RAG สำคัญกับ AI อย่างไร"

retriever_small = VectorIndexRetriever(index=index_small, similarity_top_k=2)
retriever_large = VectorIndexRetriever(index=index_large, similarity_top_k=2)

# RetrieverQueryEngine's constructor has no `input_prompt` parameter, so passing it
# fails at construction time. The prompt that was being passed contains only the
# query placeholder and would not include retrieved context, so the engines are
# built with the default response synthesizer instead.
engine_small = RetrieverQueryEngine(retriever=retriever_small)
engine_large = RetrieverQueryEngine(retriever=retriever_large)

response_small = engine_small.query(question)
response_large = engine_large.query(question)


def _print_answer(chunk_size, response):
    """Print the answer text, then the content of each source node on its own line.

    Args:
        chunk_size: Chunk size shown in the heading for this result.
        response: Query response providing `.response` and `.source_nodes`.
    """
    print(f"[FAISS][chunk_size={chunk_size}] คำตอบ:", response.response)
    print("\nSource nodes:")
    for source in response.source_nodes:
        print("-", source.get_content())


_print_answer(32, response_small)
print("\n----------------------\n")
_print_answer(128, response_large)