In [36]:
import os
import time
import traceback
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch


In [37]:
# Directory scanned (non-recursively, glob "*.txt") by create_vector_db() for the
# cleaned source documents. Relative path: resolved against the kernel's
# current working directory.
DATA_PATH = "output/cleaned_text"

# Hugging Face model name handed to HuggingFaceEmbeddings, both when the index
# is built and when it is reloaded. The two must match.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Seq2seq model loaded by setup_qa_chain() to generate the answers.
# NOTE(review): the recorded run below cites an Arabic-titled source file and
# the printed answer is just "[" - confirm this model covers the corpus language.
LLM_MODEL = "google/flan-t5-base" # 'base' favours speed; a larger FLAN-T5 variant can be substituted for accuracy

# Where the FAISS index is persisted. setup_qa_chain() loads from here when the
# path exists; otherwise create_vector_db() builds the index and saves it here.
DB_FAISS_PATH = "vectorstore/db_faiss"


In [38]:
def create_vector_db():
    """
    Build a FAISS vector store from the ``*.txt`` files in ``DATA_PATH`` and
    save it to ``DB_FAISS_PATH``.

    Steps: load the documents, split them into overlapping chunks, embed the
    chunks with ``EMBEDDING_MODEL``, index them with FAISS and persist the
    index to disk. Progress is reported with ``print``.

    Returns:
        The FAISS vector store on success. ``None`` when no documents are
        found, when splitting yields no chunks, or when any step raises; in
        the last case the traceback is printed instead of being propagated.
    """
    try:
        print("--- Starting: Loading documents ---")
        loader = DirectoryLoader(DATA_PATH, glob="*.txt", show_progress=True)
        documents = loader.load()
        if not documents:
            print(f"FAILED: No documents found in {DATA_PATH}. Please check the path.")
            return None
        print(f"--- Finished: Loaded {len(documents)} documents ---")

        print("--- Starting: Splitting documents into chunks ---")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        docs = text_splitter.split_documents(documents)
        if not docs:
            # Files that are empty (or whitespace only) can split into zero
            # chunks. Building an index from nothing fails deep inside FAISS
            # with an unhelpful traceback, so report the real cause here and
            # skip loading the embedding model altogether.
            print(f"FAILED: The documents in {DATA_PATH} produced no text chunks. Are the files empty?")
            return None
        print(f"--- Finished: Split into {len(docs)} chunks ---")

        print("--- Starting: Creating embeddings (this process can take time) ---")
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
        print("--- Finished: Embeddings model loaded ---")

        print("--- Starting: Creating and saving FAISS vector store ---")
        db = FAISS.from_documents(docs, embeddings)
        db.save_local(DB_FAISS_PATH)
        print(f"--- Finished: Vector store created and saved at {DB_FAISS_PATH} ---")
        return db
    except Exception:
        # Deliberate best-effort: callers test for None instead of catching.
        print("\n--- AN ERROR OCCURRED WHILE CREATING THE VECTOR DATABASE ---")
        traceback.print_exc()
        return None


In [39]:
def _load_saved_store():
    """Load the FAISS index stored at DB_FAISS_PATH using EMBEDDING_MODEL."""
    print("--- Loading existing vector store ---")
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data; keep DB_FAISS_PATH pointed at an index this notebook created.
    store = FAISS.load_local(DB_FAISS_PATH, embedder, allow_dangerous_deserialization=True)
    print("--- Vector store loaded successfully ---")
    return store


def _build_llm():
    """Load LLM_MODEL and wrap it as a LangChain text2text-generation LLM."""
    print("--- Starting: Setting up LLM (this can take a long time on the first run) ---")
    seq2seq_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL)

    # 'temperature' and 'top_p' are intentionally left out to avoid warnings.
    generation_settings = dict(max_length=512, repetition_penalty=1.2)
    generator = pipeline(
        "text2text-generation",
        model=seq2seq_model,
        tokenizer=seq2seq_tokenizer,
        **generation_settings,
    )

    wrapped_llm = HuggingFacePipeline(pipeline=generator)
    print("--- Finished: LLM setup complete ---")
    return wrapped_llm


def _build_prompt():
    """Create the prompt that restricts the model to the retrieved context."""
    # The template text is runtime data sent to the model; keep it verbatim.
    template_text = """
Gunakan potongan konteks berikut untuk menjawab pertanyaan di akhir. Jawablah hanya berdasarkan konteks yang diberikan. Jika Anda tidak tahu jawabannya dari konteks, katakan saja bahwa Anda tidak dapat menemukan jawaban dalam dokumen yang disediakan. Jangan mencoba mengarang jawaban.

Konteks:
{context}

Pertanyaan: {question}
Jawaban:
"""
    return PromptTemplate(
        template=template_text, input_variables=["context", "question"]
    )


def setup_qa_chain():
    """
    Assemble the retrieval question-answering chain.

    Reuses the vector store saved at DB_FAISS_PATH when that path exists and
    builds a new one with create_vector_db() otherwise, then loads the LLM and
    wires both into a "stuff" RetrievalQA chain that uses the custom prompt and
    returns its source documents.

    Returns:
        The QA chain, or None if the vector store could not be created or any
        step raised (the traceback is printed, not propagated).
    """
    try:
        if not os.path.exists(DB_FAISS_PATH):
            print("--- Vector store not found. Creating a new one. ---")
            vector_store = create_vector_db()
            if vector_store is None:
                return None
        else:
            vector_store = _load_saved_store()

        answer_llm = _build_llm()
        qa_prompt = _build_prompt()

        # Plug the custom prompt into the QA chain.
        return RetrievalQA.from_chain_type(
            llm=answer_llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever(),
            chain_type_kwargs={"prompt": qa_prompt},
            return_source_documents=True,
        )
    except Exception:
        print("\n--- AN ERROR OCCURRED WHILE SETTING UP THE QA CHAIN ---")
        traceback.print_exc()
        return None


In [40]:
# Build the QA chain once; later cells reuse `qa_chain` (None means setup failed).
qa_chain = setup_qa_chain()

if not qa_chain:
    print("\nFAILED: The system could not be started. Please check the error messages above.")
else:
    banner = "=" * 50
    print("\n" + banner)
    print("AI Question Answering System is Ready!")
    print(banner + "\n")


--- Loading existing vector store ---
--- Vector store loaded successfully ---
--- Starting: Setting up LLM (this can take a long time on the first run) ---


Device set to use cpu


--- Finished: LLM setup complete ---

AI Question Answering System is Ready!



In [None]:
# Ask one question against the QA chain built above and print the answer, the
# time it took, and the distinct source files the retriever pulled context from.
# Change the query below
query = ""

if not qa_chain:
    print("Error: The QA chain was not initialized. Cannot run the Q&A session.")
elif not query.strip():
    # An empty question would still trigger retrieval plus a slow generation
    # pass and can only produce a meaningless answer, so stop early.
    print("Error: The query is empty. Set `query` to your question and re-run this cell.")
else:
    try:
        print("... AI is processing your question ...")
        # perf_counter is monotonic, so the duration is immune to clock changes.
        start_time = time.perf_counter()
        result = qa_chain({"query": query})
        elapsed = time.perf_counter() - start_time

        print("\n" + "-"*20)
        print("Question:", query)
        print("\nAI Answer:")
        # Strip first so that a whitespace-only answer counts as empty.
        answer = ((result or {}).get("result") or "").strip()
        if answer:
            print(answer)
        else:
            print("The AI could not generate an answer. The received result was empty.")
            print("Raw result:", result)

        print(f"\n(Answer generated in {elapsed:.2f} seconds)")

        print("\n--- Source Documents Used ---")
        # De-duplicate via a set, then sort so the listing is stable across runs.
        source_docs = (result or {}).get("source_documents") or []
        source_files = sorted({doc.metadata.get('source', 'N/A') for doc in source_docs})

        for file_path in source_files:
            print(f"-> File: {file_path}")

    except Exception:
        print("\n--- AN ERROR OCCURRED WHILE ANSWERING THE QUESTION ---")
        traceback.print_exc()


... AI is processing your question ...

--------------------
Question: What is islam about?

AI Answer:
[

(Answer generated in 65.13 seconds)

--- Source Documents Used ---
-> File: output\cleaned_text\معالم السنة النبوية -.txt
