In [1]:
import os
from langchain_groq import ChatGroq #model llm, bisa open ai, gemini, bison, open llama, groq, nvidia. kl rag, disarankan pake API yg open source
from langchain_community.embeddings import HuggingFaceEmbeddings #pake embedding, membuat vektor yg akan disimpan dlm vector db
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain #u/ processing text nya, next level ada namanya textsplitter, etc
from langchain_core.prompts import ChatPromptTemplate #prompt template krn kasus spesifik
from langchain.chains import create_retrieval_chain #chain retrieval , dari user input sampai jd output
from langchain_community.vectorstores import FAISS #chromadb, astradb, cosmosdb, FAISS (vector db)
from langchain_community.document_loaders import PyPDFDirectoryLoader #karena datanya pdf yg akan diload
# from sentence_transformers import SentenceTransformer
# from langchain_huggingface import HuggingFaceEmbeddings


import time
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

True

In [2]:
# inisialisasi model
def init_groq():
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        print("❌ GROQ_API_KEY not found!")
        return None
    
    try:
        return ChatGroq(
            groq_api_key=groq_api_key,
            model_name="mixtral-8x7b-32768",
            temperature=0,
            max_tokens=4000 #pembatasan karakter yg akan dijadikan jawaban 4000 * 0.005*30*24 (biayanya)
        )
    except Exception as e:
        print(f"Error initializing Groq: {str(e)}")
        return None

# Initialize Groq
llm = init_groq()

In [None]:
def create_vector_embedding():
    try:
        print("Creating vector embeddings...")
        
        # periksa apakah directory sudah ada ?
        if not os.path.exists("kuhp"): #nama folder
            os.makedirs("kuhp",exist_ok=True)
            print("📁 Created 'kuhp' directory. Please add your PDF files there.")
            return None, None
        
        # Initialize embeddings
        embeddings = HuggingFaceEmbeddings(
            model_name= "all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}, #kl punya gpu, pake gpu
            encode_kwargs={
                'normalize_embeddings': True, #dijadikan 0-1
                'batch_size': 32 #per batch akan memasukkan 32 data (bisa di tunning/diubah2)
            }
        )
        
        # Load documents
        loader = PyPDFDirectoryLoader("kuhp")
        docs = loader.load()
        
        if not docs:
            print("❌ No documents found in directory!")
            return None, None
        
        print(f"Found {len(docs)} documents")
        
        # Split documents
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, #1000 karakter. chunk kl misal di doc tersbut ada kata "saya suka makan nasi goreng di psar minggu" berarti itu 1 chunk
            chunk_overlap=100 #100 karakter sebelumnya. 
        ) #chunk untuk tunning. 
        final_documents = text_splitter.split_documents(docs) #load data ke dalam vektor db
        
        print(f"Created {len(final_documents)} document chunks")
        
        # membuat vector store, dimasukkan ke dalam vectornya. vectornya kita pake faiss
        vectors = FAISS.from_documents(
            final_documents,
            embeddings
        )
        
        print("✅ Vector store created successfully!")
        return vectors, docs
        
    except Exception as e:
        print(f"Error during embedding creation: {str(e)}")
        return None, None

In [4]:
# Setup Prompt
prompt = ChatPromptTemplate.from_template("""
You are a helpful assistant for analyzing legal documents in PDF format. Answer the questions based solely on the provided context.
Focus on providing accurate, clear, and detailed responses specific to Indonesian legal terminology and structure.
Additionally, if a part of the document is ambiguous, explain your reasoning for any assumptions made.

<context>
{context}
</context>

Question: {input}
""")

In [5]:
def process_query(user_prompt, vectors, llm, prompt):
    # periksa apakah vector dan llm sudah ada
    if vectors is None:
        print("❌ Vector store not initialized! Please create embeddings first.")
        return None
    
    if llm is None:
        print("❌ Language model not initialized! Please check your GROQ_API_KEY.")
        return None
        
    try:
        # Create chains
        document_chain = create_stuff_documents_chain(llm, prompt)
        retriever = vectors.as_retriever(
            search_kwargs={"k": 10} #ambil 10 yang similar
        )
        retrieval_chain = create_retrieval_chain(retriever, document_chain)
        
        # Get response
        start = time.process_time() #u/ itung waktu
        response = retrieval_chain.invoke({'input': user_prompt})
        response_time = time.process_time() - start
        
        print(f"⏱️ Response time: {response_time:.2f} seconds")
        print("\nAnswer:")
        print(response['answer'])
        
        print("\nRelated Document Excerpts:")
        for i, doc in enumerate(response['context']):
            print(f"\nDocument {i+1}:")
            print(doc.page_content)
            if hasattr(doc, 'metadata') and doc.metadata:
                print(f"Source: {doc.metadata.get('source', 'Unknown')}") #kl tidak match maka unknown, chatbot akan ngomong dia tdk paham dgn yg akan dimaksud
            print("-" * 50)
            
        return response
        
    except Exception as e:
        print(f"Error processing query: {str(e)}")
        return None

In [6]:
vectors, docs = create_vector_embedding()

# response = process_query(user_prompt, vectors, llm, prompt)

Creating vector embeddings...
Error during embedding creation: name 'SentenceTransformer' is not defined


In [7]:
user_prompt = "JELASKAN BAB 1 RUANG LINGKUP BERLAKUNYA KETENTUAN PERATURAN PERUNDANG UNDANGAN PIDANA Bagian Kesatu Menurut Waktu pada bagian pasal 1, jelaskan point pointnya"
response = process_query(user_prompt, vectors, llm, prompt)

❌ Vector store not initialized! Please create embeddings first.
