In [22]:
import os
from typing import Any, List, Optional
from langchain_core.language_models.llms import LLM
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever, ContextualCompressionRetriever
from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_classic.chains import RetrievalQA
from langchain_classic.schema import Document
from byteplussdkarkruntime import Ark
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain_astradb import AstraDBVectorStore

In [13]:
# ==========================================
# 1. WRAPPER BYTEPLUS
# ==========================================
load_dotenv()
api_key = os.getenv("ARK_API_KEY")
endpoint_id = os.getenv("MODEL_ENDPOINT_ID")

class BytePlusLLM(LLM):
    api_key: str
    endpoint_id: str
    client: Any = None

    def __init__(self, api_key, endpoint_id):
        super().__init__(api_key=api_key, endpoint_id=endpoint_id)
        self.client = Ark(
            api_key=api_key,
            base_url="https://ark.ap-southeast.bytepluses.com/api/v3"
        )

    @property
    def _llm_type(self) -> str:
        return "byteplus_ark"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        try:
            completion = self.client.chat.completions.create(
                model=self.endpoint_id,
                messages=[
                    {"role": "system", "content": "Jawab dengan singkat dan jelas berdasarkan konteks."},
                    {"role": "user", "content": prompt}
                ]
            )
            return completion.choices[0].message.content
        except Exception as e:
            return f"Error API: {e}"


In [14]:
open_ai_api_key = os.getenv("OPENAI_API_KEY")

In [17]:
def smartChunkProcessor(file_path, chunk_size=1000, chunk_overlap=200) -> List[Document]:
    """
    Split text into chunks with overlap for better context retention.
    """
    docs = PyMuPDFLoader(file_path)
    document = docs.load()

    ## text Splitter
    embeddings = OpenAIEmbeddings(api_key = open_ai_api_key, model="text-embedding-3-small")

    text_splitter = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_amount=0.9, # Example value
        min_chunk_size=100
    )

    chunks = text_splitter.split_documents(document)
    return chunks

In [18]:
chunks = smartChunkProcessor("../data/Aplikasi Web Question Answering Menggunakan Langchain OpenAI.pdf")

In [19]:
chunks

[Document(metadata={'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2024-12-11T15:20:15+07:00', 'source': '../data/Aplikasi Web Question Answering Menggunakan Langchain OpenAI.pdf', 'file_path': '../data/Aplikasi Web Question Answering Menggunakan Langchain OpenAI.pdf', 'total_pages': 12, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-12-11T15:20:15+07:00', 'trapped': '', 'modDate': "D:20241211152015+07'00'", 'creationDate': "D:20241211152015+07'00'", 'page': 0}, page_content='Journal of Computer System and Informatics (JoSYC)  \nISSN 2714-8912 (media online), ISSN 2714-7150 (media cetak) \nVolume 6, No. 1, November 2024, Page 293-304 \nhttps://ejurnal.seminar-id.com/index.php/josyc \nDOI 10.47065/josyc.v6i1.6182 \n \nCopyright © 2024 Author, Page 293  \nThis Journal is licensed under a Creative Commons Attribution 4.0 International License \n Aplikasi Web Question Answering Menggunakan Langchain O

In [24]:
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")

sparse_retriever = BM25Retriever.from_documents(chunks)
sparse_retriever.k = 5
embeddings = OpenAIEmbeddings(api_key = open_ai_api_key, model="text-embedding-3-small")

vectorstore = AstraDBVectorStore(
    embedding= embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    collection_name="ScholarSyncV1",
    namespace=None
)

dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

hybrid_retriever = EnsembleRetriever(
    retrievers=[sparse_retriever, dense_retriever],
    weights=[0.5, 0.5]
)

In [25]:
model_reranker = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")

# Wrapper Reranker
compressor = CrossEncoderReranker(model=model_reranker, top_n=3) # Hanya ambil TOP 3 terbaik setelah reranking

# Final Retriever: Hybrid -> Reranking
final_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=hybrid_retriever
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [36]:
llm = BytePlusLLM(api_key=api_key, endpoint_id=endpoint_id)


In [41]:
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a scholarly assistant. Answer the question based ONLY on the context provided. If you don't know, say so.\n\nContext:\n{context}"),
    ("user", "{input}")
])

In [42]:
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(final_retriever, question_answer_chain)


In [43]:
response = rag_chain.invoke({
    "input" : "bagaimana membuat chatbot dengan langchain"
})
response

{'input': 'bagaimana membuat chatbot dengan langchain',
 'context': [Document(metadata={'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2024-12-11T15:20:15+07:00', 'source': '../data/Aplikasi Web Question Answering Menggunakan Langchain OpenAI.pdf', 'file_path': '../data/Aplikasi Web Question Answering Menggunakan Langchain OpenAI.pdf', 'total_pages': 12, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-12-11T15:20:15+07:00', 'trapped': '', 'modDate': "D:20241211152015+07'00'", 'creationDate': "D:20241211152015+07'00'", 'page': 1}, page_content='Penilitian ini juga melakukan \npendekatan seperti yang dilakukan oleh Arjun Pesaru, dkk yakni Pendekatan memanfaatkan kekuatan LangChain \ndan Model Large Language Models (LLM) untuk membuat chatbot yang dapat menjawab pertanyaan tentang \nberkas PDF [8].'),
  Document(metadata={'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creation

In [44]:
response = rag_chain.invoke({
    "input" : "Bagaimana chatbot bisa menjawab pertanyaan tentang berkas pdf?"
})
response['answer']

'Berdasarkan konteks yang disediakan, chatbot dapat menjawab pertanyaan tentang berkas PDF dengan memanfaatkan kekuatan framework LangChain dan Model Large Language Models (LLM).'

In [None]:
vectorstore.