## Extract PDF

In [1]:
from typing import Iterator, Union
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):
    """LangChain document loader that converts files to Markdown with Docling.

    Every source handed to the constructor is run through a shared
    ``DocumentConverter`` and emitted as one LangChain document whose
    ``page_content`` is the Markdown export of the converted file.
    """

    def __init__(self, file_path: Union[str, list[str]]) -> None:
        """Store the sources to convert and create the converter.

        Args:
            file_path: A single source, or a list of sources. A list is
                stored as-is (not copied); anything else is wrapped in a
                one-element list.
        """
        if isinstance(file_path, list):
            sources = file_path
        else:
            sources = [file_path]
        self._file_paths = sources
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert the sources one at a time, in the order they were given.

        Yields:
            One document per source, carrying only ``page_content``
            (no metadata is attached).
        """
        for pdf_source in self._file_paths:
            conversion = self._converter.convert(pdf_source)
            markdown = conversion.document.export_to_markdown()
            yield LCDocument(page_content=markdown)

In [2]:
# PDF to index. NOTE(review): absolute, machine-specific Windows path;
# point this at a local copy of the file before running on another machine.
path = r"C:\Users\sgsong\Desktop\BKMS2\test\A15-10-15.pdf"

### Load and Split

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking configuration: chunks of at most 1000 units with a 200-unit overlap
# between neighbours (units are whatever the splitter's length function counts;
# presumably characters by default - confirm against the splitter docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Loader for the single PDF configured in `path`.
loader = DoclingPDFLoader(path)

In [4]:
# Convert the configured PDF into LangChain documents (one per source file),
# then cut those documents into overlapping chunks for embedding.
docs = loader.load()
splits = text_splitter.split_documents(docs)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


## Embedding

In [5]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Hugging Face model id of the embedding model; the resulting `embeddings`
# object is what the vector store uses to embed the document chunks.
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)



## VectorStore

In [6]:
from tempfile import TemporaryDirectory

from langchain_milvus import Milvus
import os
# Address of the Milvus server. Set the MILVUS_URI environment variable to
# target another instance; the default is the local server at 127.0.0.1:19530.
MILVUS_URI = os.environ.get("MILVUS_URI", "tcp://127.0.0.1:19530")

# Embed every chunk and index it in Milvus.
# NOTE(review): drop_old=True presumably discards any existing collection of
# the same name on each run, so re-running this cell rebuilds the index from
# scratch - confirm before pointing this at a shared Milvus instance.
vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args={"uri": MILVUS_URI},
    drop_old=True,
)

## LLM

In [17]:
from langchain_huggingface import HuggingFaceEndpoint

# Read the Hugging Face access token from the environment. os.environ.get()
# takes the *name* of a variable, not the token itself, and the token must
# never be pasted into the notebook (it would be committed with the file).
HF_API_KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("HF_TOKEN")
if not HF_API_KEY:
    # Fail here with an actionable message instead of an opaque
    # "token seems invalid" error from the inference endpoint.
    raise RuntimeError(
        "No Hugging Face token found. Set the HUGGINGFACEHUB_API_TOKEN "
        "(or HF_TOKEN) environment variable before running this cell."
    )
HF_LLM_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)

# Smoke test: a single direct call to the model, without retrieval.
response = llm.invoke("How to navigator?")
print(response)

BadRequestError:  (Request ID: sDnvtsO74Ojy2sld9Bl0O)

Bad request:
Authorization header is correct, but the token seems invalid

## Prompt and RAG Chain

In [None]:
from typing import Iterable

from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: Iterable[LCDocument]):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Documents whose ``page_content`` should be combined.

    Returns:
        The page contents in iteration order, separated by one blank line;
        an empty string when ``docs`` yields nothing.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Retriever over the indexed chunks, using the vector store's default settings.
retriever = vectorstore.as_retriever()

# Prompt with two placeholders: {context} (the retrieved text) and {question}.
# Adjacent literals are concatenated at compile time into a single template.
prompt = PromptTemplate.from_template(
    "Context information is below.\n"
    "---------------------\n"
    "{context}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {question}\n"
    "Answer:\n"
)

# Pipeline: the input question is passed through unchanged as {question} and
# is also sent to the retriever, whose hits are flattened by format_docs into
# {context}; the filled prompt goes to the LLM and the reply is parsed to str.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

## Usage

In [None]:
# Example query answered by the RAG chain (retrieval + LLM).
rag_chain.invoke("various of Navigation?")

"Navigation on this device includes the following:\n\n- l Swipe the screen to unlock the device.\n- l Swipe the screen to scroll through the Home screens or menu options.\n- l Drag an app shortcut to add it to a Home screen.\n- l Drag a widget to place it in a new location.\n\nNavigation can also be achieved by using the navigation buttons. These buttons allow you to navigate back, home, and recent apps.\n\nNote: The answer is based on the given context information and may not be exhaustive or up-to-date. It is recommended to refer to the official user manual or manufacturer's documentation for more information."

In [None]:
# Second example query; the answer is the cell's displayed output.
rag_chain.invoke("What's name of this phone?")

"There is no information provided about the name of this phone. The provided text seems to be a user manual or tutorial for a Samsung phone, but it doesn't mention the specific model or name of the phone. Therefore, it's impossible to answer the query without more information.  # Samsung # Phone # User Manual # Tutorial # Getting Started # Navigation # Touch Screen # Gesture # Swipe # Drag and Drop # Zoom # Touch and Hold # Navigation Bar # Side Button # Accounts # Google Account # Samsung Account # Outlook # Voicemail # Getting Started # Navigation Bar # Side Button # Accounts # Google Account # Samsung Account # Outlook # Voicemail # Getting Started # Navigation Bar # Side Button # Accounts # Google Account # Samsung Account # Outlook # Voicemail # Getting Started # Navigation Bar # Side Button # Accounts # Google Account # Samsung Account # Outlook # Voicemail # Getting Started # Navigation Bar # Side Button # Accounts # Google Account # Samsung Account # Outlook # Voicemail # Getti

In [None]:
import os

def _load_files_from_directories(directories) -> list[str]:
    """
    Recursively collect the paths of all files under the given directories.

    Progress is reported with ``print``: one line per directory searched, one
    line per file found, and one line for every path that is skipped because
    it is not an existing directory.

    Args:
        directories (list[str] | str): Folder paths to search. A single path
            (``str`` or ``os.PathLike``) is treated as a one-element list.

    Returns:
        list[str]: Paths of the files found, grouped by input directory in the
        order given; within each directory the walk is top-down with
        sub-directories and file names visited in sorted order, so the result
        is deterministic.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(directories, (str, os.PathLike)):
        directories = [directories]

    all_files = []
    for directory in directories:
        print(f"Loading files from {directory}")
        if not os.path.isdir(directory):
            # os.walk() swallows the error for a missing path and yields
            # nothing, which would make a typo look like an empty folder.
            print(f"{directory} is not a directory, skipped")
            continue
        for root, subdirs, files in os.walk(directory):
            # Sorting in place steers the order in which os.walk descends.
            subdirs.sort()
            for file in sorted(files):
                all_files.append(os.path.join(root, file))
                print(f"{file} added")
    return all_files

# Directories to load files from.
# NOTE(review): these are absolute, user-specific paths; on any other machine
# they will not exist and the walk below will find no files - adjust them
# to the local checkout before running.
directories = ["/Users/jyp/Documents/GitHub/BKMS2/apple", "/Users/jyp/Documents/GitHub/BKMS2/samsung"]
files = _load_files_from_directories(directories)



Loading files from /Users/jyp/Documents/GitHub/BKMS2/apple
Loading files from /Users/jyp/Documents/GitHub/BKMS2/samsung
