# 1. Import libraries

In [58]:
from langchain_unstructured import UnstructuredLoader
from unstructured.cleaners.core import clean_extra_whitespace
import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter

### test boto3 load unstructred

file s3 -> download temorary file -> load w/ unstructured

In [3]:
a = '{"S3_KEY": "key", "fileID": "file_id"}'
import json
data = json.loads(a)
print(data)
print(type(data))
print("hello")

{'S3_KEY': 'key', 'fileID': 'file_id'}
<class 'dict'>
hello


In [2]:
from dataclasses import dataclass

@dataclass
class MessagePayload:
    storage_key: str
    file_id: str 

b = '{"storage_key": "123456", "file_id": "default_file_id"}'
payload = json.loads(b, object_hook=lambda d: MessagePayload(**d))
print(payload)
print(type(payload))

MessagePayload(storage_key='123456', file_id='default_file_id')
<class '__main__.MessagePayload'>


# 2. Load, Modify and Split documents

In [59]:
file_paths = [
    "Data/AttentionIsAllYouNeed.pdf",
]
loader = UnstructuredLoader(file_paths, chunking_strategy="by_title", strategy="auto")
docs = loader.load()
print(f"Loaded {len(docs)} documents.")

Loaded 112 documents.


In [60]:
UNNESSECARIES = ["coordinates", "file_directory", "filename", "languages", "last_modified", "filetype", "category", "element_id"]
for doc in docs:
    doc.metadata["uuid"] = uuid.uuid4().__str__()
    for key in UNNESSECARIES:
        if key in doc.metadata:
            del doc.metadata[key]
print(f"Processed {len(docs)} documents from {docs[0].metadata['source']}.")

Processed 112 documents from Data/AttentionIsAllYouNeed.pdf.


In [61]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, strip_whitespace=True)
texts = text_splitter.split_documents(docs)
print(f"Splitted into {len(texts)} chunks.")

Splitted into 112 chunks.


In [62]:
print(type(texts[0]))

<class 'langchain_core.documents.base.Document'>


In [63]:
with open("texts.txt", "w", encoding="utf-8") as f:
    for i in texts:
        f.write(f"{i.metadata}\n")
        f.write(f"{i.page_content}\n")
        f.write("\n" + "-" * 80 + "\n\n")

In [64]:
print(len(texts))
print(len(docs))

112
112


# 3. Upsert Vector Database

### Qdrant Cloud and Ollama

In [12]:
from qdrant_client import QdrantClient, models
from langchain_ollama import OllamaEmbeddings

# Cấu hình embedding bằng Ollama (mô hình BGE-M3)
embedding_model = OllamaEmbeddings(model="bge-m3")

# Danh sách văn bản và metadata kèm theo
texts = [
    "Qdrant has Langchain integrations",
    "Qdrant also has Llama Index integrations"
]
payloads = [
    {"source": "Langchain-docs"},
    {"source": "LlamaIndex-docs"}
]
ids = [42, 2]

# Encode văn bản thành vector bằng embedding từ Ollama
vectors = embedding_model.embed_documents(texts)

# Kết nối tới Qdrant Cloud
client = QdrantClient(
    url="https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.n5cMRHuvjC9m7W1CwO3wZA-3kOH4os-J21SrhSeW7JQ",
    prefer_grpc=False,
)

# Tạo collection nếu chưa có
client.recreate_collection(  # Sử dụng recreate để đảm bảo làm mới
    collection_name="my_collection",
    vectors_config=models.VectorParams(
        size=len(vectors[0]),  # Tự động lấy dimension từ vector
        distance=models.Distance.COSINE
    )
)

# Upload vector + payload
client.upsert(
    collection_name="my_collection",
    points=[
        models.PointStruct(
            id=ids[i],
            vector=vectors[i],
            payload={"text": texts[i], **payloads[i]}
        ) for i in range(len(texts))
    ]
)

# Truy vấn tìm kiếm
query = "integration with LangChain"
query_vector = embedding_model.embed_query(query)

search_result = client.search(
    collection_name="my_collection",
    query_vector=query_vector,
    limit=2
)

# In kết quả
for i, hit in enumerate(search_result):
    print(f"\n📌 Kết quả {i + 1}:")
    print("Score:", hit.score)
    print("Payload:", hit.payload)


  client.recreate_collection(  # Sử dụng recreate để đảm bảo làm mới
  search_result = client.search(



📌 Kết quả 1:
Score: 0.73782766
Payload: {'text': 'Qdrant has Langchain integrations', 'source': 'Langchain-docs'}

📌 Kết quả 2:
Score: 0.55920184
Payload: {'text': 'Qdrant also has Llama Index integrations', 'source': 'LlamaIndex-docs'}


In [19]:
#Print vector shape
print(f"Vector shape: {len(vectors[2])}")
print(f"Number of vectors: {len(vectors)}")

IndexError: list index out of range

In [None]:
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
import config


def init_qdrant(texts=None,
        collection_name="DocumentsControl3", 
        embeddings=OllamaEmbeddings(model="bge-m3:latest"), 
        url=config.url, 
        api_key = config.api_key
    ):
    """Initialize Qdrant Vector Store with existing collection or create a new one.
    Args:
        collection_name (str): Name of the Qdrant collection.
        embeddings (OllamaEmbeddings): Embedding model to use.
        url (str): URL of the Qdrant instance.
        api_key (str): API key for Qdrant access.
        
    Returns:
        QdrantVectorStore: Initialized Qdrant vector store."""
    if texts==None:
        qdrant = QdrantVectorStore.from_existing_collection(embedding=embeddings,
                                                            url=url,
                                                            prefer_grpc=False,
                                                            api_key=api_key,
                                                            collection_name=collection_name)
        print(f"Existed Collection: **{collection_name}** Initialized Successfully.")
    else:
        qdrant = QdrantVectorStore.from_documents(texts=texts,
                                                  embeddings=embeddings,
                                                  url=url,
                                                  prefer_grpc=False,
                                                  api_key=api_key,
                                                  collection_name=collection_name)
        print(f"Created **{collection_name}** Successfully.")
    return qdrant
qdrant = init_qdrant()

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"
INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Existed Collection: **DocumentsControl3** Initialized Successfully.


### Quẻry

In [66]:
results = qdrant.similarity_search("what is self-attention", k=2)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


* Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22]. [{'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}]
* 4 Why Self-Attention [{'source': 'D:

In [None]:
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 5})
a = retriever.invoke("what is self-attention")

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


In [68]:
a

[Document(metadata={'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}, page_content='Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22].'),
 Docu

# 4. Return

In [69]:
ret = retriever.invoke("what is self-attention")

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


In [70]:
for i in ret:
    print(f"* [{i.metadata}] \n {i.page_content} \n\n")

* [{'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}] 
 Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22]. 


* [{'source': 'D:\\Projects\\docu

# 5. Chat 

In [None]:
from google import genai
from google.genai import types
client = genai.Client(api_key="")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0) # Disables thinking
    ),
)
print(response.text)

TypeError: 'Models' object is not callable

In [71]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

template = """Answer the user's question based on the provided context.

-----------
{{context}}
-----------

Question: {question}

Remember: 
- If the question is not related to the context, you should answer "Hmmm... I'm not sure".
"""

prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model="qwen2.5:1.5B")

chain = prompt | model

chain.invoke({"question": "Lang Chain là gì"})

INFO: HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


'LangChain là một mô hình mạng lưới của các mô hình ngôn ngữ được xây dựng để chia sẻ và chia sẻ thông tin một cách hiệu quả.'

In [5]:
from langchain_core.prompts import ChatPromptTemplate
# Thay thế OllamaLLM bằng ChatGoogleGenerativeAI
from langchain_google_genai import ChatGoogleGenerativeAI
import os

# Đặt API Key của bạn vào biến môi trường
# Hoặc cấu hình genai.configure() trước khi chạy
os.environ["GOOGLE_API_KEY"] = "AIzaSyCnc1PCwmvB2NPkULOcizOpzD2Qqaj1MmI" 

template = """Trả lời câu hỏi của người dùng dựa trên ngữ cảnh được cung cấp.

-----------
{context}
-----------

Câu hỏi: {question}

Lưu ý: 
- Nếu câu hỏi không liên quan đến ngữ cảnh, bạn nên trả lời "Hmmm... Tôi không chắc chắn".
"""

prompt = ChatPromptTemplate.from_template(template)

# Sử dụng ChatGoogleGenerativeAI thay vì OllamaLLM
# Bạn có thể chỉ định model là "gemini-pro" hoặc "gemini-pro-vision" hoặc "gemini-flash"
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash") 

# Nếu bạn muốn truyền context rỗng (như ví dụ gốc của bạn không có context)
# bạn có thể tạo một chuỗi đơn giản
chain = prompt | model

# Tạo một context mẫu để minh họa
context_data = """LangChain là một framework mã nguồn mở được thiết kế để giúp các nhà phát triển xây dựng các ứng dụng dựa trên mô hình ngôn ngữ lớn (LLM). Nó cung cấp các công cụ để kết nối LLM với các nguồn dữ liệu bên ngoài, cho phép tạo ra các ứng dụng mạnh mẽ và có khả năng tương tác cao. LangChain hỗ trợ nhiều LLM khác nhau và cung cấp các thành phần để xây dựng chuỗi logic phức tạp."""

response = chain.invoke({"question": "LangChain là gì?", "context": context_data})

print(response.content)

# Ví dụ với câu hỏi không liên quan đến context
response_irrelevant = chain.invoke({"question": "Thủ đô của Pháp là gì?", "context": context_data})
print(response_irrelevant.content)

LangChain là một framework mã nguồn mở được thiết kế để giúp các nhà phát triển xây dựng các ứng dụng dựa trên mô hình ngôn ngữ lớn (LLM).
Hmmm... Tôi không chắc chắn.


# 6. Boto3


/home/hung/projects/myenv/lib/python3.10/site-packages/langchain_community/document_loaders/s3_file.py

In [None]:
from langchain_community.document_loaders import S3FileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
#Load documents from S3 bucket
import uuid
print("Loading documents from S3...")

UUID1 = uuid.uuid4().__str__()
loader = S3FileLoader(bucket="bibox-bucket", 
                        key="123456", 
                        id=UUID1,
                        aws_access_key_id="test", 
                        aws_secret_access_key="test", 
                        endpoint_url="http://localhost:4566",
                        chunking_strategy="by_title", strategy="auto",
                        chunk_size=100, chunk_overlap=200
)

documents = loader.load_and_split()
#add uuid to each document metadata



print("-"*80)
print(f"Loaded {len(documents)} documents from S3.")
# Print the first document's content
print("-"*80)
print("First document content:", documents[0].page_content[:100])  # Print first 100 characters





Loading documents from S3...
--------------------------------------------------------------------------------
Loaded 2 documents from S3.
--------------------------------------------------------------------------------
First document content: Tuyệt vời! Đoạn output bạn cung cấp cho thấy UnstructuredLoader đã hoạt động thành công và trích xuấ


In [73]:
for doc in documents:
    print(doc.metadata)


{'source': 's3://bibox-bucket/123456'}
{'source': 's3://bibox-bucket/123456'}


In [74]:
import tempfile  
import os
import boto3 
from langchain_unstructured import UnstructuredLoader
s3 = boto3.client('s3',
                    region_name='us-east-1',
                    aws_access_key_id='test',
                    aws_secret_access_key="test",
                    endpoint_url="http://localhost:4566")

bucket_name = 'bibox-bucket'
s3_key = '123456'

with tempfile.TemporaryDirectory() as temp_dir:
    file_path = f"{temp_dir}/{s3_key}"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    s3.download_file(bucket_name, s3_key, file_path)
    print(f"Downloaded to {file_path}")
    
    loader = UnstructuredLoader(file_path, chunking_strategy="by_title", strategy="auto")
    docs = loader.load()
    print(f"Loaded {len(docs)} documents.")
    print(f"First document:\n{docs[0].metadata},\n, {docs[0].page_content[:100]}...")
    #only keep "source" key in metadata, delete others, add UUID
    UUID1 = uuid.uuid4().__str__()
    for doc in docs:
        keys_to_remove = [key for key in doc.metadata if key != "source"]
        for key in keys_to_remove:
            del doc.metadata[key]
        doc.metadata["uuid"] = UUID1
    print("\n\n", "-"*80)
    print(f"First document:\n{docs[0].metadata},\n, {docs[0].page_content[:100]}...")
    


Downloaded to /tmp/tmppbza86bw/123456
Loaded 13 documents.
First document:
{'source': '/tmp/tmppbza86bw/123456', 'file_directory': '/tmp/tmppbza86bw', 'filename': '123456', 'languages': ['eng'], 'last_modified': '2025-07-09T16:11:41', 'page_number': 1, 'orig_elements': 'eJzVV02P2zYQ/SusLkkB0+U3JV8TFCgQBAG6yWWzMPhpCbAlQ6Y26wT97yUp2WsnbrMXB/DBMt9wSFHz3gzJ+2+FW7uNa8OyscUCFBIpgpmxkHmD4kN6WGmJIEKa48qSUpqqmIFi44KyKqg45lthuq63TauC22W8VvtuCMvaNas6RIusyBzFQZP9S2NDHc0Cj+Zt17Qhjby/l9ki2Zw9zMCEMKJzmiCn5Vz8iLN3xMVuvwtuk77iQ/Pk1n9vlXHFP7HDN2u3tE3vTOj6fXL4I2y26bfVX1Up9Jdi8mrVxqV+TCjjoshrbleDWuUvuy9cuyoesnUXlpvONr5xOW4EEQ6RhKi6w2KB8YLhNHobRy7bYaNdH71wWkxwTykmxd2w/zxgZ2QAj6lhbfMb+DwgjFGXsMItiNHaDgHoAzZDuwImI74Fpu5AqEe0Bx/bXegHE4be2Xedsq4fZ8PxiRwF9TRrOJrjO6s4X5oieqC2TlMj5Fk0Po42EPrcsKYGT8P4qgDacWwDbFpPSMjp8d/ILfjw9s9xkU6q49rnKRxhv83h/diaKJZV1zdfnb1LAYmR+V6KpdFYSqUgto5Dxh2ClRUSOkZxSTEWiomrSzG+ZhTbBEk5Qsr5nF/A2f32xPgms4zNqTZwkyhFWKPMJY1Uv+3MkBh6Vtax14CmBb2agU0eLJufTGWycF2lFqfCuGvC2l0Sg5KcoigHKKPwIKOOQlUqD

In [75]:
from pathlib import Path
import tempfile
import boto3
import uuid
from langchain_unstructured import UnstructuredLoader
from typing import Any, Callable, Optional, Dict

class S3DocumentLoader:
    def __init__(
        self, 
        bucket: str, 
        key: str, 
        *,

        ## Optional parameters for S3 connection and document processing
        region_name: Optional[str] = "us-east-1",
        aws_access_key_id: Optional[str] = "test",
        aws_secret_access_key: Optional[str] = "test",
        endpoint_url: str = "http://localhost:4566",

        ## Parameters for document loading and splitting
        chunking_strategy: Optional[str] = "by_title",
        strategy: Optional[str] = "auto",
        max_characters: Optional[int] = 1000,
        overlap: Optional[int] = 200,

        ## Optional function to add custom metadata to documents
        # The function should accept a dictionary and return a dictionary
        # This allows for flexible metadata handling, such as adding custom keys or values
        # If None, no additional metadata will be added
        add_to_metadata: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None
        ):

        """Initialize the S3DocumentLoader with bucket and key."""

        self.bucket = bucket
        self.key = key
        self.uuid = str(uuid.uuid4())
        self.region_name = region_name
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        self.chunking_strategy = chunking_strategy
        self.strategy = strategy
        self.max_characters = max_characters
        self.overlap = overlap
        self.add_to_metadata = add_to_metadata
        self.s3 = boto3.client(
            "s3",
            region_name=self.region_name,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            endpoint_url=endpoint_url,
        )
        self.docs = []

    def download(self, temp_dir: str):
        self.local_path = Path(temp_dir) / Path(self.key).name
        self.local_path.parent.mkdir(parents=True, exist_ok=True)
        self.s3.download_file(self.bucket, self.key, str(self.local_path))
        print(f"✅ Downloaded: s3://{self.bucket}/{self.key} → {self.local_path}")

    def load_and_split(self):
        loader = UnstructuredLoader(
            str(self.local_path),
            chunking_strategy=self.chunking_strategy,
            strategy=self.strategy,
            max_characters=self.max_characters, overlap=self.overlap, 
        )
        self.docs = loader.load()

        print(f"✅ Loaded {len(self.docs)} documents.")
        

    def clean_metadata(self):
        for doc in self.docs:
            doc.metadata = {
                "uuid": self.uuid,
                "source": doc.metadata.get("source", f"s3://{self.bucket}/{self.key}"),
            }

            if self.add_to_metadata:
                custom_metadata = self.add_to_metadata(doc.metadata.copy())
                if custom_metadata and isinstance(custom_metadata, dict):
                    doc.metadata.update(custom_metadata)
                else:
                    print(f"⚠️ Warning: add_to_metadata did not return a dictionary or returned None for doc {doc.metadata.get('source')}.")

    def preview(self, n: int = 1):
        for doc in self.docs[:n]:
            print("-" * 80)
            print("📄 Metadata:", doc.metadata)
            print("📄 Content preview:", doc.page_content[:100], "...\n")

    def run(self):
        with tempfile.TemporaryDirectory() as tmp:
            self.download(tmp)
            self.load_and_split()
            self.clean_metadata()
            self.preview(3)


# 👇 Sử dụng class
if __name__ == "__main__":
    loader = S3DocumentLoader(bucket="bibox-bucket", key="123456", add_to_metadata=lambda metadata: {"custom_key": "custom_value"})
    loader.run()


✅ Downloaded: s3://bibox-bucket/123456 → /tmp/tmpw_ooo9zk/123456
✅ Loaded 7 documents.
--------------------------------------------------------------------------------
📄 Metadata: {'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123456', 'custom_key': 'custom_value'}
📄 Content preview: Tuyệt vời! Đoạn output bạn cung cấp cho thấy UnstructuredLoader đã hoạt động thành công và trích xuấ ...

--------------------------------------------------------------------------------
📄 Metadata: {'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123456', 'custom_key': 'custom_value'}
📄 Content preview: o

links: (Trong một số trường hợp) Các liên kết được tìm thấy trong văn bản.

page_content: Nội dun ...

--------------------------------------------------------------------------------
📄 Metadata: {'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123456', 'custom_key': 'custom_value'}
📄 Content preview: được trích xuất c

In [76]:
loader.docs

[Document(metadata={'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123456', 'custom_key': 'custom_value'}, page_content="Tuyệt vời! Đoạn output bạn cung cấp cho thấy UnstructuredLoader đã hoạt động thành công và trích xuất nội dung từ tệp PDF của bạn.\n\nCác đối tượng Document được in ra, mỗi đối tượng chứa:\n\nmetadata: Chứa thông tin chi tiết về đoạn văn bản được trích xuất, bao gồm:\n\no source: Đường dẫn đến tệp PDF.\n\no coordinates: Vị trí của đoạn văn bản trên trang (trong hệ thống PixelSpace).\n\no file_directory, filename: Thông tin về tệp.\n\no\n\nlanguages: Ngôn ngữ được phát hiện (ở đây là 'eng').\n\no\n\nlast_modified: Thời gian sửa đổi lần cuối của tệp.\n\no page_number: Số trang mà đoạn văn bản được trích xuất từ đó.\n\no filetype: Loại tệp (application/pdf).\n\no category: Loại phần tử được unstructured phân loại (ví dụ: UncategorizedText, NarrativeText, Title, Footer).\n\no element_id: Một ID duy nhất cho phần tử.\n\no parent_id: Nếu là một