# 1. Import libraries

In [58]:
from langchain_unstructured import UnstructuredLoader
from unstructured.cleaners.core import clean_extra_whitespace
import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter

### test boto3 load unstructred

file s3 -> download temorary file -> load w/ unstructured

In [3]:
a = '{"S3_KEY": "key", "fileID": "file_id"}'
import json
data = json.loads(a)
print(data)
print(type(data))
print("hello")

{'S3_KEY': 'key', 'fileID': 'file_id'}
<class 'dict'>
hello


In [2]:
from dataclasses import dataclass

@dataclass
class MessagePayload:
    storage_key: str
    file_id: str 

b = '{"storage_key": "123456", "file_id": "default_file_id"}'
payload = json.loads(b, object_hook=lambda d: MessagePayload(**d))
print(payload)
print(type(payload))

MessagePayload(storage_key='123456', file_id='default_file_id')
<class '__main__.MessagePayload'>


# 2. Load, Modify and Split documents

In [59]:
file_paths = [
    "Data/AttentionIsAllYouNeed.pdf",
]
loader = UnstructuredLoader(file_paths, chunking_strategy="by_title", strategy="auto")
docs = loader.load()
print(f"Loaded {len(docs)} documents.")

Loaded 112 documents.


In [60]:
UNNESSECARIES = ["coordinates", "file_directory", "filename", "languages", "last_modified", "filetype", "category", "element_id"]
for doc in docs:
    doc.metadata["uuid"] = uuid.uuid4().__str__()
    for key in UNNESSECARIES:
        if key in doc.metadata:
            del doc.metadata[key]
print(f"Processed {len(docs)} documents from {docs[0].metadata['source']}.")

Processed 112 documents from Data/AttentionIsAllYouNeed.pdf.


In [61]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, strip_whitespace=True)
texts = text_splitter.split_documents(docs)
print(f"Splitted into {len(texts)} chunks.")

Splitted into 112 chunks.


In [62]:
print(type(texts[0]))

<class 'langchain_core.documents.base.Document'>


In [63]:
with open("texts.txt", "w", encoding="utf-8") as f:
    for i in texts:
        f.write(f"{i.metadata}\n")
        f.write(f"{i.page_content}\n")
        f.write("\n" + "-" * 80 + "\n\n")

In [64]:
print(len(texts))
print(len(docs))

112
112


# 3. Upsert Vector Database

### Qdrant Cloud and Ollama

In [12]:
from qdrant_client import QdrantClient, models
from langchain_ollama import OllamaEmbeddings

# C·∫•u h√¨nh embedding b·∫±ng Ollama (m√¥ h√¨nh BGE-M3)
embedding_model = OllamaEmbeddings(model="bge-m3")

# Danh s√°ch vƒÉn b·∫£n v√† metadata k√®m theo
texts = [
    "Qdrant has Langchain integrations",
    "Qdrant also has Llama Index integrations"
]
payloads = [
    {"source": "Langchain-docs"},
    {"source": "LlamaIndex-docs"}
]
ids = [42, 2]

# Encode vƒÉn b·∫£n th√†nh vector b·∫±ng embedding t·ª´ Ollama
vectors = embedding_model.embed_documents(texts)

# K·∫øt n·ªëi t·ªõi Qdrant Cloud
client = QdrantClient(
    url="https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.n5cMRHuvjC9m7W1CwO3wZA-3kOH4os-J21SrhSeW7JQ",
    prefer_grpc=False,
)

# T·∫°o collection n·∫øu ch∆∞a c√≥
client.recreate_collection(  # S·ª≠ d·ª•ng recreate ƒë·ªÉ ƒë·∫£m b·∫£o l√†m m·ªõi
    collection_name="my_collection",
    vectors_config=models.VectorParams(
        size=len(vectors[0]),  # T·ª± ƒë·ªông l·∫•y dimension t·ª´ vector
        distance=models.Distance.COSINE
    )
)

# Upload vector + payload
client.upsert(
    collection_name="my_collection",
    points=[
        models.PointStruct(
            id=ids[i],
            vector=vectors[i],
            payload={"text": texts[i], **payloads[i]}
        ) for i in range(len(texts))
    ]
)

# Truy v·∫•n t√¨m ki·∫øm
query = "integration with LangChain"
query_vector = embedding_model.embed_query(query)

search_result = client.search(
    collection_name="my_collection",
    query_vector=query_vector,
    limit=2
)

# In k·∫øt qu·∫£
for i, hit in enumerate(search_result):
    print(f"\nüìå K·∫øt qu·∫£ {i + 1}:")
    print("Score:", hit.score)
    print("Payload:", hit.payload)


  client.recreate_collection(  # S·ª≠ d·ª•ng recreate ƒë·ªÉ ƒë·∫£m b·∫£o l√†m m·ªõi
  search_result = client.search(



üìå K·∫øt qu·∫£ 1:
Score: 0.73782766
Payload: {'text': 'Qdrant has Langchain integrations', 'source': 'Langchain-docs'}

üìå K·∫øt qu·∫£ 2:
Score: 0.55920184
Payload: {'text': 'Qdrant also has Llama Index integrations', 'source': 'LlamaIndex-docs'}


In [19]:
#Print vector shape
print(f"Vector shape: {len(vectors[2])}")
print(f"Number of vectors: {len(vectors)}")

IndexError: list index out of range

In [None]:
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
import config


def init_qdrant(texts=None,
        collection_name="DocumentsControl3", 
        embeddings=OllamaEmbeddings(model="bge-m3:latest"), 
        url=config.url, 
        api_key = config.api_key
    ):
    """Initialize Qdrant Vector Store with existing collection or create a new one.
    Args:
        collection_name (str): Name of the Qdrant collection.
        embeddings (OllamaEmbeddings): Embedding model to use.
        url (str): URL of the Qdrant instance.
        api_key (str): API key for Qdrant access.
        
    Returns:
        QdrantVectorStore: Initialized Qdrant vector store."""
    if texts==None:
        qdrant = QdrantVectorStore.from_existing_collection(embedding=embeddings,
                                                            url=url,
                                                            prefer_grpc=False,
                                                            api_key=api_key,
                                                            collection_name=collection_name)
        print(f"Existed Collection: **{collection_name}** Initialized Successfully.")
    else:
        qdrant = QdrantVectorStore.from_documents(texts=texts,
                                                  embeddings=embeddings,
                                                  url=url,
                                                  prefer_grpc=False,
                                                  api_key=api_key,
                                                  collection_name=collection_name)
        print(f"Created **{collection_name}** Successfully.")
    return qdrant
qdrant = init_qdrant()

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"
INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Existed Collection: **DocumentsControl3** Initialized Successfully.


### Qu·∫ªry

In [66]:
results = qdrant.similarity_search("what is self-attention", k=2)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


* Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22]. [{'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}]
* 4 Why Self-Attention [{'source': 'D:

In [None]:
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 5})
a = retriever.invoke("what is self-attention")

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


In [68]:
a

[Document(metadata={'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}, page_content='Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22].'),
 Docu

# 4. Return

In [69]:
ret = retriever.invoke("what is self-attention")

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


In [70]:
for i in ret:
    print(f"* [{i.metadata}] \n {i.page_content} \n\n")

* [{'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}] 
 Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22]. 


* [{'source': 'D:\\Projects\\docu

# 5. Chat 

In [None]:
from google import genai
from google.genai import types
client = genai.Client(api_key="")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0) # Disables thinking
    ),
)
print(response.text)

TypeError: 'Models' object is not callable

In [71]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

template = """Answer the user's question based on the provided context.

-----------
{{context}}
-----------

Question: {question}

Remember: 
- If the question is not related to the context, you should answer "Hmmm... I'm not sure".
"""

prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model="qwen2.5:1.5B")

chain = prompt | model

chain.invoke({"question": "Lang Chain l√† g√¨"})

INFO: HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


'LangChain l√† m·ªôt m√¥ h√¨nh m·∫°ng l∆∞·ªõi c·ªßa c√°c m√¥ h√¨nh ng√¥n ng·ªØ ƒë∆∞·ª£c x√¢y d·ª±ng ƒë·ªÉ chia s·∫ª v√† chia s·∫ª th√¥ng tin m·ªôt c√°ch hi·ªáu qu·∫£.'

In [5]:
from langchain_core.prompts import ChatPromptTemplate
# Thay th·∫ø OllamaLLM b·∫±ng ChatGoogleGenerativeAI
from langchain_google_genai import ChatGoogleGenerativeAI
import os

# ƒê·∫∑t API Key c·ªßa b·∫°n v√†o bi·∫øn m√¥i tr∆∞·ªùng
# Ho·∫∑c c·∫•u h√¨nh genai.configure() tr∆∞·ªõc khi ch·∫°y
os.environ["GOOGLE_API_KEY"] = "AIzaSyCnc1PCwmvB2NPkULOcizOpzD2Qqaj1MmI" 

template = """Tr·∫£ l·ªùi c√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng d·ª±a tr√™n ng·ªØ c·∫£nh ƒë∆∞·ª£c cung c·∫•p.

-----------
{context}
-----------

C√¢u h·ªèi: {question}

L∆∞u √Ω: 
- N·∫øu c√¢u h·ªèi kh√¥ng li√™n quan ƒë·∫øn ng·ªØ c·∫£nh, b·∫°n n√™n tr·∫£ l·ªùi "Hmmm... T√¥i kh√¥ng ch·∫Øc ch·∫Øn".
"""

prompt = ChatPromptTemplate.from_template(template)

# S·ª≠ d·ª•ng ChatGoogleGenerativeAI thay v√¨ OllamaLLM
# B·∫°n c√≥ th·ªÉ ch·ªâ ƒë·ªãnh model l√† "gemini-pro" ho·∫∑c "gemini-pro-vision" ho·∫∑c "gemini-flash"
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash") 

# N·∫øu b·∫°n mu·ªën truy·ªÅn context r·ªóng (nh∆∞ v√≠ d·ª• g·ªëc c·ªßa b·∫°n kh√¥ng c√≥ context)
# b·∫°n c√≥ th·ªÉ t·∫°o m·ªôt chu·ªói ƒë∆°n gi·∫£n
chain = prompt | model

# T·∫°o m·ªôt context m·∫´u ƒë·ªÉ minh h·ªça
context_data = """LangChain l√† m·ªôt framework m√£ ngu·ªìn m·ªü ƒë∆∞·ª£c thi·∫øt k·∫ø ƒë·ªÉ gi√∫p c√°c nh√† ph√°t tri·ªÉn x√¢y d·ª±ng c√°c ·ª©ng d·ª•ng d·ª±a tr√™n m√¥ h√¨nh ng√¥n ng·ªØ l·ªõn (LLM). N√≥ cung c·∫•p c√°c c√¥ng c·ª• ƒë·ªÉ k·∫øt n·ªëi LLM v·ªõi c√°c ngu·ªìn d·ªØ li·ªáu b√™n ngo√†i, cho ph√©p t·∫°o ra c√°c ·ª©ng d·ª•ng m·∫°nh m·∫Ω v√† c√≥ kh·∫£ nƒÉng t∆∞∆°ng t√°c cao. LangChain h·ªó tr·ª£ nhi·ªÅu LLM kh√°c nhau v√† cung c·∫•p c√°c th√†nh ph·∫ßn ƒë·ªÉ x√¢y d·ª±ng chu·ªói logic ph·ª©c t·∫°p."""

response = chain.invoke({"question": "LangChain l√† g√¨?", "context": context_data})

print(response.content)

# V√≠ d·ª• v·ªõi c√¢u h·ªèi kh√¥ng li√™n quan ƒë·∫øn context
response_irrelevant = chain.invoke({"question": "Th·ªß ƒë√¥ c·ªßa Ph√°p l√† g√¨?", "context": context_data})
print(response_irrelevant.content)

LangChain l√† m·ªôt framework m√£ ngu·ªìn m·ªü ƒë∆∞·ª£c thi·∫øt k·∫ø ƒë·ªÉ gi√∫p c√°c nh√† ph√°t tri·ªÉn x√¢y d·ª±ng c√°c ·ª©ng d·ª•ng d·ª±a tr√™n m√¥ h√¨nh ng√¥n ng·ªØ l·ªõn (LLM).
Hmmm... T√¥i kh√¥ng ch·∫Øc ch·∫Øn.


# 6. Boto3


/home/hung/projects/myenv/lib/python3.10/site-packages/langchain_community/document_loaders/s3_file.py

In [None]:
from langchain_community.document_loaders import S3FileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
#Load documents from S3 bucket
import uuid
print("Loading documents from S3...")

UUID1 = uuid.uuid4().__str__()
loader = S3FileLoader(bucket="bibox-bucket", 
                        key="123456", 
                        id=UUID1,
                        aws_access_key_id="test", 
                        aws_secret_access_key="test", 
                        endpoint_url="http://localhost:4566",
                        chunking_strategy="by_title", strategy="auto",
                        chunk_size=100, chunk_overlap=200
)

documents = loader.load_and_split()
#add uuid to each document metadata



print("-"*80)
print(f"Loaded {len(documents)} documents from S3.")
# Print the first document's content
print("-"*80)
print("First document content:", documents[0].page_content[:100])  # Print first 100 characters





Loading documents from S3...
--------------------------------------------------------------------------------
Loaded 2 documents from S3.
--------------------------------------------------------------------------------
First document content: Tuy·ªát v·ªùi! ƒêo·∫°n output b·∫°n cung c·∫•p cho th·∫•y UnstructuredLoader ƒë√£ ho·∫°t ƒë·ªông th√†nh c√¥ng v√† tr√≠ch xu·∫•


In [73]:
for doc in documents:
    print(doc.metadata)


{'source': 's3://bibox-bucket/123456'}
{'source': 's3://bibox-bucket/123456'}


In [74]:
import tempfile  
import os
import boto3 
from langchain_unstructured import UnstructuredLoader
s3 = boto3.client('s3',
                    region_name='us-east-1',
                    aws_access_key_id='test',
                    aws_secret_access_key="test",
                    endpoint_url="http://localhost:4566")

bucket_name = 'bibox-bucket'
s3_key = '123456'

with tempfile.TemporaryDirectory() as temp_dir:
    file_path = f"{temp_dir}/{s3_key}"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    s3.download_file(bucket_name, s3_key, file_path)
    print(f"Downloaded to {file_path}")
    
    loader = UnstructuredLoader(file_path, chunking_strategy="by_title", strategy="auto")
    docs = loader.load()
    print(f"Loaded {len(docs)} documents.")
    print(f"First document:\n{docs[0].metadata},\n, {docs[0].page_content[:100]}...")
    #only keep "source" key in metadata, delete others, add UUID
    UUID1 = uuid.uuid4().__str__()
    for doc in docs:
        keys_to_remove = [key for key in doc.metadata if key != "source"]
        for key in keys_to_remove:
            del doc.metadata[key]
        doc.metadata["uuid"] = UUID1
    print("\n\n", "-"*80)
    print(f"First document:\n{docs[0].metadata},\n, {docs[0].page_content[:100]}...")
    


Downloaded to /tmp/tmppbza86bw/123456
Loaded 13 documents.
First document:
{'source': '/tmp/tmppbza86bw/123456', 'file_directory': '/tmp/tmppbza86bw', 'filename': '123456', 'languages': ['eng'], 'last_modified': '2025-07-09T16:11:41', 'page_number': 1, 'orig_elements': 'eJzVV02P2zYQ/SusLkkB0+U3JV8TFCgQBAG6yWWzMPhpCbAlQ6Y26wT97yUp2WsnbrMXB/DBMt9wSFHz3gzJ+2+FW7uNa8OyscUCFBIpgpmxkHmD4kN6WGmJIEKa48qSUpqqmIFi44KyKqg45lthuq63TauC22W8VvtuCMvaNas6RIusyBzFQZP9S2NDHc0Cj+Zt17Qhjby/l9ki2Zw9zMCEMKJzmiCn5Vz8iLN3xMVuvwtuk77iQ/Pk1n9vlXHFP7HDN2u3tE3vTOj6fXL4I2y26bfVX1Up9Jdi8mrVxqV+TCjjoshrbleDWuUvuy9cuyoesnUXlpvONr5xOW4EEQ6RhKi6w2KB8YLhNHobRy7bYaNdH71wWkxwTykmxd2w/zxgZ2QAj6lhbfMb+DwgjFGXsMItiNHaDgHoAzZDuwImI74Fpu5AqEe0Bx/bXegHE4be2Xedsq4fZ8PxiRwF9TRrOJrjO6s4X5oieqC2TlMj5Fk0Po42EPrcsKYGT8P4qgDacWwDbFpPSMjp8d/ILfjw9s9xkU6q49rnKRxhv83h/diaKJZV1zdfnb1LAYmR+V6KpdFYSqUgto5Dxh2ClRUSOkZxSTEWiomrSzG+ZhTbBEk5Qsr5nF/A2f32xPgms4zNqTZwkyhFWKPMJY1Uv+3MkBh6Vtax14CmBb2agU0eLJufTGWycF2lFqfCuGvC2l0Sg5KcoigHKKPwIKOOQlUqD

In [75]:
from pathlib import Path
import tempfile
import boto3
import uuid
from langchain_unstructured import UnstructuredLoader
from typing import Any, Callable, Optional, Dict

class S3DocumentLoader:
    def __init__(
        self, 
        bucket: str, 
        key: str, 
        *,

        ## Optional parameters for S3 connection and document processing
        region_name: Optional[str] = "us-east-1",
        aws_access_key_id: Optional[str] = "test",
        aws_secret_access_key: Optional[str] = "test",
        endpoint_url: str = "http://localhost:4566",

        ## Parameters for document loading and splitting
        chunking_strategy: Optional[str] = "by_title",
        strategy: Optional[str] = "auto",
        max_characters: Optional[int] = 1000,
        overlap: Optional[int] = 200,

        ## Optional function to add custom metadata to documents
        # The function should accept a dictionary and return a dictionary
        # This allows for flexible metadata handling, such as adding custom keys or values
        # If None, no additional metadata will be added
        add_to_metadata: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None
        ):

        """Initialize the S3DocumentLoader with bucket and key."""

        self.bucket = bucket
        self.key = key
        self.uuid = str(uuid.uuid4())
        self.region_name = region_name
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        self.chunking_strategy = chunking_strategy
        self.strategy = strategy
        self.max_characters = max_characters
        self.overlap = overlap
        self.add_to_metadata = add_to_metadata
        self.s3 = boto3.client(
            "s3",
            region_name=self.region_name,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            endpoint_url=endpoint_url,
        )
        self.docs = []

    def download(self, temp_dir: str):
        self.local_path = Path(temp_dir) / Path(self.key).name
        self.local_path.parent.mkdir(parents=True, exist_ok=True)
        self.s3.download_file(self.bucket, self.key, str(self.local_path))
        print(f"‚úÖ Downloaded: s3://{self.bucket}/{self.key} ‚Üí {self.local_path}")

    def load_and_split(self):
        loader = UnstructuredLoader(
            str(self.local_path),
            chunking_strategy=self.chunking_strategy,
            strategy=self.strategy,
            max_characters=self.max_characters, overlap=self.overlap, 
        )
        self.docs = loader.load()

        print(f"‚úÖ Loaded {len(self.docs)} documents.")
        

    def clean_metadata(self):
        for doc in self.docs:
            doc.metadata = {
                "uuid": self.uuid,
                "source": doc.metadata.get("source", f"s3://{self.bucket}/{self.key}"),
            }

            if self.add_to_metadata:
                custom_metadata = self.add_to_metadata(doc.metadata.copy())
                if custom_metadata and isinstance(custom_metadata, dict):
                    doc.metadata.update(custom_metadata)
                else:
                    print(f"‚ö†Ô∏è Warning: add_to_metadata did not return a dictionary or returned None for doc {doc.metadata.get('source')}.")

    def preview(self, n: int = 1):
        for doc in self.docs[:n]:
            print("-" * 80)
            print("üìÑ Metadata:", doc.metadata)
            print("üìÑ Content preview:", doc.page_content[:100], "...\n")

    def run(self):
        with tempfile.TemporaryDirectory() as tmp:
            self.download(tmp)
            self.load_and_split()
            self.clean_metadata()
            self.preview(3)


# üëá S·ª≠ d·ª•ng class
if __name__ == "__main__":
    loader = S3DocumentLoader(bucket="bibox-bucket", key="123456", add_to_metadata=lambda metadata: {"custom_key": "custom_value"})
    loader.run()


‚úÖ Downloaded: s3://bibox-bucket/123456 ‚Üí /tmp/tmpw_ooo9zk/123456
‚úÖ Loaded 7 documents.
--------------------------------------------------------------------------------
üìÑ Metadata: {'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123456', 'custom_key': 'custom_value'}
üìÑ Content preview: Tuy·ªát v·ªùi! ƒêo·∫°n output b·∫°n cung c·∫•p cho th·∫•y UnstructuredLoader ƒë√£ ho·∫°t ƒë·ªông th√†nh c√¥ng v√† tr√≠ch xu·∫• ...

--------------------------------------------------------------------------------
üìÑ Metadata: {'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123456', 'custom_key': 'custom_value'}
üìÑ Content preview: o

links: (Trong m·ªôt s·ªë tr∆∞·ªùng h·ª£p) C√°c li√™n k·∫øt ƒë∆∞·ª£c t√¨m th·∫•y trong vƒÉn b·∫£n.

page_content: N·ªôi dun ...

--------------------------------------------------------------------------------
üìÑ Metadata: {'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123

In [76]:
loader.docs

[Document(metadata={'uuid': '5c58fe2a-03be-41a3-b8de-6b7be080dd98', 'source': '/tmp/tmpw_ooo9zk/123456', 'custom_key': 'custom_value'}, page_content="Tuy·ªát v·ªùi! ƒêo·∫°n output b·∫°n cung c·∫•p cho th·∫•y UnstructuredLoader ƒë√£ ho·∫°t ƒë·ªông th√†nh c√¥ng v√† tr√≠ch xu·∫•t n·ªôi dung t·ª´ t·ªáp PDF c·ªßa b·∫°n.\n\nC√°c ƒë·ªëi t∆∞·ª£ng Document ƒë∆∞·ª£c in ra, m·ªói ƒë·ªëi t∆∞·ª£ng ch·ª©a:\n\nmetadata: Ch·ª©a th√¥ng tin chi ti·∫øt v·ªÅ ƒëo·∫°n vƒÉn b·∫£n ƒë∆∞·ª£c tr√≠ch xu·∫•t, bao g·ªìm:\n\no source: ƒê∆∞·ªùng d·∫´n ƒë·∫øn t·ªáp PDF.\n\no coordinates: V·ªã tr√≠ c·ªßa ƒëo·∫°n vƒÉn b·∫£n tr√™n trang (trong h·ªá th·ªëng PixelSpace).\n\no file_directory, filename: Th√¥ng tin v·ªÅ t·ªáp.\n\no\n\nlanguages: Ng√¥n ng·ªØ ƒë∆∞·ª£c ph√°t hi·ªán (·ªü ƒë√¢y l√† 'eng').\n\no\n\nlast_modified: Th·ªùi gian s·ª≠a ƒë·ªïi l·∫ßn cu·ªëi c·ªßa t·ªáp.\n\no page_number: S·ªë trang m√† ƒëo·∫°n vƒÉn b·∫£n ƒë∆∞·ª£c tr√≠ch xu·∫•t t·ª´ ƒë√≥.\n\no filetype: Lo·∫°i t·ªáp (application/pdf).\n\no category: Lo·∫°i