# 1. Import libraries

In [3]:
from langchain_unstructured import UnstructuredLoader
from unstructured.cleaners.core import clean_extra_whitespace
import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 2. Load, Modify and Split documents

In [4]:
file_paths = [
    "Data/AttentionIsAllYouNeed.pdf",
]
loader = UnstructuredLoader(file_paths, chunking_strategy="by_title", strategy="auto")
docs = loader.load()
print(f"Loaded {len(docs)} documents.")

Loaded 112 documents.


In [5]:
UNNESSECARIES = ["coordinates", "file_directory", "filename", "languages", "last_modified", "filetype", "category", "element_id"]
for doc in docs:
    doc.metadata["uuid"] = uuid.uuid4().__str__()
    for key in UNNESSECARIES:
        if key in doc.metadata:
            del doc.metadata[key]
print(f"Processed {len(docs)} documents from {docs[0].metadata['source']}.")

Processed 112 documents from Data/AttentionIsAllYouNeed.pdf.


In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, strip_whitespace=True)
texts = text_splitter.split_documents(docs)
print(f"Splitted into {len(texts)} chunks.")

Splitted into 112 chunks.


In [7]:
with open("texts.txt", "w", encoding="utf-8") as f:
    for i in texts:
        f.write(f"{i.metadata}\n")
        f.write(f"{i.page_content}\n")
        f.write("\n" + "-" * 80 + "\n\n")

In [8]:
print(len(texts))
print(len(docs))

112
112


# 3. Upsert Vector Database

### Qdrant Cloud and Ollama

In [9]:
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
import config


def init_qdrant(
        collection_name="DocumentsControl3", 
        embeddings=OllamaEmbeddings(model="bge-m3:latest"), 
        url=config.url, 
        api_key = config.api_key
    ):
    """Initialize Qdrant Vector Store with existing collection or create a new one.
    Args:
        collection_name (str): Name of the Qdrant collection.
        embeddings (OllamaEmbeddings): Embedding model to use.
        url (str): URL of the Qdrant instance.
        api_key (str): API key for Qdrant access.
        
    Returns:
        QdrantVectorStore: Initialized Qdrant vector store."""
    try:
        qdrant = QdrantVectorStore.from_existing_collection(embedding=embeddings,
                                                            url=url,
                                                            prefer_grpc=False,
                                                            api_key=api_key,
                                                            collection_name=collection_name)
        print(f"Existed Collection: **{collection_name}** Initialized Successfully.")
    except:
        qdrant = QdrantVectorStore.from_documents(texts,
                                                  embeddings,
                                                  url=url,
                                                  prefer_grpc=False,
                                                  api_key=api_key,
                                                  collection_name=collection_name
        )
        print(f"Created **{collection_name}** Successfully.")
    return qdrant
qdrant = init_qdrant()

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333 "HTTP/1.1 200 OK"
INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Existed Collection: **DocumentsControl3** Initialized Successfully.


### Quẻry

In [10]:
results = qdrant.similarity_search("what is self-attention", k=2)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


* Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22]. [{'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}]
* 4 Why Self-Attention [{'source': 'D:

In [11]:
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 5})
a = retriever.invoke("what is self-attention")

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


In [12]:
a

[Document(metadata={'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}, page_content='Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22].'),
 Docu

# 4. Return

In [13]:
ret = retriever.invoke("what is self-attention")

INFO: HTTP Request: GET https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://da84a490-2db4-41eb-a610-9e96795692ce.us-east4-0.gcp.cloud.qdrant.io:6333/collections/DocumentsControl3/points/query "HTTP/1.1 200 OK"


In [14]:
for i in ret:
    print(f"* [{i.metadata}] \n {i.page_content} \n\n")

* [{'links': [{'text': '4', 'url': 'cite.cheng2016long', 'start_index': 378}, {'text': '27', 'url': 'cite.decomposableAttnModel', 'start_index': 381}, {'url': 'cite.paulus2017deep', 'text': '28', 'start_index': 385}, {'text': '22', 'start_index': 389, 'url': 'cite.lin2017structured'}], 'page_number': 2, 'uuid': '592032b9-34bf-4f94-a598-ab65f2e25d4e', 'source': 'D:\\Projects\\documentsControl\\Data\\AttentionIsAllYouNeed.pdf', 'parent_id': '95b94863f46fadf14bb4b905d63fa743', '_id': '1af026d1-c002-4af0-85f1-8c90ab4bb0fd', '_collection_name': 'DocumentsControl3'}] 
 Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22]. 


* [{'source': 'D:\\Projects\\docu

# 5. Chat 

In [15]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

template = """Answer the user's question based on the provided context.

-----------
{{context}}
-----------

Question: {question}

Remember: 
- If the question is not related to the context, you should answer "Hmmm... I'm not sure".
"""

prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model="qwen2.5:1.5B")

chain = prompt | model

chain.invoke({"question": "Lang Chain là gì"})

KeyboardInterrupt: 

# 6. Boto3


In [None]:
from langchain_community.document_loaders import S3FileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

#Load documents from S3 bucket
print("Loading documents from S3...")

loader = S3FileLoader(bucket="bibox-bucket", 
                        key="123456", 
                        aws_access_key_id="test", 
                        aws_secret_access_key="test", 
                        endpoint_url="http://localhost:4566",
                        chunking_strategy="by_title", strategy="auto",
                        chunk_size=1000, chunk_overlap=200
)

documents = loader.load_and_split()
#add uuid to each document metadata
UUID1 = uuid.uuid4().__str__()
for doc in documents:
    doc.metadata["uuid"] = UUID1

print("-"*80)
print(f"Loaded {len(documents)} documents from S3.")
# Print the first document's content
print("-"*80)
print("First document content:", documents[0].page_content[:100])  # Print first 100 characters





Loading documents from S3...


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'uuid' is not defined

In [None]:
documents

[Document(metadata={'source': 's3://bibox-bucket/123456', 'uuid': 'cf3de6c6-de1d-47cb-9d5b-14a4c060e1f4'}, page_content='3 2 0 2\n\ng u A 2\n\n] L C . s c [\n\n7 v 2 6 7 3 0 . 6 0 7 1 : v i X r a\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani∗ Google Brain avaswani@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗ Google Research nikip@google.com\n\nJakob Uszkoreit∗ Google Research usz@google.com\n\nLlion Jones∗ Google Research llion@google.com\n\nAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu\n\nŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com\n\nIllia Polosukhin∗ ‡ illia.polosukhin@gmail.com\n\nAbstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing mo