In [None]:
import os
os.environ["USER_AGENT"] = "genai-rag-app/1.0 (educational project)"

# Document loaders
from langchain_community.document_loaders import (
    PyPDFLoader,
    WebBaseLoader
)

# Text splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embeddings + Vector DB
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

DOCUMENT LOADERS

1. PyPDF Loader
2. Web Based Loader
3. Wikipedia Loader

In [2]:
def load_pdf(pdf):
    
    loader = PyPDFLoader(pdf)
    docs = loader.load()
    
    print(f"PDF loaded successfully")
    print(f"Total pages: {len(docs)}")
    
    return docs


In [3]:
pdf_docs = load_pdf('data/Attention_is_all_you_need.pdf')


PDF loaded successfully
Total pages: 15


In [4]:
for i in range(1, 5):
    
    print("Page no.: ", i)
    
    text = pdf_docs[i].page_content
    print("Number of characters: ", len(text))
    
    print(text[:200])
    

Page no.:  1
Number of characters:  4253
transduction problems such as language modeling and machine translation [ 35, 2, 5]. Numerous
efforts have since continued to push the boundaries of recurrent language models and encoder-decoder
archi
Page no.:  2
Number of characters:  1745
Figure 1: The Transformer - model architecture.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The ﬁrst is a multi
Page no.:  3
Number of characters:  2482
Scaled Dot-Product Attention
 Multi-Head Attention
Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several
attention layers running in parallel.
3.2.1 Scaled Do
Page no.:  4
Number of characters:  3286
Multi-head attention allows the model to jointly attend to information from different representation
subspaces at different positions. With a single attention head, averaging inhibits this.
MultiHead(


In [5]:
def load_urls(url1):
    
    loader = WebBaseLoader(web_path=url1)
        
    docs = loader.load()
    return docs

In [6]:
url_docs = load_urls('https://www.tpointtech.com/java-oops-concepts')
url_docs

[Document(metadata={'source': 'https://www.tpointtech.com/java-oops-concepts', 'title': 'OOPs (Object-Oriented Programming) Concepts in Java - Tpoint Tech', 'description': 'Object-oriented programming is a paradigm that provides concepts, such as inheritance, data binding, polymorphism, etc.', 'language': 'en'}, page_content='OOPs (Object-Oriented Programming) Concepts in Java - Tpoint Tech  Tutorials×Python Technologies Python Tutorial Django Tutorial Numpy Tutorial Pandas Tutorial Tkinter Tutorial Pytorch Tutorial Flask Tutorial OpenCV Tutorial Java Technologies Java Tutorial Servlet Tutorial JSP Tutorial Spring Boot Tutorial Spring Framework Tutorial Hibernate Tutorial JavaFX Tutorial Java Web Services Tutorial .Net Framework .Net Framework Tutorial C# Tutorial ASP.Net Tutorial ADO.Net Tutorial WPF TutorialAI, ML & Data Science Artificial Intelligence Tutorial Machine Learning Tutorial Data Science Tutorial Deep Learning Tutorial TensorFlow Tutorial Artificial Neural Network Tutoria

TEXT SPLITTING:

RecursiveCharacterTextSplitter

In [7]:
def split_documents(docs):
    
    splitter = RecursiveCharacterTextSplitter(
                    chunk_size=400, 
                    chunk_overlap=80
                )
    
    chunks = splitter.split_documents(docs)
    
    print("Total chunks created: ",{len(chunks)})
    print("Sample chunk text:\n", chunks[0].page_content[:300])

    return chunks
    
    

In [8]:
pdf_chunks = split_documents(pdf_docs)
len(pdf_chunks)

Total chunks created:  {125}
Sample chunk text:
 Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aid


125

In [9]:
def validate_split(chunks):

    lengths = [len(chunk.page_content) for chunk in chunks]

    print(f"Min chunk length: {min(lengths)}")
    print(f"Max chunk length: {max(lengths)}")
    print(f"Average chunk length: {sum(lengths)//len(lengths)}")


In [10]:
validate_split(pdf_chunks)

Min chunk length: 14
Max chunk length: 398
Average chunk length: 330


In [11]:
url_chunks = split_documents(url_docs)
len(url_docs)

Total chunks created:  {79}
Sample chunk text:
 OOPs (Object-Oriented Programming) Concepts in Java - Tpoint Tech  Tutorials×Python Technologies Python Tutorial Django Tutorial Numpy Tutorial Pandas Tutorial Tkinter Tutorial Pytorch Tutorial Flask Tutorial OpenCV Tutorial Java Technologies Java Tutorial Servlet Tutorial JSP Tutorial Spring Boot T


1

In [12]:
validate_split(url_chunks)

Min chunk length: 92
Max chunk length: 400
Average chunk length: 365


EMBEDDING AND VECTOR STORAGE


VectorDB -> FAISS

In [13]:
def embed_and_vector_store(chunks):
    
    embeddings = OllamaEmbeddings(
        model = 'nomic-embed-text:v1.5'
    )
    
    vector_store = FAISS.from_documents(chunks, embeddings)
    
    print("Vector store created")
    print("Number of vectors stored: ",{vector_store.index.ntotal})
    
    return vector_store

In [14]:
pdf_vector_store = embed_and_vector_store(pdf_chunks)

Vector store created
Number of vectors stored:  {125}


In [15]:
url_vector_store = embed_and_vector_store(url_chunks)

Vector store created
Number of vectors stored:  {79}


RETRIEVER 

In [16]:
def retrieve_chunks(vector_store, query, k=3):
    
    retriever = vector_store.as_retriever(
        search_kwargs={"k": k}
    )
    
    results = retriever.invoke(query)

    print("Query: ", query)
    
    for i, doc in enumerate(results):
        print(f"\n--- Retrieved Chunk {i+1} ---")
        print(doc.page_content[:400])
    
    return results


In [17]:
retrieve_chunks(
    pdf_vector_store,
    "What is encoder?"
)


Query:  What is encoder?

--- Retrieved Chunk 1 ---
typical encoder-decoder attention mechanisms in sequence-to-sequence models such as
[38, 2, 9].
• The encoder contains self-attention layers. In a self-attention layer all of the keys, values
and queries come from the same place, in this case, the output of the previous layer in the
encoder. Each position in the encoder can attend to all positions in the previous layer of the
encoder.

--- Retrieved Chunk 2 ---
Figure 1: The Transformer - model architecture.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The ﬁrst is a multi-head self-attention mechanism, and the second is a simple, position-
wise fully connected feed-forward network. We employ a residual connection [11] around each of

--- Retrieved Chunk 3 ---
encoder.
• Similarly, self-attention layers in the decoder allow each position in the decoder to attend to
all positions in the decoder up to

[Document(id='ac91f8f1-1cba-454e-a8f1-fa9d78758b82', metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2017-12-07T01:03:15+00:00', 'author': '', 'keywords': '', 'moddate': '2017-12-07T01:03:15+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/Attention_is_all_you_need.pdf', 'total_pages': 15, 'page': 4, 'page_label': '5'}, page_content='typical encoder-decoder attention mechanisms in sequence-to-sequence models such as\n[38, 2, 9].\n• The encoder contains self-attention layers. In a self-attention layer all of the keys, values\nand queries come from the same place, in this case, the output of the previous layer in the\nencoder. Each position in the encoder can attend to all positions in the previous layer of the\nencoder.'),
 Document(id='0d8a3270-71c9-4a38-ab22-1387cd161997', metadata={'producer': 'pdfTeX-1.

In [18]:
retrieve_chunks(
    url_vector_store,
    "cohesion"
)

Query:  cohesion

--- Retrieved Chunk 1 ---
The weakly cohesive method will split the task into separate parts. The java.io package is highly cohesive because it has I/O related classes and interfaces. However, the java.util package is a weakly cohesive package because it has unrelated classes and interfaces.To read more Cohesion in JavaAssociationThe association represents the relationship between the objects. Here, one object can be

--- Retrieved Chunk 2 ---
is also a way to achieve Association. The composition represents the relationship where one object contains other objects as a part of its state. There is a strong relationship between the containing object and the dependent object. It is the state where objects do not have an independent existence. If we delete the parent object, all the child objects will be deleted automatically.To read more

--- Retrieved Chunk 3 ---
of a class, method, and field. We can use interfaces for the weaker coupling because there is no concrete imp

[Document(id='446632f9-93cf-43c9-9b9c-ab4180b8fc74', metadata={'source': 'https://www.tpointtech.com/java-oops-concepts', 'title': 'OOPs (Object-Oriented Programming) Concepts in Java - Tpoint Tech', 'description': 'Object-oriented programming is a paradigm that provides concepts, such as inheritance, data binding, polymorphism, etc.', 'language': 'en'}, page_content='The weakly cohesive method will split the task into separate parts. The java.io package is highly cohesive because it has I/O related classes and interfaces. However, the java.util package is a weakly cohesive package because it has unrelated classes and interfaces.To read more Cohesion in JavaAssociationThe association represents the relationship between the objects. Here, one object can be'),
 Document(id='251928a2-e5cc-41c9-9fb4-ec8f90d4f19d', metadata={'source': 'https://www.tpointtech.com/java-oops-concepts', 'title': 'OOPs (Object-Oriented Programming) Concepts in Java - Tpoint Tech', 'description': 'Object-oriented

LLM 

Model Used (OLLAMA):

1. llama3.2:1b
2. nomic-embed-text:v1.5

In [19]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="llama3.2:1b",
    temperature=0.2
)


In [20]:
from langchain_core.prompts import ChatPromptTemplate

rag_prompt = ChatPromptTemplate.from_template(
    
    """You are a helpful assistant. Answer the question using the context below.
Use only the information from the context. If the context is insufficient,
give the best possible answer based on it."

Context:
{context}

Question:
{question}

Answer:"""
)


In [21]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [22]:
def rag_answer(vector_store, question, k=2):
    
    retriever = vector_store.as_retriever(search_kwargs={"k": k})
    docs = retriever.invoke(question)

    context = format_docs(docs)
    
    messages = rag_prompt.format_messages(
        context=context,
        question=question
    )

    response = llm.invoke(messages)
    return response.content, docs


In [23]:
answer, sources = rag_answer(
    pdf_vector_store,
    "What is multi-head attention in transformers, explain deeply?"
)

print("ANSWER:\n", answer)

print("\nSOURCES:")
for s in sources:
    print(s.metadata)


ANSWER:
 Based on the context provided, I will attempt to explain multi-head attention in Transformers in detail.

Multi-head attention is a key component of the Transformer model, which allows it to jointly attend to information from different representation subspaces at different positions within an input sequence. This process enables the model to capture complex relationships between different parts of the input data.

To understand multi-head attention, let's break down its components:

1. **Query (Q)**: The query is a vector that represents the input data. In the context of Transformers, this can be thought of as the "what" or "question" being asked about the input sequence.
2. **Key (K)**: The key is a vector that represents the input data. Similar to the query, the key can be seen as the "what" or "information" being represented by the input sequence.
3. **Value (V)**: The value is another vector that represents the input data. Again, this can be thought of as the "why" or "rea