In [3]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_google_genai import (
    GoogleGenerativeAIEmbeddings, 
    ChatGoogleGenerativeAI
)
from langchain_openai import (
    ChatOpenAI, 
    OpenAIEmbeddings
)

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(
    find_dotenv()
)

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")


data_path = "Data\\Attention.pdf"
documents = PyPDFLoader(file_path=data_path).load()
print("Total pages found :- {}\n".format(len(documents)))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=512, 
    chunk_overlap=64,
    length_function=len,
    is_separator_regex=True,
)

texts = splitter.split_documents(documents)
print("Total splitted documents chunks created are :- {}\n".format(len(texts)))


Total pages found :- 15

Total splitted documents chunks created are :- 92



In [4]:
for index, text in enumerate(texts) :
    text.metadata["id"] = index


embeddings_google = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001"
)
embeddings_openai = OpenAIEmbeddings()


llm_model_google = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-001", 
    temperature=0.4, 
    max_tokens=1024, 
    top_p=0.9
)

llm_model_openai = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.4, 
    max_tokens=1024, 
    top_p=0.9
)

retriever = FAISS.from_documents(
    texts, embeddings_google,
).as_retriever(
    search_type="similarity",
    search_kwargs={"k" : 10}
)


In [15]:
question = """Explain the multi-headed attention as compared to self attention and 
masked multi-headed attention in a detailed and easy manner."""

retrieved_documents = retriever.invoke(question)
print(retrieved_documents)

[Document(id='46c9ce7e-6a4a-4c8f-b880-2ba768ff9ac1', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'Data\\Attention.pdf', 'total_pages': 15, 'page': 4, 'page_label': '5', 'id': 28}, page_content='i ∈ Rdmodel×dk , WK\ni ∈ Rdmodel×dk , WV\ni ∈ Rdmodel×dv\nand WO ∈ Rhdv×dmodel .\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost\nis similar to that of single-head attention with full dimensionality.\n3.2.3 Applications of Attention in our Model\nThe Transformer uses multi-head attention in three different ways:'), Document(id='cb1c26d0-86dd-42cf-92

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank
from flashrank import Ranker 


ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2")

compressor = FlashrankRerank(
    score_threshold=0.2, 
    top_n=4
)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, 
    base_retriever=retriever,
)

compressed_documents = compression_retriever.invoke(
    question
)
print("Reranked Documents are :- {}".format(
    [doc.metadata["id"] for doc in compressed_documents]
    )
)

Reranked Documents are :- [27, 38, 13, 20]


In [17]:
from langchain.chains import RetrievalQA

chain = RetrievalQA.from_chain_type(
    llm=llm_model_google, 
    retriever=compression_retriever, 
    chain_type="stuff",
)

In [18]:
final_reranked_answer = chain.invoke(
    question
)

In [19]:
print(final_reranked_answer["result"])

Let's break down the differences between self-attention, multi-headed attention, and masked multi-headed attention.

**1. Self-Attention:**

* **Concept:** Imagine you have a sentence like "The cat sat on the mat." Self-attention helps the model understand the relationships between words in this sentence. It looks at each word and tries to figure out which other words are most relevant to it. For example, "cat" is likely to be related to "sat" and "mat."
* **How it works:**
    * **Input:** A sequence of words (like our sentence).
    * **Process:** Each word is transformed into three vectors: a query (Q), a key (K), and a value (V). The model calculates how similar each word's query is to every other word's key. This similarity score determines how much attention each word pays to the other words.  The final output is a weighted sum of all the values, where the weights are the attention scores.
* **Benefits:**  Self-attention allows the model to capture long-range dependencies in a se