In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source paper, relative to the notebook's working directory.
PDF_PATH = "./vaswani2017-AttentionIsAllYouNeed.pdf"

# Parse the PDF into LangChain documents; `docs` is what the text splitter
# consumes in the next step.
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: target chunk size and the overlap kept between
# neighbouring chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Break the loaded documents into smaller chunks ready for embedding.
documents = text_splitter.split_documents(docs)


In [5]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

# Only the leading chunks are embedded, so text past this cut-off can never be
# retrieved. NOTE(review): presumably a shortcut to keep embedding fast --
# confirm before relying on answers about later sections of the paper.
MAX_INDEXED_CHUNKS = 30

# No model name is passed, so the embedding wrapper falls back to its default.
ollama_embeddings = OllamaEmbeddings()
db = FAISS.from_documents(documents[:MAX_INDEXED_CHUNKS], ollama_embeddings)

In [6]:
# Sanity-check the index: run a similarity search for the query and print the
# text of the first returned chunk.
query = "What is Attention Mechanism"
result = db.similarity_search(query)
first_hit = result[0]
print(first_hit.page_content)

previous state-of-the-art model. The Transformer (big) model trained for English-to-French used
dropout rate Pdrop = 0.1, instead of 0.3.
For the base models, we used a single model obtained by averaging the last 5 checkpoints, which
were written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We
used beam search with a beam size of 4 and length penalty α = 0.6 [38]. These hyperparameters
were chosen after experimentation on the development set. We set the maximum output length during
inference to input length + 50, but terminate early when possible [38].
Table 2 summarizes our results and compares our translation quality and training costs to other model
architectures from the literature. We estimate the number of floating point operations used to train a
model by multiplying the training time, the number of GPUs used, and an estimate of the sustained
single-precision floating-point capacity of each GPU 5.
6.2 Model Variations


In [9]:
from langchain_community.llms import Ollama

# Name of the model the Ollama wrapper is asked to use.
LLM_MODEL = "llama2"

llm = Ollama(model=LLM_MODEL)
llm  # bare last expression so the notebook displays the configured LLM

Ollama()

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text for the RAG chain. It exposes two placeholders: {context} for
# the supporting documents and {input} for the user's question.
RAG_PROMPT_TEMPLATE = """
Answer following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer is helpful.
<context> 
{context}
</context>
Question: {input}                                 
"""

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

In [14]:
# Chain / Document chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Combine the LLM and the prompt into a chain that takes a list of documents.
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [16]:
# Retriever: expose the FAISS index through the retriever interface.
# The arguments spell out what as_retriever() uses when called with no
# arguments: a plain similarity search returning four documents per query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [18]:
# retrieval chain
from langchain.chains import create_retrieval_chain

# Wire the retriever in front of the document chain to form the full RAG chain.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [None]:
# Run the RAG chain once. The invoke call stays the cell's last expression so
# the notebook displays the whole returned dict.
attention_request = {"input": "What is Attention Mechanism"}
retrieval_chain.invoke(attention_request)

{'input': 'What is Attention Mechanism',
 'context': [Document(page_content='previous state-of-the-art model. The Transformer (big) model trained for English-to-French used\ndropout rate Pdrop = 0.1, instead of 0.3.\nFor the base models, we used a single model obtained by averaging the last 5 checkpoints, which\nwere written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We\nused beam search with a beam size of 4 and length penalty α = 0.6 [38]. These hyperparameters\nwere chosen after experimentation on the development set. We set the maximum output length during\ninference to input length + 50, but terminate early when possible [38].\nTable 2 summarizes our results and compares our translation quality and training costs to other model\narchitectures from the literature. We estimate the number of floating point operations used to train a\nmodel by multiplying the training time, the number of GPUs used, and an estimate of the sustained\nsingle-precisio

In [34]:
# Ask the RAG chain about scaled dot-product attention and keep the whole
# response; its "answer" entry is printed in the comparison cell.
rag_question = "What is the components of scaled dot-product attention"
response = retrieval_chain.invoke({"input": rag_question})

In [33]:
from langchain_core.output_parsers import StrOutputParser
# Baseline without retrieval: the prompt carries only a system message and the
# user's question, with no retrieved context.
no_rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant. Please response to the user queries"),
        ("user", "Question:{question}"),
    ]
)
# Prompt -> LLM -> string output parser.
chain_wo_rag = no_rag_prompt | llm | StrOutputParser()

no_rag_inputs = {"question": "What is the components of scaled dot-product attention"}
result_without_rag = chain_wo_rag.invoke(no_rag_inputs)

In [35]:
# Show both answers one after the other, separated by a blank line, so they
# can be compared directly.
result_with_rag = response["answer"]
print(
    f"[ANSWER WITH RAG CHAIN]\n{result_with_rag}",
    "",
    f"[ANSWER WITHOUT RAG CHAIN]\n{result_without_rag}",
    sep="\n",
)

[ANSWER WITH RAG CHAIN]
Based on the provided context, the components of scaled dot-product attention are:

1. Dot-product attention: This is a type of attention mechanism that computes the compatibility function between the query and key using a dot product. The scaling factor of 1√dk is used to speed up the computation.
2. Multi-head attention: This is an extension of the dot-product attention mechanism that allows the decoder to attend to all positions in the input sequence. It uses a combination of self-attention layers in the encoder and decoder to allow each position in the decoder to attend to all positions in the input sequence.
3. Convolutional layers: These are used to reduce sequential computation and improve computational performance for tasks involving very long sequences. A stack of O(n/k) convolutional layers is required when using contiguous kernels, while O(logk(n)) is required when using dilated convolutions.
4. Memory keys and values: These are used to store the outp