In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.chains import RetrievalQA

import torch
import os
import warnings

warnings.filterwarnings('ignore')

In [2]:
pdf_file = PyPDFLoader("./Attention.pdf")

In [3]:
pages = pdf_file.load_and_split()

In [4]:
pages[2]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './Attention.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}, page_content='Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 35].\nHere, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence\nof continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output\nsequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive\n[10], consuming the previously generated symbols as additional input when generating the next.\n2')

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap = 128,    
)

In [6]:
chunks = text_splitter.split_documents(pages)

In [7]:
len(chunks)

50

In [8]:
chunks[23]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './Attention.pdf', 'total_pages': 15, 'page': 6, 'page_label': '7'}, page_content='length n is smaller than the representation dimensionality d, which is most often the case with\nsentence representations used by state-of-the-art models in machine translations, such as word-piece\n[38] and byte-pair [31] representations. To improve computational performance for tasks involving\nvery long sequences, self-attention could be restricted to considering only a neighborhood of size r in\nthe input sequence centered around the respective output position. This would increase the maximum\npath length to O(n/r). We plan to investigate th

In [9]:
Embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2",
                                   model_kwargs = {"device": "cuda"})

vector_db = FAISS.from_texts([str(chunk) for chunk in chunks], Embeddings)

In [10]:
question = '''
what is the purpose of the decoder?
'''

relevant_results = vector_db.similarity_search(question, k = 2)

In [11]:
relevant_results[1]

Document(id='5be55beb-0be1-4e24-9a45-7ea48d0b8e45', metadata={}, page_content='page_content=\'• In "encoder-decoder attention" layers, the queries come from the previous decoder layer,\nand the memory keys and values come from the output of the encoder. This allows every\nposition in the decoder to attend over all positions in the input sequence. This mimics the\ntypical encoder-decoder attention mechanisms in sequence-to-sequence models such as\n[38, 2, 9].\n• The encoder contains self-attention layers. In a self-attention layer all of the keys, values\nand queries come from the same place, in this case, the output of the previous layer in the\nencoder. Each position in the encoder can attend to all positions in the previous layer of the\nencoder.\n• Similarly, self-attention layers in the decoder allow each position in the decoder to attend to\nall positions in the decoder up to and including that position. We need to prevent leftward\ninformation flow in the decoder to preserve the 

In [23]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Using this piece of information:
\n
{context}
\n
Answer the following question:
\n
{question}
\n
Answer:
\n 
"""
)

In [13]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4"
)

In [15]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b1", quantization_config = bnb_config)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [56]:
from transformers import pipeline

pipe = pipeline("text-generation", model = model, tokenizer = tokenizer, max_new_tokens = 144)
lc_pipeline = HuggingFacePipeline(pipeline = pipe)

Device set to use cuda:0


In [50]:
qa_chain = RetrievalQA.from_chain_type(
    llm = lc_pipeline,
    retriever = vector_db.as_retriever(search_kwargs = {'k':3}),
    return_source_documents = True,
    chain_type_kwargs = {'prompt' : prompt}
)

In [51]:
question = '''
what is the purpose of the decoder?
'''

result = qa_chain({"query": question})

In [52]:
print(result["result"].split("Answer:")[1])



 
The purpose of the decoder is to predict the next position in the input sequence. This is done by
attending to the output of the previous decoder layer. The decoder is a stack of N = 6
identical layers, each of which contains a self-attention layer. The decoder stack is
implemented using a residual connection around each of the two sub-layers, followed by
layer normalization. The decoder stack contains a single self-attention layer, which is
implemented using a residual connection around each of the two sub-l


In [53]:
question = '''
what is the attention function?
'''

result = qa_chain({"query": question})
print(result["result"].split("Answer:")[1])



 
The attention function is the softmax function of the query with the corresponding key.
The attention function is computed by a compatibility function of the query with the
corresponding key.
The attention function is computed by a compatibility function of the query with the
corresponding key.
The attention function is computed by a compatibility function of the query with the
corresponding key.
The attention function is computed by a compatibility function of the query with the
corresponding key.
The attention function is computed by a compatibility function of the query with the
correspond


In [57]:
question = '''
what are the three ways of using multi-head attention?
'''

result = qa_chain({"query": question})
print(result["result"].split("Answer:")[1])



 
1. The first way is to use the attention mechanism to attend to the
information from the previous decoder layer. This is the most common way of using
attention in the WMT 2014 dataset. The attention mechanism is used to attend to the
information from the previous decoder layer, and the attention head is used to attend to
the information from the next decoder layer. The attention mechanism is used to attend to
the information from the previous decoder layer, and the attention head is used to attend to
the information from the next decoder
