In [1]:
!pip install langchain openai chromadb tiktoken pypdf sentence-transformers

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl.metadata (13 kB)
Collecting openai
  Downloading openai-1.23.2-py3-none-any.whl.metadata (21 kB)
Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.34-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.45-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.49-py3-none-any.whl.metadata (13 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.1-py3-none-any.whl.metadata (4

In [57]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.llms import HuggingFaceHub
import os
from getpass import getpass

# Never commit API tokens to a notebook. The token is read from the environment when it
# is already set (e.g. injected by a secrets manager before the kernel starts); otherwise
# it is requested interactively so it never lands in the saved file or its outputs.
# NOTE(review): a token was previously hardcoded in this cell -- treat it as compromised
# and revoke it in the Hugging Face account settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

In [58]:
from langchain.chains.question_answering import load_qa_chain
import torch

# Read the PDF into a list of LangChain documents.
pdf_path = "/kaggle/input/pdf-data/transformers.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# To ingest several files instead, create one loader per file and merge their output:
# loaders = [....]
# documents = []
# for loader in loaders:
#     documents.extend(loader.load())

# LLM used by every chain below: google/flan-t5-large via the HuggingFaceHub wrapper,
# with max_length=512 forwarded through model_kwargs.
generation_kwargs = {"max_length": 512}
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",
    model_kwargs=generation_kwargs,
)

In [59]:
# chain_type="stuff" does not work here (presumably the whole PDF is too long to fit
# into a single prompt -- TODO confirm), so the "map_reduce" chain is used instead.
chain = load_qa_chain(llm=llm, chain_type="map_reduce")

# Ask one question over *all* loaded documents; the answer string is the cell output.
query = "explain dot-product attention?"
chain.run(
    input_documents=documents,
    question=query,
)

'An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors.'

In [60]:
# Echo the chain object so its repr is shown as the cell output (for inspection only).
chain



In [None]:
## Chain types tried with load_qa_chain when "stuff" is not an option:
##   - map_reduce
##   - refine
##   - map_rerank

## Downside: these chain types use a lot of tokens -- every token in the PDF file is sent to the model.
## Better solution: retrieve only the relevant chunks, so the language model looks through a small subset of the text.

In [64]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Load the PDF again so this cell produces its own `documents`.
loader = PyPDFLoader("/kaggle/input/pdf-data/transformers.pdf")
documents = loader.load()

# Split the documents into chunks (chunk_size=1000, no overlap between chunks).
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = text_splitter.split_documents(documents)

# Embedding model used to vectorise each chunk.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Create the Chroma vector store that serves as the search index.
db = Chroma.from_documents(documents=texts, embedding=embeddings)

In [65]:
# Wrap the vector store in a retriever: similarity search, k=2 results per query.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Question-answering chain: the retrieved chunks are passed to the LLM with the "stuff"
# chain type, and the chunks that were used are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

query = "what all datasets were used for the experimentations?"
result = qa({"query": query})

In [66]:
# Show the full response dict: the query, the generated answer under 'result',
# and the retrieved chunks under 'source_documents'.
result

{'query': 'what all datasets were used for the experimentations?',
 'result': 'Attention Visualizations Input-Input Layer5',
 'source_documents': [Document(page_content='Attention Visualizations\nInput-Input Layer5\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nIt\nis\nin\nthis\nspirit\nthat\na\nmajority\nof\nAmerican\ngovernments\nhave\npassed\nnew\nlaws\nsince\n2009\nmaking\nthe\nregistration\nor\nvoting\nprocess\nmore\ndifficult\n.\n<EOS>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\n<pad>\nFigure 3: An example of the attention mechanism following long-distance dependencies in the\nencoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of\nthe verb ‘making’, completing the phrase ‘making...more difficult’. Attentions here shown only for\nthe word ‘making’. Different color

In [67]:
# Only the generated answer text, without the query or the source documents.
result["result"]

'Attention Visualizations Input-Input Layer5'

In [29]:
from langchain.chains import ConversationalRetrievalChain

In [34]:
# Reuse the `retriever` (similarity search, k=2) already built over the PDF index in the
# RetrievalQA section instead of re-loading, re-splitting and re-embedding the same file.
# Besides being redundant work, calling Chroma.from_documents a second time in the same
# kernel without a distinct collection_name would add the same chunks again to the
# default in-memory collection, so the k=2 search could return duplicate chunks.
qa = ConversationalRetrievalChain.from_llm(llm, retriever)

# First turn of the conversation: no prior exchanges yet.
chat_history = []
query = "what all datasets were used for the experimentations?"
result = qa({"question": query, "chat_history": chat_history})

In [35]:
# Show the first-turn response: the question, the (still empty) chat history, and the answer.
result

{'question': 'what all datasets were used for the experimentations?',
 'chat_history': [],
 'answer': 'Attention Visualizations Input-Input Layer5'}

In [36]:
# Store the previous exchange as a (question, answer) pair so the follow-up question
# can refer back to it ("those").
chat_history = [(query, result["answer"])]

query = "What were those about?"
follow_up_inputs = {"question": query, "chat_history": chat_history}
result = qa(follow_up_inputs)

In [37]:
# Show the follow-up response; 'chat_history' now carries the first question/answer pair.
result

{'question': 'What were those about?',
 'chat_history': [('what all datasets were used for the experimentations?',
   'Attention Visualizations Input-Input Layer5')],
 'answer': 'Attention Visualizations'}