In [2]:
import os
from langchain.document_loaders import PyPDFLoader

In [32]:
DATA_FOLDER = './data'
# Module-level list that build_qa_chain reads. pdf_loader() does not fill it in;
# assign the function's return value to it (loaders = pdf_loader()).
loaders = []

def pdf_loader(data_folder=DATA_FOLDER):
    """Create one PyPDFLoader per PDF file found directly inside a folder.

    Args:
        data_folder: Directory to scan (not recursive). Defaults to DATA_FOLDER.

    Returns:
        A list of PyPDFLoader objects, one per file whose name ends with '.pdf',
        in the order returned by os.listdir. Empty if the folder has no PDFs.

    Raises:
        FileNotFoundError: If data_folder does not exist (raised by os.listdir).
    """
    # List the directory once, and honour the data_folder argument, so the
    # printed names and the loaders that are built always describe the same files.
    pdf_names = [fn for fn in os.listdir(data_folder) if fn.endswith('.pdf')]
    print(pdf_names)
    # A distinct local name avoids shadowing the module-level `loaders` list.
    pdf_loaders = [PyPDFLoader(os.path.join(data_folder, fn)) for fn in pdf_names]
    print(f'{len(pdf_loaders)} file loaded')
    return pdf_loaders

In [33]:
# pdf_loader() only returns the loaders. Bind them to the module-level `loaders`
# that build_qa_chain reads; otherwise the index is built from an empty list.
loaders = pdf_loader()
loaders

['Data structures and algorithms in Python.pdf', 'CompetitiveProgramming.pdf', 'Natural Language Processing with Python.pdf', 'A First Book of C++, Fourth Edition.pdf', 'Deep-Learning-with-PyTorch.pdf']
5 file loaded


[<langchain_community.document_loaders.pdf.PyPDFLoader at 0x13684fd50>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x1300f96d0>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x1300fa210>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x1300f8b50>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x1300f8bd0>]

In [34]:
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.embeddings import OpenAIEmbeddings, VertexAIEmbeddings
from langchain.vectorstores import Chroma

In [39]:
import os
from getpass import getpass

# Never hardcode the key in the notebook: a committed notebook would leak it, and
# assigning a placeholder unconditionally overwrites a key that is already exported.
# Only prompt (without echoing the input) when the environment has no key yet.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [40]:
# This function builds a RetrievalQA chain and passes the chunk size and chunk overlap parameters to the text splitter.
# Notice that we use the default OpenAI embedding model, which is generally a good practice; keep the chunk size
# reasonable, since each model has a context length limit.

# The part after the arrow (-> RetrievalQA) is the return type annotation. It could be omitted, but including it is good practice and helps with debugging.
def build_qa_chain(chunk_size: int = 1000, chunk_overlap: int = 50,
                   document_loaders=None,
                   model_name: str = "gpt-3.5-turbo-instruct") -> RetrievalQA:
    """Build a RetrievalQA chain over the documents produced by a list of loaders.

    Args:
        chunk_size: Chunk size passed to CharacterTextSplitter.from_tiktoken_encoder.
        chunk_overlap: Overlap between consecutive chunks, passed to the same splitter.
        document_loaders: Loaders whose documents are indexed. When None, the
            module-level `loaders` list is used (the original behaviour).
        model_name: OpenAI completion model used to generate the answer. The
            previous hard-coded "text-davinci-003" has been shut down by OpenAI.

    Returns:
        A RetrievalQA chain that takes its query under the "question" key and
        returns the answer together with the source documents.

    Raises:
        ValueError: If there are no loaders to index.
    """
    source_loaders = loaders if document_loaders is None else document_loaders
    if not source_loaders:
        # Fail early instead of silently building an index with no documents.
        raise ValueError(
            "No document loaders to index. Pass document_loaders=... or assign "
            "the result of pdf_loader() to the module-level `loaders` first."
        )

    embedding = OpenAIEmbeddings()
    splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size,
                                                           chunk_overlap=chunk_overlap)
    llm = OpenAI(model_name=model_name,
                 temperature=0.9,
                 max_tokens=256)

    # Build the vector store that holds the embeddings of the document chunks.
    # NOTE(review): the store is whatever VectorstoreIndexCreator uses by default
    # (presumably Chroma for the LangChain release this notebook targets) - confirm.
    index = VectorstoreIndexCreator(embedding=embedding,
                                    text_splitter=splitter).from_loaders(source_loaders)

    # The chain embeds the user's query, retrieves the most similar chunks from the
    # vector store, sends those chunks to the LLM and returns the LLM's completion.
    retriever = index.vectorstore.as_retriever(search_type="similarity",
                                               search_kwargs={"k": 4})
    return RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       return_source_documents=True,
                                       input_key="question")

In [41]:
# Build the QA chain with non-overlapping chunks; chunk_size keeps its default.
qa_chain = build_qa_chain(
    chunk_overlap=0,
)



## Understanding what each parameter is

- llm: defines the LLM model to use.
- retriever: defines which vector store to retrieve information from and by which policy. It has two additional parameters:
  - search_type: how to select the chunks from the vector store. Two common types are similarity and MMR. Similarity selects the chunks most similar to the query. MMR (maximal marginal relevance) also performs a similarity search, but it diversifies the selected chunks rather than returning a set of near-duplicate results.
  - search_kwargs.k: the number of chunks to select. In the code above, the retriever uses a similarity search to collect 4 candidates.
- chain_type: specifies how RetrievalQA passes the chunks to the LLM.
  - stuff inserts all the candidate chunks into a single prompt that is sent to the LLM.
  - map_reduce sends the chunks to the LLM in separate batches and produces the final answer from the answers to each batch.
  - refine splits the texts into batches, feeds the first batch to the LLM, then feeds the answer together with the second batch to the LLM, and so on. It refines the answer by going through all the batches.
  - map_rerank splits the texts into batches and feeds each batch to the LLM, which returns an answer along with a score of how fully it answers the question; the final answer is produced from the highest-scored answers.
- return_source_documents: whether to return the source documents in the result. Including them is helpful for understanding how the system works.
- input_key: the input to the chain is a dictionary (similar to a JSON object). The input_key specifies which key holds the query.

In [None]:
# `include_run_info` is an option of the chain call itself, not an input field, so it
# is passed as a keyword argument instead of being placed inside the inputs dict.
result = qa_chain({'question': 'What is pytorch?'}, include_run_info=True)
print('Q:', result['question'])
print('A:', result['result'])
print('Resources:', result['source_documents'])