# LangChain Retrieval QA With Multiple Files and File Types

In [4]:
import os
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, TextLoader, DirectoryLoader
import yaml

In [3]:
# Load API keys from a YAML file kept one directory above the notebook so the
# secrets are not hard-coded here. The file is expected to provide
# spec['openai']['key'] and spec['serpapi']['key'], as read below.
# The path is built with os.path.join rather than a literal "..\" so the cell
# also runs on macOS/Linux, where a backslash is not a path separator.
with open(os.path.join(os.pardir, "openai.yaml")) as f:
    spec = yaml.safe_load(f)
    key = spec['openai']['key']
    serp_key = spec['serpapi']['key']

# OpenAIEmbeddings() and OpenAI() are constructed later without an explicit
# key, so they rely on the key being exported through the environment.
os.environ['OPENAI_API_KEY'] = key
os.environ['SERPAPI_API_KEY'] = serp_key

In [5]:
# !wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip ./docs/
# !unzip -q new_articles.zip new_articles 

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [59]:
# Load every file matching "*.pdf" under ./docs/pdfs, parsing each with PyPDFLoader.
loader = DirectoryLoader(
    "./docs/pdfs",
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
pdfs = loader.load()
# Number of Document objects loaded (each carries a 'page' number in its metadata).
print(len(pdfs))

432


In [8]:
# Inspect what the loader returned: the element type first, then the container type.
print(type(pdfs[0]), type(pdfs), sep="\n")

<class 'langchain.schema.Document'>
<class 'list'>


In [9]:
# Peek at the first loaded Document to see its raw extracted text.
print(pdfs[0])

page_content='Attention Is All You Need\nAshish Vaswani\x03\nGoogle Brain\navaswani@google.comNoam Shazeer\x03\nGoogle Brain\nnoam@google.comNiki Parmar\x03\nGoogle Research\nnikip@google.comJakob Uszkoreit\x03\nGoogle Research\nusz@google.com\nLlion Jones\x03\nGoogle Research\nllion@google.comAidan N. Gomez\x03y\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser\x03\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin\x03z\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable

In [11]:
# Load the plain-text articles the same way, using TextLoader for "*.txt" files.
txt_loader = DirectoryLoader(
    "./docs/new_articles",
    glob="*.txt",
    loader_cls=TextLoader,
)
txt_docs = txt_loader.load()
# Number of text Documents loaded.
print(len(txt_docs))

21


In [12]:
# Merge both document lists into one new list: PDF documents first, then text articles.
all_docs = [*pdfs, *txt_docs]

In [16]:
# Sanity check: should equal len(pdfs) + len(txt_docs).
print(len(all_docs))

453


In [13]:
# Break the documents into smaller, overlapping chunks so each piece can be
# embedded and retrieved on its own; the overlap carries context across
# chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(all_docs)

In [14]:
# Number of chunks produced by the splitter.
len(texts)

1295

## Create the Database

In [25]:
# Directory where Chroma writes its on-disk index
persist_dir = './db'

# Embed each chunk with OpenAI embeddings and store the vectors in Chroma
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_dir,
)

In [26]:
# Persist the db to disk
vectordb.persist()
# Drop the in-memory handle; the following cell rebuilds it from persist_dir,
# which shows that the index can be reloaded from disk.
vectordb = None

In [27]:
# Reopen the persisted store from disk, passing the same embedding function
# that was used when the index was built.
vectordb = Chroma(persist_directory=persist_dir, embedding_function=embedding)

## Make a retrieval QA model

In [29]:
# Wrap the vector store as a retriever and try it on a sample question.
# NOTE(review): `retreiever` is a misspelling of "retriever"; it is kept
# because later cells reference this exact name.
retreiever = vectordb.as_retriever()
docs = retreiever.get_relevant_documents("How much money did Pando raise?")
# Number of chunks the retriever returned for the query.
print(len(docs))

4


In [30]:
# Show which search strategy the retriever is configured with.
retreiever.search_type

'similarity'

## Make a chain

In [32]:
# Build the question-answering chain on top of the retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.0),
    # presumably "stuff" places all retrieved chunks into a single prompt —
    # confirm against the LangChain documentation
    chain_type="stuff",
    retriever=retreiever,
    # required so the response carries 'source_documents', which
    # process_llm_responses reads when citing sources
    return_source_documents=True
)

In [51]:
## Cite Sources
def process_llm_responses(llm_response):
    """Print a chain's answer followed by the sources it cites.

    Args:
        llm_response: Mapping returned by the QA chain. Must contain
            ``'result'`` (the answer text). ``'source_documents'`` is
            optional; each entry is expected to expose a ``metadata`` dict.

    Prints:
        The answer, a ``Sources:`` header, then one ``<source> - <page>``
        line per source document. ``page`` is printed as ``None`` when a
        document has no page metadata.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    print(llm_response['result'])
    print("\n\nSources:")
    # A chain built without return_source_documents=True yields no
    # 'source_documents' entry; treat that as "nothing to cite" instead of
    # failing with KeyError after the answer has already been printed.
    for source in llm_response.get('source_documents') or []:
        source_path = source.metadata.get('source')
        page = source.metadata.get('page')
        print(f"{source_path} - {page}")

In [52]:
# Ask a question covered by the text articles and print the answer with its sources
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_responses(llm_response)

 Pando raised $30 million in a Series B round.


Sources:
docs\new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt - None
docs\new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt - None
docs\new_articles\05-07-3one4-capital-driven-by-contrarian-bets-raises-200-million-new-fund.txt - None
docs\new_articles\05-07-fintech-space-continues-to-be-competitive-and-drama-filled.txt - None


In [53]:
# Ask a Spark question (cluster sizing)
query = "How do I calculate the number of cores on my spark cluster?"
# The response is indexed by 'result' and 'source_documents' in process_llm_responses
llm_response = qa_chain(query)
process_llm_responses(llm_response)

 You can calculate the number of cores on your Spark cluster by looking at the number of executors and the number of cores per executor.


Sources:
docs\pdfs\LearningSpark2.0.pdf - 37
docs\pdfs\LearningSpark2.0.pdf - 285
docs\pdfs\LearningSpark2.0.pdf - 204
docs\pdfs\LearningSpark2.0.pdf - 203


In [54]:
# Inspect the metadata of the first source document in the most recent response
llm_response["source_documents"][0].metadata

{'source': 'docs\\pdfs\\LearningSpark2.0.pdf', 'page': 37}

In [58]:
# Ask another Spark question (combining data sources)
query = "Whats the best way to combine two data sources together in Spark?"
llm_response = qa_chain(query)
process_llm_responses(llm_response)

 The best way to combine two data sources together in Spark is to use the DataFrame-based APIs and domain-specific language (DSL) queries. This allows developers to treat the data as a structured table and issue queries against it as they would a static table.


Sources:
docs\pdfs\LearningSpark2.0.pdf - 38
docs\pdfs\LearningSpark2.0.pdf - 39
docs\pdfs\LearningSpark2.0.pdf - 31
docs\pdfs\LearningSpark2.0.pdf - 290


In [55]:
# Ask a question about preparing for the Murph workout
query = "How many pushups, situps, pull-ups, and squats should I do everyday to prepare for The Murph?"
llm_response = qa_chain(query)
process_llm_responses(llm_response)

 There is no set number of pushups, situps, pull-ups, and squats you should do everyday to prepare for The Murph. It is best to follow the specific workout program outlined in the Murph training plan.


Sources:
docs\pdfs\Murph-training.pdf - 14
docs\pdfs\Murph-training.pdf - 15
docs\pdfs\Murph-training.pdf - 4
docs\pdfs\Murph-training.pdf - 6


In [56]:
# Ask a question about the Transformer architecture
query = "How does the Transformer model work compared to other deep learning models?"
llm_response = qa_chain(query)
process_llm_responses(llm_response)

 The Transformer model is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. It outperforms other models in quality while being more parallelizable and requiring significantly less training cost.


Sources:
docs\pdfs\attention_is_all_you_need.pdf - 1
docs\pdfs\attention_is_all_you_need.pdf - 0
docs\pdfs\attention_is_all_you_need.pdf - 7
docs\pdfs\attention_is_all_you_need.pdf - 2


In [57]:
# Ask a broad, open-ended question about ChatGPT
query = "Tell me everything about ChatGPT"
llm_response = qa_chain(query)
process_llm_responses(llm_response)



ChatGPT is a general-purpose chatbot that uses artificial intelligence to generate text after a user enters a prompt, developed by tech startup OpenAI. The chatbot uses GPT-4, a large language model that uses deep learning to produce human-like text. It was released on November 30, 2022 and has been regularly updated with new GPT models, the most recent being GPT-4. It can be used for a range of tasks, including programming, TV scripts, scientific concepts, email replies, listicles, blog ideas, summarization, debugging code, complex problem solving, and more. It has an API that was released on March 1, 2023, and users can save their chats.


Sources:
docs\new_articles\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt - None
docs\new_articles\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt - None
docs\new_articles\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt - None
docs\new_articles\05-03-chatgpt-everyth