# PDF QA Bot using OpenAI, FAISS, and LangChain

* The program extracts the text of the PDF and uses LangChain's text splitter to split it into overlapping chunks.
* These chunks are embedded using a sentence-transformers embedding model from Hugging Face.
* The vectors are then stored using FAISS.
* We then take an input question from the user.
* The program uses vector similarity search to find the chunks of the PDF that are most relevant to the user's question.
* These chunks are sent to the LLM (OpenAI's GPT-3) along with the user's question.
* The LLM then generates an appropriate answer!

In [None]:
!pip install -q streamlit PyPDF2 python-dotenv faiss-cpu langchain altair openai tiktoken sentence_transformers

In [None]:
# Components used in this notebook:
#   - Embeddings:   Hugging Face (sentence-transformers)
#   - LLM:          OpenAI
#   - Vector store: FAISS

In [None]:
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback

In [None]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# get OpenAI API key
# Optionally paste a key into _manual_key. Leave the placeholder untouched to
# use the key already provided by the .env file or the shell environment.
_KEY_PLACEHOLDER = "YOUR_OPENAI_KEY_HERE"
_manual_key = "YOUR_OPENAI_KEY_HERE"

# Only override the environment when a real key was pasted, so a key loaded
# by load_dotenv() is never clobbered by the placeholder text.
if _manual_key != _KEY_PLACEHOLDER:
  os.environ["OPENAI_API_KEY"] = _manual_key

# Report success only for a non-empty key that is not the placeholder itself.
name = os.environ.get("OPENAI_API_KEY", "")
if name and name != _KEY_PLACEHOLDER:
  print("OpenAI key has been entered!")
else:
  print("OpenAI key is missing: set OPENAI_API_KEY in .env or paste it above.")

In [None]:
# Path of the PDF to query. Nothing is uploaded: the file is read from this
# location, so replace the placeholder with a real path before running.
pdf_path = "./path/to/pdf"

In [None]:
# extract the text from the pdf
pdf_reader = PdfReader(pdf_path)
# Join the per-page text with a newline so the last line of one page is not
# fused with the first line of the next. The `or ""` guards pages that yield
# no text (e.g. scanned, image-only pages).
text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
# Configure the splitter: split on newlines, target a chunk size of 1000
# (measured with len) and keep an overlap of 200 between adjacent chunks.
splitter_settings = {
  "separator": "\n",
  "chunk_size": 1000,
  "chunk_overlap": 200,
  "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_settings)

In [None]:
# Split the extracted PDF text into chunks using the configured splitter.
chunks = text_splitter.split_text(text)
# Fail early with a clear message: with no chunks (e.g. a scanned, image-only
# PDF that produced no text) there is nothing to embed or search.
if not chunks:
  raise ValueError("No text could be extracted from the PDF; nothing to index.")

In [None]:
# define embedding function
# Embeddings model imported from Hugging Face (sentence-transformers),
# selected by its model name.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# selecting LLM
# Created with the library defaults (no model is named here).
# NOTE(review): the original note says the default is GPT-3 davinci; the
# actual default depends on the installed langchain version — confirm.
llm = OpenAI()

In [None]:
# Build the FAISS vector store from the text chunks, using embedding_function
# to turn each chunk into a vector.
knowledge_base = FAISS.from_texts(chunks, embedding_function)

In [None]:
# get user input
# Replace the placeholder with the question to ask about the PDF.
user_question = "YOUR_QUESTION_HERE"

In [None]:
# Retrieve the stored chunks most similar to the question. No result count is
# passed, so the library default applies (presumably a handful — confirm).
docs = knowledge_base.similarity_search(user_question)

In [None]:
# Build a "stuff" question-answering chain around the selected LLM.
chain = load_qa_chain(llm, chain_type="stuff")

# Inputs for the chain: the retrieved documents plus the user's question.
qa_inputs = {"input_documents": docs, "question": user_question}

# Run the chain inside the OpenAI callback and print what the callback
# recorded for this call.
with get_openai_callback() as cb:
  response = chain.run(**qa_inputs)
  print(cb)

# Show the generated answer.
print(response)