In [43]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv
load_dotenv()
import os

In [44]:
## read document
def read_doc(directory):
    """Load documents from ``directory`` with ``PyPDFDirectoryLoader``.

    Args:
        directory: Path passed straight to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call.
    """
    return PyPDFDirectoryLoader(directory).load()

## divide docs into chunks
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    """Split documents into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split (passed to ``split_documents``).
        chunk_size: ``chunk_size`` given to the splitter.
        chunk_overlap: ``chunk_overlap`` given to the splitter.

    Returns:
        The chunks produced by ``text_splitter.split_documents(docs)``.
    """
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    # Return the splitter's output. The previous version stored it in a local
    # and then returned the untouched input, so no chunking ever took effect.
    return text_splitter.split_documents(docs)

In [45]:
# Load the PDFs under documents/; the cell's output is the number of loaded documents.
doc = read_doc(directory='documents/')
len(doc)

104

In [46]:
# Chunk the loaded documents using chunk_data's default size/overlap;
# the cell's output is the number of items returned.
documents = chunk_data(doc)
len(documents)

104

In [47]:
# Embedding client; the API key is read from the environment (see load_dotenv above).
embeddings = OpenAIEmbeddings(
    api_key=os.environ['OPENAI_API_KEY'],
)

# Embed a sample query and display the embedding length as a sanity check.
vectors = embeddings.embed_query("What's up")
len(vectors)

1536

In [48]:
from pinecone import Pinecone

# Name of the Pinecone index used as the vector search DB.
index_name = "langchainvector"

# Pinecone client; the API key is read from the environment.
pinecone_client = Pinecone(
    environment="gcp-starter",
    api_key=os.environ['PINECONE_API_KEY'],
)

In [49]:
from langchain.vectorstores import Pinecone as PineconeDB

# Index the chunked `documents` (the output of chunk_data), not the raw `doc`
# list: the chunked result was otherwise computed and never used.
# NOTE(review): re-running this cell presumably inserts the same vectors again
# (the query output shows the same hit twice) — confirm, and consider clearing
# the index or loading the existing one before re-running.
index = PineconeDB.from_documents(documents, embeddings, index_name=index_name)

In [50]:
## Cosine Similarity Retrieve Results
def retrieve_query(query, k=2):
    """Run a similarity search for ``query`` against the module-level ``index``.

    Args:
        query: Query passed to ``index.similarity_search``.
        k: Forwarded as the ``k`` keyword of ``similarity_search`` (default 2).

    Returns:
        Whatever ``index.similarity_search`` returns.
    """
    return index.similarity_search(query, k=k)

In [51]:
from langchain.chat_models import ChatOpenAI

# Chat model that answers the questions.
llm = ChatOpenAI(
    temperature=0.5,
    model_name="gpt-4-0125-preview",
)

# Question-answering chain built on that model, using the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [52]:
## Search answers from VectorDB
def retrieve_answers(query):
    """Answer ``query`` from the documents that ``retrieve_query`` finds.

    The retrieved documents are printed, then passed with the question to the
    module-level ``chain``.

    Args:
        query: Question used both for retrieval and as the chain's ``question``.

    Returns:
        The value returned by ``chain.run``.
    """
    matches = retrieve_query(query)
    print(matches)
    return chain.run(input_documents=matches, question=query)

In [53]:
# Example question run through the retrieval + QA pipeline.
our_query = "What should I do on day 1 week 1 of the full body program?"
answer = retrieve_answers(query=our_query)
print(answer)

[Document(page_content='FULL BODY PROGRAM\nPROGRAM\nWEEK\n1', metadata={'page': 38.0, 'source': 'documents/Fundamentals_Hypertrophy_Program.pdf'}), Document(page_content='FULL BODY PROGRAM\nPROGRAM\nWEEK\n1', metadata={'page': 38.0, 'source': 'documents/Fundamentals_Hypertrophy_Program.pdf'})]
I don't know.
