# A Generic LLM project that can act as the base for future

In [7]:
%pip install -r ../requirements.txt

Collecting pypdf (from -r ../requirements.txt (line 10))
  Downloading pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-5.0.1-py3-none-any.whl (294 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import openai
import langchain
import pinecone 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

  from tqdm.autonotebook import tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os
## Lets Read the document
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents
doc=read_doc('../Data')
len(doc)

58

In [4]:
## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return doc

In [5]:
documents=chunk_data(docs=doc)
len(documents)

141

_using gemini inplace of openai_

In [6]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI

In [7]:
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
embeddings

GoogleGenerativeAIEmbeddings(client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x0000014320C3F730>, model='models/embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, client_options=None, transport=None, request_options=None)

In [8]:
vectors = embeddings.embed_query('How are you?')
print(len(vectors))

768


In [9]:
## Vector search db in pinecone

from pinecone import Pinecone, ServerlessSpec

if not os.getenv("PINECONE_API_KEY"):
    print("GIVE API KEY in .env")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [10]:
index = pc.Index('langchain-vector')

In [11]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [13]:
vector_store.add_documents(doc)

['6116fac6-a005-4c49-b82c-66c56142d5f0',
 '3fa032cd-6f9f-4762-891a-94edd4c27602',
 'e1b7e1cd-6e97-4164-b136-4810fd2c26d3',
 '956eb0f9-e65c-4861-899a-94829f704230',
 '9a00d993-9755-44c9-b60c-9086337b5aec',
 'e881ab8f-af89-4693-8f33-ece9cf59bc86',
 'eac07bfb-1e60-4901-bac5-d6a008652011',
 '8d434e58-6337-4515-b42c-0e715e6ab070',
 '33cdc248-51c3-4952-820f-12ce47d32cb7',
 '11074c3b-292f-4c9c-9876-67802bc2f655',
 '24898bc5-64ac-438b-a9cc-925809f19c26',
 '6aa14621-878b-4b16-8bb6-3c57e0e3d129',
 '2931a03c-5963-4976-bb4d-a1a4592d10f4',
 '0c999718-1093-41b8-81f0-fcd7249d2c67',
 '5a83c2bd-f09a-4872-8042-a499d6110a77',
 'd520c3ab-1c02-4da8-8e83-66a9eb399fe5',
 '87b85e9d-b891-48c3-bc4d-fe8c1c5bb96e',
 '66a8289e-5d1e-45ec-b6de-c203336c2ca1',
 'b7914b61-a2fe-442a-97ee-0c2b9472d663',
 'f4ba1c35-e81d-4361-9dab-a3e936c032a2',
 '23aac91e-03b7-46f1-8f8c-4a7b51f985fc',
 '404c8019-a877-455a-b3f2-f208d3d213ae',
 '5ef323e4-4024-4d2d-a3fc-27fcc13d4c0d',
 '0b7511b3-2d32-4fea-9326-cb3cacfb364d',
 '1072d022-c017-

In [14]:
# cosine similarity retrieve results
def retrieve_query(query,k=2):
    matching_results=vector_store.similarity_search(query,k=k)
    return matching_results

In [29]:
## Search answers from VectorDB
def retrieve_docs(query):
    doc_search=retrieve_query(query)
    return doc_search

In [30]:
our_query = "How much the agriculture target will be increased by how many crore?"
docs = retrieve_docs(our_query)
print(docs)

[Document(id='24898bc5-64ac-438b-a9cc-925809f19c26', metadata={'page': 10.0, 'source': '..\\Data\\budget_speech.pdf'}, page_content="7 \n \n \n farmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.   \n22. Now to make India a global hub for ' Shree Anna' , the Indian Institute \nof Millet Research, Hyderabad  will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to further enable activities of \nfishermen, fish vendors, and micro & small enterprises, improve value chain \nefficiencies, and expand the market. \nCooperation \n25. For farmers, especially small and marginal farmers, and other \nmarginalise

In [38]:
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
def get_llm_response(query):
    prompt_template = """
    Answer the question from the given context in a very deteailed manner
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """
    
    llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.5)
    prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])
    chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
    retrieve_documents = retrieve_docs(query)
    # response = chain.run(input_documents = retrieve_documents, question = query)
    response = chain(
        {"input_documents":docs, "question": query}
        , return_only_outputs=True)
    return response

In [39]:
our_query = "How much the agriculture target will be increased by how many crore?"
llm_response = get_llm_response(our_query)
print(llm_response)

{'output_text': 'The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.'}
