In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials for the OpenAI and
# Hugging Face clients used later in the notebook — confirm against your .env.
load_dotenv()

True

Step 1: Document Ingestion

In [8]:
!pip install -qU langchain-community pymupdf

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader

# Source PDF, given relative to the notebook's working directory.
file_path = "./Advertiseement_CEO & CTO for website.pdf"

# Build the loader for that file, then read it into `docs`.
loader = PyMuPDFLoader(file_path)
docs = loader.load()

# The message treats each loaded entry as one PDF page.
print(f'no. of pages in PDF = {len(docs)}')

# Uncomment to inspect the first loaded document:
# print(docs[0])


Text Splitting

In [14]:
# Chunking configuration: chunk size 300 with an overlap of 30 between
# neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=30, chunk_size=300)

In [16]:
# Split the loaded documents into chunks using the splitter configured earlier.
chunks = splitter.split_documents(docs)

print(type(chunks))
print(len(chunks))

# Fail with an actionable message instead of a bare IndexError when nothing was
# produced (e.g. the PDF has no extractable text layer).
if not chunks:
    raise ValueError(
        'No chunks were produced from the PDF; check that it contains extractable text.'
    )

# Preview the text of the first chunk as a sanity check.
print(chunks[0].page_content)

<class 'list'>
40
Agriculture Department, 
Government of Maharashtra
Hutatma Rajguru Chowk, Madam Cama Road, 
Mantralaya, Mumbai 400032
NOTICE: INVITING APPLICATIONS FROM 
INDIVIDUAL PROFESSIONALS
Agriculture Department invites online applications 
for Artificial Intelligence and Agritech Innovation


Embedding Generation and Storage in a Vector Store

In [17]:
# OpenAI embedding model used to vectorise the chunks.
embedding_model = OpenAIEmbeddings(model='text-embedding-3-small')

# Build the FAISS vector store from the chunks with that embedding model.
vector_store = FAISS.from_documents(chunks, embedding_model)

Retrieval

In [18]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = vector_store.as_retriever(
    search_kwargs={'k': 3},
    search_type='similarity',
)

In [19]:
# Show the retriever's repr as the cell output to confirm its configuration.
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x72b07e8bc950>, search_kwargs={'k': 3})

Augmentation

In [24]:
# Prompt with two placeholders: {context} (retrieved text) and {question}.
# The template text, including its leading whitespace, is sent as written.
prompt = PromptTemplate(
    input_variables=['context','question'],
    template='''
            You are a helpful assistant. Answer ONLY from the provided context.
            If the context is insufficient, just say you don't know.
            {context}
            Question: {question}
            ''',
)

Generation

In [37]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

# Hugging Face endpoint for the GLM-4.5 repo, set up for the text-generation task.
model = HuggingFaceEndpoint(
    task='text-generation',
    repo_id='zai-org/GLM-4.5',
)

# Chat wrapper around the endpoint; `llm` is what the chain calls.
llm = ChatHuggingFace(llm=model)

In [38]:
# Output parser used as the last stage of `chain`.
# NOTE(review): presumably reduces the chat model's output to a plain string — confirm.
parser = StrOutputParser()

In [39]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda

In [40]:
def format_docs(retrieved_docs):
    """Merge retrieved documents into a single context string.

    Reads the ``page_content`` attribute of every item in ``retrieved_docs``
    and joins those texts, in order, with a blank line between them.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The concatenated text; an empty string when there are no documents.
    """
    page_texts = [document.page_content for document in retrieved_docs]
    separator = '\n\n'
    return separator.join(page_texts)

In [41]:
# Build the prompt inputs from a single question string:
#   question -> passed through unchanged
#   context  -> retrieved documents formatted into one text block
parallel_chain = RunnableParallel(
    question=RunnablePassthrough(),
    context=retriever | RunnableLambda(format_docs),
)

In [42]:
# Full pipeline: build prompt inputs -> fill the prompt -> call the LLM -> parse the output.
chain = (
    parallel_chain
    | prompt
    | llm
    | parser
)

In [45]:
# Ask a question about the PDF; the value returned by invoke() is shown as the cell output.
question = 'For which company the role is open'
chain.invoke(question)

'\nBased on the provided context, the roles are open for the **AI and Agritech Innovation Center**. This Center operates under the **MahaAgri-AI Policy 2025–2029**.\n\nKey details confirming this:\n1.  The context explicitly states the positions are for the "Center under MahaAgri-AI Policy 2025–2029".\n2.  The "Reporting To" line is "Managing Director, **AI and Agritech Innovation Center**".\n3.  The "Supervises" line lists technical leads within the "**AI and Agritech Innovation Center** Stack".\n4.  No specific private company name is mentioned. The "Center" is the entity hiring.'