In [1]:
from langchain_community.document_loaders import PyPDFLoader

# Read defence.pdf with PyPDFLoader.
loader = PyPDFLoader("defence.pdf")
data = loader.load()  # list of Document objects, one per PDF page (58 here), not a single Document for the whole file
# data  # uncomment to inspect the loaded pages

In [2]:
# Number of Document objects returned by the loader
len(data)

58

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded Documents into overlapping chunks: a chunk_size of 1000 and
# a chunk_overlap of 200 (the splitter measures length in characters by default).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
docs = text_splitter.split_documents(data)

# `docs` holds the chunks produced by the splitter, so this is the chunk count.
chunk_total = len(docs)
print("Total number of documents: ", chunk_total)

Total number of documents:  108


In [17]:
# Inspect one chunk (metadata + page_content) to check what the splitter produced
docs[10]

Document(metadata={'producer': 'Acrobat Distiller 22.0 (Windows)', 'creator': 'Adobe InDesign CC 2017 (Windows)', 'creationdate': '2022-07-13T15:53:59+05:30', 'author': 'admin', 'moddate': '2022-07-13T15:54:50+05:30', 'title': 'A91974ddz_pfxgde_b9o.tmp.pdf', 'source': 'defence.pdf', 'total_pages': 58, 'page': 8, 'page_label': '9'}, page_content='to explain this content as basis for their teaching. Use of references is highly \n  \nrecommended.\n (2) To make learning effective the teachers must encourage students to participate \nactively. Discuss the given activities in the class and ensure that children also\n  present them in a written form.  \n (3) At least once a week, ask children to discuss security issues that appear in the \nnewspapers and magazines - Encourage them to express their individual\n  opinions on such security issues. \n (4) Organize field visits considering the situation. It helps children to develop\n  their leadership qualities, co-operative spirit, communication

In [18]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
# Load variables from the local .env file (the API key is expected there, see the note below)
load_dotenv()


# Get an API key:
# Head to https://ai.google.dev/gemini-api/docs/api-key to generate a Google AI API key. Paste it in the .env file

# Embedding models: https://python.langchain.com/v0.1/docs/integrations/text_embedding/

# Sanity check: embed one short string and look at the resulting vector
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("hello, world!")
print(len(vector))  # embedding dimensionality (768 in the recorded output)
vector[:5]  # inspect the first few dimensions of the embedding
# vector  # uncomment to display the full vector

768


[0.05168594419956207,
 -0.030764883384108543,
 -0.03062233328819275,
 -0.02802734263241291,
 0.01813093200325966]

In [21]:
# Build a Chroma vector store from the chunks:
# 1. take the list of Document chunks (`docs`),
# 2. embed each chunk with the Gemini embedding model,
# 3. keep both the chunks and their vectors for later retrieval.
# The `embeddings` client created earlier in the notebook is reused rather than
# constructing a second one, so the embedding model is defined in one place.
# Its vectors are 768-dimensional (the earlier `len(vector)` check printed 768).
# NOTE(review): no collection name or persist directory is given; re-running this
# cell in the same kernel may add the same chunks to the store again - confirm,
# and restart the kernel (or use a fresh collection) before re-indexing if so.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
)


In [22]:
# Expose the vector store as a retriever: a similarity search that hands back
# the 10 best-matching chunks (k=10) for each query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=10))

# Try the retriever on a sample question. The query is embedded and compared
# against the stored chunk embeddings; up to 10 Documents come back.
retrieved_docs = retriever.invoke(
    "Tell me about relationship of science, technology and engineering"
)


In [23]:
# Number of chunks returned for the sample question (k was set to 10)
len(retrieved_docs)

10

In [24]:
#retrieved_docs

In [25]:
# Show the text of the second retrieved chunk
print(retrieved_docs[1].page_content)

by using already-existing tools and knowledge. This new-found knowledge may then be used 
by engineers to manufacture new tools and machines such as semiconductors, computers, 
and other forms of advanced technology. In this sense, scientists and engineers may both 
be considered as technologists. Therefore the three fields i.e. Science, Technology and 
Engineering are often considered as one for the purposes of research and development. 
Science is the systematic study of the structure and behaviour of the physical and 
natural world. Technology is the application of practical sciences for Industry or Commerce. 
Technology refers to methods, systems, and devices which are the result of scientific 
knowledge being used for practical purposes. A modern example is the rise of Information 
Technology (IT) which is the combined application of Computer Science and Electronics. 
Engineering is the application of mathematics, as well as scientific, economic, social, and


In [26]:
# Import the chat-based LLM wrapper for Google Gemini
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model that writes the final answer.
llm = ChatGoogleGenerativeAI(
    max_tokens=500,            # cap each response at 500 tokens
    temperature=0.3,           # low randomness (lower = more deterministic, higher = more creative)
    model="gemini-2.0-flash",  # Gemini 2.0 Flash chat model
)


In [27]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model, one sentence per entry.
# The trailing `{context}` is the template placeholder that receives the
# retrieved chunks when the chain runs.
_instruction_sentences = [
    "You are an assistant for question-answering tasks.",
    "Use the following pieces of retrieved context to answer the question.",
    "If you don't know the answer, say that you don't know.",
    "Use five sentences maximum and keep the answer concise.",
]
system_prompt = " ".join(_instruction_sentences) + "\n\n" + "{context}"

# Two-message chat template: the system instructions (with their `{context}`
# placeholder) followed by the user's question in `{input}`.
chat_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [28]:
# Answering step: a "stuff" chain, i.e. all retrieved documents are placed into
# the prompt together and sent to the LLM in a single call.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG (Retrieval-Augmented Generation) pipeline:
# - the retriever fetches the chunks relevant to the query,
# - the answering step above turns those chunks plus the question into an answer.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)


In [30]:
# Run the whole pipeline on the sample question and show only the generated answer.
question = "Tell me about relationship of science, technology and engineering"
response = rag_chain.invoke({"input": question})
print(response["answer"])

Technology is often developed from the basic knowledge of science combined with engineering. Science might study the flow of electrons in electrical conductors by using already-existing tools and knowledge. This new-found knowledge may then be used by engineers to manufacture new tools and machines such as semiconductors and computers. Science, technology, and engineering are often considered as one for research and development purposes. Engineering is the application of mathematics, as well as scientific, economic, social, and practical knowledge.
