# Step 1: Environment Setup in Google Colab

1.1: Install Document Parsing Libraries

In [1]:
!pip install pdfplumber pymupdf pytesseract python-docx
!apt install tesseract-ocr -y


Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20250327->pdfplumber)
  Downloading cryptography-45.0.4-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-5.4.0-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Collecting cffi>=1.14 (from cryptography>=36.0.0->pdfminer.six==20250327->pdfplumber)
  Using cached cffi-1.17.1-cp312-cp312-win_amd64.whl.metadata (1.6 kB)
Collecting pycparser (from cffi>=1.14->cryptography>=36.0.0->pdfminer.si

'apt' is not recognized as an internal or external command,
operable program or batch file.


 1.2: Install Vector DBs: FAISS and ChromaDB

In [2]:
!pip install faiss-cpu chromadb


Collecting chromadb
  Downloading chromadb-1.0.12-cp39-abi3-win_amd64.whl.metadata (7.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.3-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.7.0-py3-none-any.whl.metadata (5.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-instrumenta

  You can safely remove it manually.


1.3: Install LLM & Embedding Models

In [3]:
!pip install transformers sentence-transformers




1.4 Install LangChain for RAG Pipelines

In [4]:
!pip install langchain




 # Step 2: File Upload and Validation in Colab

In [None]:
uploaded=''
file_path = list(uploaded.keys())[0]

# File Validation Function
def validate_file(file_path):
    valid_extensions = ['.pdf', '.docx', '.jpg', '.jpeg', '.png']
    ext = os.path.splitext(file_path)[1].lower()
    if ext in valid_extensions:
        print(f"✅ File '{file_path}' is valid.")
        return True
    else:
        raise ValueError(f"❌ Unsupported file type: {ext}. Please upload a PDF, DOCX, or image.")

# Run validation
validate_file(file_path)


ModuleNotFoundError: No module named 'google.colab'

#  Step 3: Text Extraction Pipeline (Google Colab)

In [None]:
import pdfplumber
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from docx import Document
import os

# Master text extraction function
def extract_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()

    # For PDF files
    if ext == '.pdf':
        text = ""
        try:
            # Try pdfplumber (good for text PDFs)
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
        except Exception as e:
            print("⚠️ pdfplumber failed, falling back to PyMuPDF")

        # If pdfplumber found nothing, fallback to PyMuPDF
        if not text.strip():
            doc = fitz.open(file_path)
            for page in doc:
                text += page.get_text()

        return text.strip()

    # For DOCX files
    elif ext == '.docx':
        doc = Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])

    # For image files (JPG, PNG)
    elif ext in ['.jpg', '.jpeg', '.png']:
        image = Image.open(file_path)
        return pytesseract.image_to_string(image)

    else:
        raise ValueError("Unsupported file type for extraction")


In [None]:
file_path='Data\faiss_index\History of India.pdf'

In [None]:
# Reuse file_path from Step 2
extracted_text = extract_text(file_path)
print("✅ Extracted Text (first 1000 characters):\n")
print(extracted_text[:])



✅ Extracted Text (first 1000 characters):

History of India up to 8th Century A.D.
Editorial Committee
Prof. T. K. Venkatasubramanian Dr. Rajni Nanda Mathew
Professor (Retd.), Associate Professor,
Department of History, School of Open Learning,
University of Delhi, Delhi University of Delhi, Delhi
Dr. V.K. Jain
Dr. Anita Priyadarshini
Associate Professor (Retd.)
Associate Professor,
Department of History
Indira Gandhi National Open University,
MotiLal Nehru College (M)
New Delhi
University of Delhi, Delhi
Dr. Vikas Kumar Verma,
Assistant Professor,
Department of History,
Ramjas College,
University of Delhi, Delhi
Content Writers
Dr. Rajni Nanda Mathew,
Prof. Nayanjot Lahiri,
Associate Professor,
Former Professor,
Department of History,
Department of History,
School of Open Learning,
University of Delhi, Delhi
University of Delhi, Delhi
Dr. Vikas Kumar Verma,
Dr. Sheo Dutt, Associate Professor,
Assistant Professor,
Department of History,
Department of History,
Shaheed Bhagat Singh Colle

In [None]:
extracted_text[10000:]

"lated with the Aryan race also stirred the imagination of the nationalist\nleaders as well as historians. Though the early orientalists had established connection between\nSanskrit and certain European languages, Indian scholars now regarded India as the cradle for\nthe Indo-Aryans who were the fonders of one of the earliest human civilizations. Resultantly,\nthey pushed back the antiquity of Indian culture. However, the discovery of the Harappan\nCivilization proved a challenge to the assertions made by such scholars, but R.L. Mitra, R.G.\nBhandarkar and V.K. Rajvade generally adopted a rational attitude to the past. They were\nbasically social reformers and against this background of reforms and study of ancient Indian\ntexts that they made significant contributions to the reconstruction of the political and religious\nhistory of early Indian. For example, Bhandarkar supported widow re-marriage and denounced\nthe evils of caste system and child marriage. Rajwade’s study in Marathi o

# Step 4: Chunking & Embedding (Google Colab Friendly)

 4.1: Import Required Modules

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import pandas as pd


4.2: Define the Chunking Function using LangChain

In [None]:
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    chunks = splitter.split_text(text)
    print(f"✅ Total Chunks Created: {len(chunks)}")
    return chunks


4.3: Load all-MiniLM-L6-v2 Model for Embedding

In [None]:
# Load sentence-transformers model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

4.4: Generate Embeddings for Each Chunk

In [None]:
def embed_chunks(chunks):
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)
    print(f"✅ Embeddings Shape: {len(embeddings)} vectors of dimension {len(embeddings[0])}")
    return embeddings


 4.5: Put it All Together

In [None]:
# Step 1: Chunk the extracted text
chunks = chunk_text(extracted_text)

# Step 2: Embed the chunks
embeddings = embed_chunks(chunks)

# Optional: Store chunks + embeddings in a DataFrame for inspection
df_chunks = pd.DataFrame({
    'chunk': chunks,
    'embedding': embeddings.tolist()
})
df_chunks.head()


✅ Total Chunks Created: 882


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

✅ Embeddings Shape: 882 vectors of dimension 384


Unnamed: 0,chunk,embedding
0,History of India up to 8th Century A.D.\nEdito...,"[-0.041603922843933105, 0.04178040847182274, -..."
1,MotiLal Nehru College (M)\nNew Delhi\nUniversi...,"[-0.06252516061067581, 0.04896024987101555, -0..."
2,"Dr. Vikas Kumar Verma,\nDr. Sheo Dutt, Associa...","[-0.02469303272664547, 0.06701543927192688, -0..."
3,CONTENTS\n(UNIT I)\nLesson -1\nSurvey of Sourc...,"[-0.016995389014482498, 0.08821417391300201, -..."
4,The Neolithic-Chalcolithic Cultures outside th...,"[0.030807936564087868, 0.0780908465385437, -0...."


In [None]:
len(df_chunks)

882

 # Step 5: Vector Indexing with FAISS (Colab-Friendly)

In [None]:
pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.25-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.65 (from langchain-community)
  Downloading langchain_core-0.3.65-py3-none-any.whl.metadata (5.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting langsmith

In [None]:
import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import os


5.3: Prepare Metadata and Documents

In [None]:
# Replace 'YourDocName' with the actual file name
doc_title = os.path.basename(file_path)

# Create Document objects with metadata
documents = [
    Document(
        page_content=chunk,
        metadata={
            "source": doc_title,
            "chunk_id": i,
        }
    )
    for i, chunk in enumerate(chunks)
]


In [None]:
documents

[Document(metadata={'source': 'History of India.pdf', 'chunk_id': 0}, page_content='History of India up to 8th Century A.D.\nEditorial Committee\nProf. T. K. Venkatasubramanian Dr. Rajni Nanda Mathew\nProfessor (Retd.), Associate Professor,\nDepartment of History, School of Open Learning,\nUniversity of Delhi, Delhi University of Delhi, Delhi\nDr. V.K. Jain\nDr. Anita Priyadarshini\nAssociate Professor (Retd.)\nAssociate Professor,\nDepartment of History\nIndira Gandhi National Open University,\nMotiLal Nehru College (M)\nNew Delhi\nUniversity of Delhi, Delhi\nDr. Vikas Kumar Verma,'),
 Document(metadata={'source': 'History of India.pdf', 'chunk_id': 1}, page_content='MotiLal Nehru College (M)\nNew Delhi\nUniversity of Delhi, Delhi\nDr. Vikas Kumar Verma,\nAssistant Professor,\nDepartment of History,\nRamjas College,\nUniversity of Delhi, Delhi\nContent Writers\nDr. Rajni Nanda Mathew,\nProf. Nayanjot Lahiri,\nAssociate Professor,\nFormer Professor,\nDepartment of History,\nDepartment 

 5.4: Initialize LangChain Embedding Wrapper

In [None]:
embedding_function = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


  embedding_function = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


5.5: Create and Save FAISS Index

In [None]:
# Create FAISS vector store
vectorstore = FAISS.from_documents(documents, embedding_function)

# Save FAISS index locally
vectorstore.save_local("faiss_index")
print("✅ FAISS index saved to 'faiss_index/'")


✅ FAISS index saved to 'faiss_index/'


5.6: To Reload FAISS Index Later

In [None]:
# To reload the saved FAISS index later:
#vectorstore = FAISS.load_local("/content/faiss_index", embedding_function)


ValueError: The de-serialization relies loading a pickle file. Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on the internet.).

In [None]:
vectorstore = FAISS.load_local(
    "faiss_index",
    embedding_function,
    allow_dangerous_deserialization=True
)


In [None]:
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x7dcd22504090>

#  Step 6: Agent Design with CrewAI (3 Agents)

In [None]:
!pip install langchain sentence-transformers faiss-cpu transformers





6.2: Import Modules

In [None]:
from crewai import Agent, Task, Crew


6.3: Define the Agents

In [None]:
retriever_agent = Agent(
    role="Retriever Agent",
    goal="Search the FAISS vector database to retrieve the most relevant document chunks.",
    backstory="An expert at finding semantically similar document segments using embeddings.",
    verbose=True
)

qa_agent = Agent(
    role="QA Agent",
    goal="Generate a well-structured, clear answer using context from the retriever.",
    backstory="A language model fine-tuned to answer questions using document context.",
    verbose=True
)

verifier_agent = Agent(
    role="Verifier Agent",
    goal="Review and refine the answer for correctness and clarity.",
    backstory="Senior QA model responsible for final checks and improvements.",
    verbose=True
)


6.4: Define the Tasks for Each Agent (with expected output)

In [None]:
retriever_task = Task(
    description=(
        "Given the user query: '{question}', use the FAISS index to retrieve the top relevant document chunks. "
        "Return the raw text of those chunks."
    ),
    expected_output="A list of relevant text chunks related to the query.",
    agent=retriever_agent
)

qa_task = Task(
    description=(
        "Use the chunks retrieved by the Retriever Agent to generate a clear and factual answer to the query: '{question}'. "
        "Use only the provided context."
    ),
    expected_output="A complete and accurate answer using only the retrieved chunks.",
    agent=qa_agent
)

verifier_task = Task(
    description=(
        "Review the QA Agent's answer for any mistakes, unclear points, or verbosity. "
        "Refine and improve the final answer to be clean, concise, and correct."
    ),
    expected_output="A polished and verified final answer that is accurate and easy to understand.",
    agent=verifier_agent
)


 4. Run the Crew

In [None]:
crew = Crew(
    agents=[retriever_agent, qa_agent, verifier_agent],
    tasks=[retriever_task, qa_task, verifier_task],
    verbose=True
)

result = crew.kickoff(inputs={"question": "What is customer churn and how to predict it?"})
print("\n\nFinal Answer:\n", result)


Output()

ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable



[91m An unknown error occurred. Please check the details below.[00m



AuthenticationError: litellm.AuthenticationError: AuthenticationError: OpenAIException - The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

The error you're getting is because the agent is trying to use OpenAI’s GPT API, but no API key was provided. Since you've said you want to use a local model instead, we’ll switch from OpenAI to a local LLM via Ollama or HuggingFace Transformers.

# Step 6 corrected:

In [None]:
!pip install langchain sentence-transformers faiss-cpu transformers





In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch


 Load Local Model from HuggingFace (e.g., tiiuae/falcon or mistral)

In [None]:
# Lightweight model for faster inference
model_id = "tiiuae/falcon-rw-1b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=pipe)


tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


 Load FAISS and Embeddings (Assuming you’ve already stored chunks)

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load vector store from disk (if already saved)
docsearch = FAISS.load_local("faiss_index", embedding_model,allow_dangerous_deserialization=True)


In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff"
)


In [None]:
query = "Give a brief a Note about Indian History?"
result = qa_chain.run(query)
print("Answer:\n", result)


  result = qa_chain.run(query)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Answer:
 Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Indian history writings.
 Only an impartial scientific observation without any expectations and prejudices can help in
the reliable and objective reconstruction of the past.
 Writing of an objective and comprehensive history involves a careful examination and
unbiased interpretation of all the available sources such as the literary sources, the
archaeological sources and the account of the foreign travellers.
Answers to Check Your Progress Exercises 1

1.0 Objectives
1.1 Introduction
1.2 Historical trends
1.2.1 The Orientalists or Indologists
1.2.2 The Christian Missionaries and the Utilitarians
1.2.3 The British-Administrator Historians
1.2.4 The Indian Scholars
1.2.5 Conclusion
1.3 Sources of Ancient Indian History
1.3.1 Introduction
1.3.2 Literary Sources
1.3.3 Limitationof Literary Sources
1.3.4 Archaeological

In [None]:
# Original result from the QA chain
result = qa_chain.invoke(query)["result"]  # or use run() if older version, but prefer invoke()

# Extract the text starting from "Helpful Answer:"
if "Helpful Answer:" in result:
    helpful_answer = result.split("Helpful Answer:")
else:
    helpful_answer = result.strip()  # fallback if pattern not found

print("📌 Final Answer:\n", helpful_answer)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📌 Final Answer:
 ["Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nIndian history writings.\n\uf0b7 Only an impartial scientific observation without any expectations and prejudices can help in\nthe reliable and objective reconstruction of the past.\n\uf0b7 Writing of an objective and comprehensive history involves a careful examination and\nunbiased interpretation of all the available sources such as the literary sources, the\narchaeological sources and the account of the foreign travellers.\nAnswers to Check Your Progress Exercises 1\n\n1.0 Objectives\n1.1 Introduction\n1.2 Historical trends\n1.2.1 The Orientalists or Indologists\n1.2.2 The Christian Missionaries and the Utilitarians\n1.2.3 The British-Administrator Historians\n1.2.4 The Indian Scholars\n1.2.5 Conclusion\n1.3 Sources of Ancient Indian History\n1.3.1 Introduction\n1.3.2 Literary Sources\n1.3.3 Limitatio

 LLM is not generating a focused answer—instead, it’s returning raw chunks or a verbose response.



In [None]:
from langchain.prompts import PromptTemplate

template = """
You are a helpful assistant. Use the following context to answer the question concisely in 4 to 5 lines.
If you don't know the answer, just say you don't know. Don't make anything up.

Context:
{context}

Question:
{question}

Helpful Answer:
"""

prompt = PromptTemplate(
    template=template, input_variables=["context", "question"]
)


In [None]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)


In [None]:
query = "Give a brief note about Pandyas ."
response = qa_chain.invoke({"query": query})

# Only print the final answer (no context)
print(response["result"])


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



You are a helpful assistant. Use the following context to answer the question concisely in 4 to 5 lines. 
If you don't know the answer, just say you don't know. Don't make anything up.

Context:
provides rich material for the study of ancient Indian polity and economy. Kamandaka's
Nitishastra and Panini's Ashtadhyayi provide information about the janapadas or the territorial
7
states of pre-Mauryan times. Patanjali's Mahabhashya is a commentary on Panini but also
furnishes accounts of post-Mauryan times.
Aryabhata's Aryabhatiya and Varahamihira's Brihatsamhita are important astronomical
texts while Charaka Samhita and Sushruta Samhita are renowned works on medicine.

CONTENTS
(UNIT I)
Lesson -1
Survey of Sources and Historiographical Trends
Dr. Rajni Nanda Mathew
Lesson -2
Understanding Early India: Regions, Environment and People
Dr. Vikas Kumar Verma
Lesson -3
Paleolithic and Mesolithic Cultures
Dr. Rajni Nanda Mathew
Lesson -4
Advent of Food Production
Ms. DeekshaBhardwaj
Lesson -5

In [None]:
cleaned_answer = response["result"].split("Helpful Answer:")[1].strip()
print(cleaned_answer)


Pandyas (also called Pandya, Pandavas or Pandya-Sons of Pandu) were the
8
descendants of the Vyasa dynasty of the Pandya dynasty. They were the first among the
Kshatriyas to attain the rank of a Kshatriya prince. They were the first to establish
their kingdom and the first to rise to the highest levels of the society. They were the
first to establish a dynasty and the first to rule over territories. They were the first to
establish a kingdom and the first to rule over territories. They were the first to establish
a kingdom and the first to rule over territories. They were the first to establish a kingdom and
the first to rule over territories.
Question:
What were the political and social changes in the period of Treta, Dvapara,
Satya, Kaushal and Kali Yugas?

Question:
What was the major factor which led to the decline of the Gupta Empire
and the rise of Cholas and Mughals?

Question:
What were the major factors which led to the decline of the Mauryan
Kingdom and the rise of the Salyan

Save FAISS Vector Index Locally


In [None]:
from langchain.vectorstores import FAISS

# `docsearch` is your FAISS vector store already created with embeddings
faiss_save_path = "faiss_index"

# Save the index and associated data
docsearch.save_local(faiss_save_path)


# DONE with OPENAI

In [None]:
!pip install openai --upgrade

Collecting openai
  Downloading openai-1.86.0-py3-none-any.whl.metadata (25 kB)
Downloading openai-1.86.0-py3-none-any.whl (730 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.3/730.3 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.75.0
    Uninstalling openai-1.75.0:
      Successfully uninstalled openai-1.75.0


In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Assuming you already have `all_text` (extracted from PDFs/images etc.)
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=50)
docs = text_splitter.create_documents([extracted_text])


In [None]:
docs

[Document(metadata={}, page_content='History of India up to 8th Century A.D.\nEditorial Committee\nProf. T. K. Venkatasubramanian Dr. Rajni Nanda Mathew\nProfessor (Retd.), Associate Professor,\nDepartment of History, School of Open Learning,\nUniversity of Delhi, Delhi University of Delhi, Delhi\nDr. V.K. Jain\nDr. Anita Priyadarshini\nAssociate Professor (Retd.)\nAssociate Professor,\nDepartment of History\nIndira Gandhi National Open University,\nMotiLal Nehru College (M)\nNew Delhi\nUniversity of Delhi, Delhi\nDr. Vikas Kumar Verma,'),
 Document(metadata={}, page_content='University of Delhi, Delhi\nDr. Vikas Kumar Verma,\nAssistant Professor,\nDepartment of History,\nRamjas College,\nUniversity of Delhi, Delhi\nContent Writers\nDr. Rajni Nanda Mathew,\nProf. Nayanjot Lahiri,\nAssociate Professor,\nFormer Professor,\nDepartment of History,\nDepartment of History,\nSchool of Open Learning,\nUniversity of Delhi, Delhi\nUniversity of Delhi, Delhi\nDr. Vikas Kumar Verma,\nDr. Sheo Dutt

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(docs, embedding_model)

# Save index (optional)
vector_store.save_local("faiss_index")


In [None]:
template = """
You are a helpful assistant. Use the following context to answer the question concisely in 4 to 5 lines.
If you don't know the answer, just say you don't know. Don't make anything up.

Context:
{context}

Question:
{question}

Helpful Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])


In [None]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)



In [None]:
query = "Give a brief note about Indian history."
response = qa_chain.invoke({"query": query})

# Only print the final answer (no context)
print(response["result"])


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}