In [1]:
%%capture
!pip install langchain_google_genai langchain langchain-community
!pip install langchain chromadb
!pip install langchain_huggingface faiss-cpu
!pip install langchain-text-splitters
!pip install pypdf

In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
from langchain_core.messages import SystemMessage,HumanMessage,AIMessage
from google.colab import userdata
import os
from langchain_community.document_loaders import PyPDFLoader
# from langchain-text-splitters import
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings,HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS


In [3]:
# =============== 1. Load PDF =================
pdf_path = "/content/fine_tune_data.pdf"   # change this
loader = PyPDFLoader(pdf_path)
documents = loader.load()


In [4]:
# =============== 2. Split into chunks =========
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

docs = text_splitter.split_documents(documents)


In [5]:
docs[1]

Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2024-06-30T14:05:50+00:00', 'author': 'Dinesh NC', 'moddate': '2024-06-30T14:05:50+00:00', 'source': '/content/fine_tune_data.pdf', 'total_pages': 17, 'page': 0, 'page_label': '1'}, page_content='women during pregnancy, childbirth, and the postnatal period. \n \n3. Protection from Violence and Exploitation: \no Provision: No woman shall be subjected to physical, mental, sexual, \npsychological, or other forms of violence or exploitation on any grounds, \nincluding religion, social, cultural tradition, practice, or any other grounds. Such \nacts are punishable by law, and the victim has the right to obtain compensation. \no Explanation: This provision protects women from all forms of violence and \nexploitation and ensures that perpetrators are punished by law. It also provides \nvictims the right to seek compensation for the harm they have suffered. \n \n4. Right to Participate in Sta

In [6]:
len(docs)

49

In [8]:
# =============== 3. Free Embeddings ===========
# using a free, fast model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [9]:
# =============== 4. Create / Store Vector DB ===
vector_db = FAISS.from_documents(docs, embeddings)

# Optional: save to disk
# vector_store.save_local('faiss_index', 'my_document_index')
vector_db.save_local("faiss_index")

print("Vector database created and saved!")

Vector database created and saved!


In [10]:
print("Total vectors:", vector_db.index.ntotal)


Total vectors: 49


In [11]:
# Load DB again
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

query = "What is the document about?"
results = new_db.similarity_search(query, k=3)

for r in results:
    print(r.page_content)


women during pregnancy, childbirth, and the postnatal period. 
 
3. Protection from Violence and Exploitation: 
o Provision: No woman shall be subjected to physical, mental, sexual, 
psychological, or other forms of violence or exploitation on any grounds, 
including religion, social, cultural tradition, practice, or any other grounds. Such 
acts are punishable by law, and the victim has the right to obtain compensation. 
o Explanation: This provision protects women from all forms of violence and 
exploitation and ensures that perpetrators are punished by law. It also provides 
victims the right to seek compensation for the harm they have suffered. 
 
4. Right to Participate in State Bodies: 
o Provision: Women have the right to participate in all bodies of the State based on 
the principle of proportional inclusion. 
o Explanation: This ensures that women have the opportunity to be represented in 
various state bodies and decision-making processes, promoting gender equality in
treatme

# **load new pdf & create embedding and merge with first vector**

In [12]:
# --------- 1. Load existing FAISS index ----------
db = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True
)
, index_name='faiss_index'

In [13]:
# --------- 2. Load and split second PDF ----------
pdf_path_2 = "/content/constitution_nowater.pdf"  # change this
loader2 = PyPDFLoader(pdf_path_2)
docs2 = loader2.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
docs2_chunks = text_splitter.split_documents(docs2)


In [14]:
# (Optional but useful) add metadata to know which PDF it came from
for d in docs2_chunks:
    d.metadata["source"] = pdf_path_2


In [15]:
# --------- 3. Add new docs to same vector store ----------
db.add_documents(docs2_chunks)


['86e08e48-e777-4f68-86f1-67d79f1d33ca',
 '2eb358f0-89f8-4e5e-bf36-3c1383153f29',
 '8c5bc73b-9f66-47c8-a7d5-4d96b1e1bd2b',
 '6b214769-ffe2-442f-b757-32235ce7f24f',
 '1e59e1b5-4153-4552-92a9-f2829caf7b7a',
 '16ffb949-bca4-4004-a536-26d334b817bd',
 'a24471a2-696a-4353-923e-41caf7dd98d1',
 '7e1ca6ce-d27d-4d4c-9841-f10fedda1d3a',
 'a2cdada2-c317-441b-8b54-8c684b79a40c',
 '7f85bd39-1fd7-4014-a673-402b1375b072',
 '8df650b1-9808-4ac8-93ea-8de31d6f19e0',
 '6b879196-a141-42ec-8a00-91a7c9e8cab5',
 'a0ab1ad0-729a-4220-8890-b8a546fdf438',
 'd931b685-234e-48b3-ba27-cbe88bc7284b',
 '373fda02-9def-44e0-8feb-d73e35c3ac7d',
 '9d625cb7-96de-4bbf-803a-4600417998e8',
 '18e5f5af-6ab1-4278-9268-e7b95a07709b',
 'd571bc9f-7a88-4167-a1cc-44701e20495b',
 '2828f363-2dd5-4246-bdb6-6b6bab0462cb',
 '06cce9a7-97ea-4837-a804-377ce3815460',
 'e22bb5bf-4c92-4a6c-b679-fd3c6d1a6f74',
 'ff3af263-4fd7-4867-b29f-f6e300c0b5c5',
 '79992177-c01b-4f76-980f-6818e9427dde',
 'ecdd2ca4-c6bc-4dca-9b72-0481bd5495c8',
 'df21617e-c389-

In [16]:
print("Total vectors:", db.index.ntotal)


Total vectors: 523


In [17]:
# --------- 4. Save updated index ----------
db.save_local("faiss_index")

print("Second PDF added to the same FAISS index!")

Second PDF added to the same FAISS index!
