# DIRECTLY LOAD THE VECTOR STORE AND GET THE ANSWER

# Step 1: Environment Setup in Google Colab

In [None]:
!pip install pdfplumber pymupdf pytesseract python-docx
!apt install tesseract-ocr -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
!pip install faiss-cpu chromadb




In [None]:
!pip install transformers sentence-transformers



In [None]:
!pip install langchain



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import pandas as pd

In [None]:
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks and report how many were produced.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks, measured with ``len``.

    Returns:
        The list of chunks returned by ``RecursiveCharacterTextSplitter.split_text``.
    """
    # Separators are tried in this order: paragraph break, line break,
    # full stop, space, and finally the empty string.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", ".", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
    print(f"✅ Total Chunks Created: {len(pieces)}")
    return pieces


In [None]:
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def embed_chunks(chunks):
    """Encode ``chunks`` with the module-level ``embedding_model`` and report the result size.

    Args:
        chunks: The text chunks to embed.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``chunks`` (one vector per chunk).
    """
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)
    # With no chunks there is no first vector to measure; indexing [0] would
    # raise IndexError, so report a dimension of 0 instead.
    dimension = len(embeddings[0]) if len(embeddings) else 0
    print(f"✅ Embeddings Shape: {len(embeddings)} vectors of dimension {dimension}")
    return embeddings


In [None]:
# `extracted_text` is not created by any earlier cell of this notebook, so on a
# fresh kernel this cell would stop with a bare NameError. Fail fast with a
# message that says what has to run first.
if "extracted_text" not in globals():
    raise NameError(
        "`extracted_text` is not defined. Run the document text-extraction step "
        "before this cell."
    )

# Step 1: Chunk the extracted text
chunks = chunk_text(extracted_text)

# Step 2: Embed the chunks
embeddings = embed_chunks(chunks)

# Optional: Store chunks + embeddings in a DataFrame for inspection
# (.tolist() turns each embedding row into a plain Python list, one per chunk)
df_chunks = pd.DataFrame({
    'chunk': chunks,
    'embedding': embeddings.tolist()
})
df_chunks.head()


✅ Total Chunks Created: 150


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Embeddings Shape: 150 vectors of dimension 384


Unnamed: 0,chunk,embedding
0,5.1 Introduction\nThe name ‘Indus civilization...,"[0.16635042428970337, 0.06399969756603241, 0.1..."
1,"Geographically, however, this civilization (al...","[0.2988309860229492, 0.040255770087242126, 0.1..."
2,"Before looking at its various features, it is ...","[0.28283822536468506, 0.1901068389415741, 0.14..."
3,configurations and organizational devices that...,"[0.1922924518585205, 0.12131597101688385, -0.2..."
4,"on bangles, bronze implements etc.) and possib...","[0.24960875511169434, 0.09737467020750046, -0...."


In [None]:
# Number of rows in the inspection frame (one row per chunk).
df_chunks.shape[0]

150

In [None]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.25-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.65 (from langchain-community)
  Downloading langchain_core-0.3.65-py3-none-any.whl.metadata (5.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting langsmith

In [None]:
import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import os


# HEALTH AND DISEASE

In [None]:
# 🧠 Step 1: Imports
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# 📌 Step 2: Load Embedding Model
# NOTE(review): queries must be embedded with the same model that was used to build
# the saved index — confirm that the Health index was built with all-MiniLM-L6-v2.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 🗂️ Step 3: Load FAISS Vector Store
# Forward slashes resolve on Windows and on Linux/Colab alike; the previous
# backslash form is only a valid relative path on Windows.
# allow_dangerous_deserialization=True lets the loader unpickle the stored
# documents — only load index folders you created yourself.
docsearch = FAISS.load_local("../Embeddings/Health", embedding_model, allow_dangerous_deserialization=True)

# 🦅 Step 4: Load Falcon LLM
# NOTE: trust_remote_code=True allows code shipped in the model repository to run
# locally — keep it only for repositories you trust.
model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float32)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # First CUDA device when available, otherwise CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
    max_new_tokens=256,
    # The EOS token id is passed as the padding token id for generation.
    pad_token_id=tokenizer.eos_token_id,
    # Return only the newly generated continuation. Without this the pipeline
    # returns prompt + continuation, so the "answer" begins with the instruction
    # text and the whole retrieved context.
    return_full_text=False,
)

llm = HuggingFacePipeline(pipeline=generator)

# ✍️ Step 5: Custom Prompt for Focused Answer
# The persona matches the Health and Disease index queried by this cell (it
# previously said "historian", left over from another subject).
# {context} receives the retrieved chunks and {question} the user's query.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful health and disease assistant. Based on the context below, provide a clear and informative answer in 5 to 6 lines.

Context:
{context}

Question:
{question}

Helpful Answer:"""
)

# 🔗 Step 6: Create RAG Chain
# Wire the Health retriever and the custom prompt into a "stuff" RetrievalQA chain;
# source documents are not included in the chain output.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt_template},
)

# 🧑‍💻 Step 7: Take Query and Generate Refined Answer
query = input("📚 Ask a Health and Disease-related question: ")
# Chain.run() is deprecated in LangChain; invoke() is the supported entry point.
# RetrievalQA takes the question under the "query" key and returns the answer
# text under "result".
response = qa_chain.invoke({"query": query})["result"]

# 🧹 Optional Step 8: Remove Duplicates if Any
def deduplicate_sentences(text):
    """Remove repeated sentences from ``text``, keeping the first occurrence of each.

    The text is split on the literal ``'. '`` separator; fragments are stripped
    and empty fragments are dropped. Two fragments count as the same sentence
    when they match after trailing full stops are removed, so a repeat at the
    very end of the text (which still carries its final ``'.'``) is recognised.

    Args:
        text: The text to clean.

    Returns:
        The unique sentences in their original order, joined with ``'. '``. If
        ``text`` ended with a full stop, the result does too.
    """
    seen = set()
    unique = []
    for fragment in text.split('. '):
        fragment = fragment.strip()
        # Compare without the trailing full stop: only the last fragment keeps
        # one after the split, and it must still match earlier copies.
        key = fragment.rstrip('.').rstrip()
        if not fragment or key in seen:
            continue
        seen.add(key)
        unique.append(fragment)
    result = '. '.join(unique)
    # Dropping a trailing duplicate also drops the closing full stop; restore it.
    if result and text.rstrip().endswith('.') and not result.endswith('.'):
        result += '.'
    return result

# Drop repeated sentences from the generated answer before displaying it.
clean_response = deduplicate_sentences(text=response)

# ✅ Final Output
# print() joins the heading and the answer with a single space.
print("\n🧠 Final Refined Answer:\n", clean_response, sep=" ")

Device set to use cuda:0


📚 Ask a Health and Disease-related question: what is python

🧠 Final Refined Answer:
 You are a helpful historian assistant. Based on the context below, provide a clear and informative answer in 5 to 6 lines.

Context:
Abusers also bribe (use chocolates and toys)
21.1.2 Sexual Abuse
to lure children and take advantage of the
Sexual harassment is a form of power child’s innocence.
and dominance of one person over another,
Sexually abused children show symptoms
which can result in harmful consequence to
of genital injury, abdominal pain, frequent
urinary infection and behavioural problems.
More to Know
21.1.4 Approaches for Protection
The Ministry of Women and Child
of an Abused Child

Psychotherapy: Individual and group
counselling is given by psychologists and
counsellors. The treatment includes efforts to
21.3.3 Behavioural Changes of
reduce the addict’s stress, taught new ways to
Drug Users
solve everyday’s problems, adequate diet, rest
Adverse effects of drug use among
and relaxatio

# PHYSICS

In [None]:
# 🧠 Step 1: Imports
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# 📌 Step 2: Load Embedding Model
# NOTE(review): queries must be embedded with the same model that was used to build
# the saved index — confirm that the physics index was built with all-MiniLM-L6-v2.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 🗂️ Step 3: Load FAISS Vector Store
# Forward slashes resolve on Windows and on Linux/Colab alike; the previous
# backslash form is only a valid relative path on Windows.
# allow_dangerous_deserialization=True lets the loader unpickle the stored
# documents — only load index folders you created yourself.
docsearch = FAISS.load_local("../Embeddings/physics", embedding_model, allow_dangerous_deserialization=True)

# 🦅 Step 4: Load Falcon LLM
# NOTE: trust_remote_code=True allows code shipped in the model repository to run
# locally — keep it only for repositories you trust.
model_id = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float32)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # First CUDA device when available, otherwise CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
    max_new_tokens=256,
    # The EOS token id is passed as the padding token id for generation.
    pad_token_id=tokenizer.eos_token_id,
    # Return only the newly generated continuation. Without this the pipeline
    # returns prompt + continuation, so the "answer" begins with the instruction
    # text and the whole retrieved context.
    return_full_text=False,
)

llm = HuggingFacePipeline(pipeline=generator)

# ✍️ Step 5: Custom Prompt for Focused Answer
# The persona matches the physics index queried by this cell (it previously
# said "historian", left over from another subject).
# {context} receives the retrieved chunks and {question} the user's query.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful physics assistant. Based on the context below, provide a clear and informative answer in 5 to 6 lines.

Context:
{context}

Question:
{question}

Helpful Answer:"""
)

# 🔗 Step 6: Create RAG Chain
# Wire the physics retriever and the custom prompt into a "stuff" RetrievalQA chain;
# source documents are not included in the chain output.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt_template},
)

# 🧑‍💻 Step 7: Take Query and Generate Refined Answer
query = input("📚 Ask a Physics-related question: ")
# Chain.run() is deprecated in LangChain; invoke() is the supported entry point.
# RetrievalQA takes the question under the "query" key and returns the answer
# text under "result".
response = qa_chain.invoke({"query": query})["result"]

# 🧹 Optional Step 8: Remove Duplicates if Any
def deduplicate_sentences(text):
    """Remove repeated sentences from ``text``, keeping the first occurrence of each.

    The text is split on the literal ``'. '`` separator; fragments are stripped
    and empty fragments are dropped. Two fragments count as the same sentence
    when they match after trailing full stops are removed, so a repeat at the
    very end of the text (which still carries its final ``'.'``) is recognised.

    Args:
        text: The text to clean.

    Returns:
        The unique sentences in their original order, joined with ``'. '``. If
        ``text`` ended with a full stop, the result does too.
    """
    seen = set()
    unique = []
    for fragment in text.split('. '):
        fragment = fragment.strip()
        # Compare without the trailing full stop: only the last fragment keeps
        # one after the split, and it must still match earlier copies.
        key = fragment.rstrip('.').rstrip()
        if not fragment or key in seen:
            continue
        seen.add(key)
        unique.append(fragment)
    result = '. '.join(unique)
    # Dropping a trailing duplicate also drops the closing full stop; restore it.
    if result and text.rstrip().endswith('.') and not result.endswith('.'):
        result += '.'
    return result

# Drop repeated sentences from the generated answer before displaying it.
clean_response = deduplicate_sentences(text=response)

# ✅ Final Output
# print() joins the heading and the answer with a single space.
print("\n🧠 Final Refined Answer:\n", clean_response, sep=" ")

Device set to use cuda:0


📚 Ask a Physics-related question: si unit

🧠 Final Refined Answer:
 You are a helpful historian assistant. Based on the context below, provide a clear and informative answer in 5 to 6 lines.

Context:
entities are there are atoms in 0.012 kg of carbon-12.
Rules and conventions for writing SI units and their symbols:
1. The units named after scientist are not written with a capital initial letter.
For example: newton, henry, watt
2. The symbols of the units named after scientist should be written by a capital letter.
For example: N for newton, H for henry, W for watt
3. Small letters are used as symbols for units not derived from a proper nme
For example: m for metre, kg for kilogram

m s-2 and not as kgms-2.
9. Only accepted symbols should be used.
For example: ampere is represented as A and not as amp. or am ; second is
represented as s and not as sec.
10. Numerical value of any physical quantity should be expressed in scientific notation.
For an example density of mercury is 1.36x104