In [None]:
!pip -q install langchain langchain-community langchain-text-splitters
!pip -q install faiss-cpu sentence-transformers pypdf

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m119.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, numpy as np
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer, util
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.schema import Document

In [None]:
# Absolute path of the PDF that the rest of the notebook indexes.
PDF_PATH = "/content/MOSDAC.pdf"
# Fail fast with a readable message when the file is missing. The original
# `assert <cond>,` had a trailing comma and no message, which is a SyntaxError.
assert os.path.exists(PDF_PATH), f"PDF not found at {PDF_PATH!r}; upload it or update PDF_PATH."

In [None]:
# Read the PDF at PDF_PATH into a list of Document objects.
# `pages` is consumed by the splitting cell further down.
loader = PyPDFLoader(PDF_PATH)
pages: List[Document] = loader.load()
# Sanity check: report how many Documents the loader produced.
print("Pages:", len(pages))

Pages: 95


In [None]:
# Cut the loaded pages into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters, the library default; confirm).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)
# `docs` holds the chunked Documents that get indexed in the vector store.
docs = splitter.split_documents(pages)
print("Chunks:", len(docs))

Chunks: 397


In [None]:
# Shared sentence-embedding model: used to embed chunks/queries for the index
# (through STEmbeddings) and to rank candidate sentences in pick_best_sentence.
st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
class STEmbeddings(Embeddings):
    """Adapter that exposes a sentence-embedding model through ``Embeddings``.

    The wrapped object only needs an ``encode`` method that accepts a list of
    strings plus the ``batch_size`` / ``normalize_embeddings`` keywords and
    returns something with ``.tolist()``.
    """

    def __init__(self, mdl):
        # Keep a reference to the underlying encoder; no copy is made.
        self.mdl = mdl

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` in batches of 64 and return normalized vectors as nested lists."""
        vectors = self.mdl.encode(texts, batch_size=64, normalize_embeddings=True)
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string and return its normalized vector as a flat list."""
        # The encoder is fed a one-element batch, so unwrap the only row.
        rows = self.mdl.encode([text], normalize_embeddings=True).tolist()
        return rows[0]


# Embedding adapter instance handed to the vector store.
emb = STEmbeddings(st_model)

In [None]:
# Embed every chunk with `emb` and build the FAISS vector store.
# `vs` is the global store queried by retrieve_chunks.
vs = FAISS.from_documents(docs, emb)
# Persist the index under ./mosdac_faiss (path is relative to the working directory).
vs.save_local("mosdac_faiss")
print("✅ FAISS saved at mosdac_faiss")

✅ FAISS saved at mosdac_faiss


In [None]:
import re
def split_sentences(text: str) -> List[str]:
    """Break ``text`` into sentence-like pieces.

    The input is stripped and then split at every whitespace run that directly
    follows a ``.``, ``!`` or ``?``. Each piece is stripped again, and pieces
    of 15 characters or fewer are discarded as too short/noisy.

    Returns:
        The surviving pieces, in their original order.
    """
    kept: List[str] = []
    for piece in re.split(r'(?<=[.!?])\s+', text.strip()):
        cleaned = piece.strip()
        if len(cleaned) > 15:
            kept.append(cleaned)
    return kept

def retrieve_chunks(query: str, k: int = 6) -> List[Document]:
    """Return the ``k`` chunks that the FAISS store ``vs`` ranks as most similar to ``query``."""
    hits = vs.similarity_search(query, k=k)
    return hits

def pick_best_sentence(query: str, contexts: List[str]) -> str:
    """Return the entry of ``contexts`` that is most similar to ``query``.

    Both the query and the candidates are encoded with ``st_model`` using
    normalized embeddings, scored by cosine similarity, and the candidate at
    the ``np.argmax`` position is returned.
    """
    query_vec = st_model.encode([query], normalize_embeddings=True)
    candidate_vecs = st_model.encode(contexts, normalize_embeddings=True)
    # cos_sim yields a 1 x len(contexts) matrix; take its only row as a NumPy array.
    similarity_row = util.cos_sim(query_vec, candidate_vecs).cpu().numpy()[0]
    return contexts[int(np.argmax(similarity_row))]

def rag_answer(query: str, k: int = 6) -> Dict[str, Any]:
    """Answer ``query`` with the single best-matching sentence from the index.

    Retrieves the top ``k`` chunks, splits each into sentences, and returns
    the sentence that ``pick_best_sentence`` scores highest against the query.

    Args:
        query: Question to answer.
        k: Number of chunks to retrieve.

    Returns:
        A dict with the same three keys on every path:
          - ``"answer"``: the chosen sentence, or a fallback message when no
            candidate sentence was produced.
          - ``"citations"``: a list with one ``{"chunk_rank", "page"}`` dict
            for the chosen sentence (empty on the fallback path).
            ``chunk_rank`` is the 1-based retrieval rank; ``page`` is whatever
            the chunk metadata stores under ``"page"`` (``None`` if absent).
          - ``"retrieved"``: number of chunks retrieved.
    """
    retrieved = retrieve_chunks(query, k=k)

    # One (sentence, chunk rank, page) triple per candidate sentence.
    candidates = [
        (sent, rank, doc.metadata.get("page"))
        for rank, doc in enumerate(retrieved, 1)
        for sent in split_sentences(doc.page_content)
    ]
    if not candidates:
        # Same schema as the success path (previously "retrieved" was missing here).
        return {
            "answer": "I couldn't find this in the PDF.",
            "citations": [],
            "retrieved": len(retrieved),
        }

    best_sent = pick_best_sentence(query, [sent for sent, _, _ in candidates])

    # Cite the first (highest-ranked) chunk whose sentence text matches.
    citation = next(
        (
            {"chunk_rank": rank, "page": page}
            for sent, rank, page in candidates
            if sent == best_sent
        ),
        None,
    )
    citations = [citation] if citation is not None else []
    return {"answer": best_sent, "citations": citations, "retrieved": len(retrieved)}

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Embedding model used to score predicted answers against reference answers.
# NOTE(review): this is the same checkpoint name that `st_model` already loads,
# so the model is instantiated a second time; confirm a separate instance is intended.
sim_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Evaluation set: each entry pairs a question ("q") with the reference answer
# ("reference") that the pipeline's prediction is compared against.
EVAL_SET = [
    {
        "q": "What is INSAT-3DR?",
        "reference": "INSAT-3DR is an advanced meteorological satellite that incorporates an imager, a sounder, a data relay transponder, and a satellite aided search and rescue payload."
    },
    {
        "q": "What are the payloads on INSAT-3DR?",
        "reference": "The payloads on INSAT-3DR are the Imager, the Sounder, the Data Relay Transponder (DRT), and the Satellite Aided Search and Rescue (SAS&R) payloads."
    },
    {
        "q": "What are the objectives of SARAL-AltiKa",
        "reference": "SARAL/AltiKa main scientific objective is to provide data products to oceanographic research user community in studies leading to improve our knowledge of the ocean meso-scale variability"
    },
    {
        "q": "Explain Megha Tropiques",
        "reference": "Megha-Tropiques is an Indo-French Joint Satellite Mission for studying the water cycle and energy exchanges in the tropics."
    },
    {
        "q": "What is Kalpana-1?",
        "reference": "Kalpana-1 is the first dedicated meteorological satellite launched by Indian Space Research Organisation using Polar Satellite Launch Vehicle on 2002-09-12."
    },
    {
        "q": "Give INSAT-3DS Introduction",
        "reference": "INSAT-3DS is a dedicated meteorological spacecraft designed for enhanced meteorological observation and monitoring of land and ocean surfaces of weather forecasting and disaster warning."
    }
]

# --- Semantic similarity scoring ---
def semantic_score(ref, pred):
    """Return the cosine similarity between ``ref`` and ``pred`` as a Python number.

    Both texts are encoded with ``sim_model`` (as tensors), reference first.
    """
    ref_vec, pred_vec = (
        sim_model.encode(text, convert_to_tensor=True) for text in (ref, pred)
    )
    return util.cos_sim(ref_vec, pred_vec).item()

# Run every evaluation question through the RAG pipeline and score the
# returned sentence against its reference answer.
scores = []
for item in EVAL_SET:
    q, ref = item["q"], item["reference"]
    pred = rag_answer(q)["answer"]   # <-- bot prediction
    score = semantic_score(ref, pred)
    scores.append(score)
    # Per-question report: question, reference, prediction, similarity.
    print(f"\nQ: {q}")
    print(f"Reference: {ref}")
    print(f"Predicted: {pred}")
    print(f"Semantic Similarity: {score:.2f}")

# Unweighted mean of the per-question similarity scores.
avg_score = np.mean(scores)
print("Final Semantic Evaluation:")
print(f"Average Similarity Score: {avg_score:.2f}")


Q: What is INSAT-3DR?
Reference: INSAT-3DR is an advanced meteorological satellite that incorporates an imager, a sounder, a data relay transponder, and a satellite aided search and rescue payload.
Predicted: INSAT -3DR is a multipurpose geosynchronous spacecraft with main 
meteorological payloads (imager and sounder).
Semantic Similarity: 0.93

Q: What are the payloads on INSAT-3DR?
Reference: The payloads on INSAT-3DR are the Imager, the Sounder, the Data Relay Transponder (DRT), and the Satellite Aided Search and Rescue (SAS&R) payloads.
Predicted: INSAT-3D Payloads 
The satellite has 3 payloads: 
● Meteorological (MET) - IMAGER and SOUNDER 
● Data Relay Transponder (DRT) 
● Satellite Aided Search and Rescue (SAS&R) 
Meteorological Payload 
The INSAT-3D spacecraft incorporates advanced Imager and Sounder instruments.
Semantic Similarity: 0.89

Q: What are the objectives of SARAL-AltiKa
Reference: SARAL/AltiKa main scientific objective is to provide data products to oceanographic re