In [None]:
!pip -q install langchain langchain-community sentence-transformers faiss-cpu pypdf tqdm spacy networkx transformers accelerate

In [None]:
import sys, subprocess, pkgutil
import importlib.util

# Download the small English spaCy model only when it is not importable yet.
# `pkgutil.find_loader` is deprecated; `importlib.util.find_spec` is the
# supported way to ask "is this top-level package installed?" (it returns None when not).
if importlib.util.find_spec("en_core_web_sm") is None:
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    # The package was installed after the interpreter started, so drop the
    # import system's cached directory listings before anything tries to load it.
    importlib.invalidate_caches()

  if not pkgutil.find_loader("en_core_web_sm"):


In [None]:
import os
import re
import json
import networkx as nx
from tqdm import tqdm
import spacy
from typing import List, Tuple, Dict, Any

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from transformers import pipeline

In [None]:
# Locations probed for the input PDF, in priority order.
PDF_CANDIDATES = [
    "/content/MOSDAC.pdf",   # Colab upload directory
    "/mnt/data/MOSDAC.pdf",  # alternative absolute location
    "MOSDAC.pdf",            # relative to the current working directory
]

# The first candidate that exists wins; None means nothing was found.
pdf_path = next(
    (candidate for candidate in PDF_CANDIDATES if os.path.exists(candidate)),
    None,
)

if pdf_path is None:
    raise FileNotFoundError(
        "MOSDAC.pdf not found. Upload it to Colab (/content) or place it beside this notebook."
    )

print(f"Using PDF: {pdf_path}")

Using PDF: /content/MOSDAC.pdf


In [None]:
def load_pdf(pdf_path: str) -> List[Document]:
    """Load the PDF at `pdf_path` with PyPDFLoader and return the resulting Documents.

    The loader yields one Document per page.
    """
    return PyPDFLoader(pdf_path).load()


documents = load_pdf(pdf_path)
print(f"✅ Loaded {len(documents)} pages from PDF")

✅ Loaded 95 pages from PDF


In [None]:
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 800        # a bit larger than usual, for technical docs
CHUNK_OVERLAP = 120     # overlap between neighbouring chunks to preserve context
CHUNK_SEPARATORS = ["\n\n", "\n", ". ", " "]

text_splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Split the per-page Documents into the chunks that get embedded and parsed later on.
chunks = text_splitter.split_documents(documents)
print(f"✅ Chunked into {len(chunks)} pieces.")

✅ Chunked into 253 pieces.


In [None]:
# Embedding model handed to the FAISS vector store (both when building and when loading it).
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import hashlib  # stdlib; used only for the cache fingerprint below

INDEX_DIR = "/content/mosdac_faiss_index"
# Sidecar file recording which chunk set the saved index was built from.
INDEX_FINGERPRINT_PATH = os.path.join(INDEX_DIR, "chunks.sha256")


def _chunks_fingerprint(docs) -> str:
    """Return a SHA-256 hex digest over the text of every chunk, in order."""
    digest = hashlib.sha256()
    for doc in docs:
        digest.update(doc.page_content.encode("utf-8", errors="replace"))
        digest.update(b"\x00")  # separator, so chunk boundaries influence the hash
    return digest.hexdigest()


def _read_fingerprint(path: str):
    """Return the stored fingerprint, or None when the file is missing or unreadable."""
    try:
        with open(path, "r", encoding="utf-8") as fh:
            return fh.read().strip()
    except OSError:
        return None


current_fingerprint = _chunks_fingerprint(chunks)

# Reuse the on-disk index only if it was built from exactly these chunks.
# Checking only that the directory exists would silently serve a stale index
# after the PDF or the chunking parameters change.
if os.path.exists(INDEX_DIR) and _read_fingerprint(INDEX_FINGERPRINT_PATH) == current_fingerprint:
    # NOTE(security): allow_dangerous_deserialization=True lets the loader unpickle
    # data from INDEX_DIR. Only load an index directory this notebook wrote itself.
    vectorstore = FAISS.load_local(INDEX_DIR, embedding_model, allow_dangerous_deserialization=True)
    print("ℹ️ Loaded existing FAISS index.")
else:
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local(INDEX_DIR)
    # Written after save_local so the directory is expected to exist by now.
    with open(INDEX_FINGERPRINT_PATH, "w", encoding="utf-8") as fh:
        fh.write(current_fingerprint)
    print("✅ Built & saved FAISS index.")

✅ Built & saved FAISS index.


In [None]:
SPACY_MODEL_NAME = "en_core_web_sm"
SPACY_MAX_CHARS = 2_000_000

# Shared spaCy pipeline used for triple extraction and for parsing queries.
nlp = spacy.load(SPACY_MODEL_NAME)
nlp.max_length = SPACY_MAX_CHARS  # explicit per-text length limit for the pipeline

In [None]:
def clean_entity(text: str) -> str:
    """Normalise an entity string for use as a knowledge-graph node name.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines, ...) collapses to a single space.
    """
    return " ".join(text.split())

def extract_triplets_from_sentence(sent: "spacy.tokens.span.Span") -> List[Tuple[str, str, str]]:
    """Extract (subject, verb lemma, object) triples from one dependency-parsed sentence.

    Every token tagged VERB whose dependency label is ROOT or conj is treated
    as a predicate, so coordinated verbs ("X carries A and provides B") each
    yield a triple instead of only the first verb being considered.

    Args:
        sent: A parsed spaCy sentence (any iterable of tokens exposing
            `pos_`, `dep_`, `lemma_`, `children`, `head`, `subtree` and
            `text_with_ws` works).

    Returns:
        A list of `(subject_text, verb_lemma, object_text)` tuples, in
        sentence order; empty when no verb has both a subject and an object.
    """
    object_deps = ("dobj", "pobj", "attr", "dative", "oprd")

    def _own_subject(verb):
        # Any "*subj*" child counts (nsubj, nsubjpass, ...); the last one wins.
        found = None
        for child in verb.children:
            if "subj" in child.dep_:
                found = child
        return found

    def _resolve_subject(verb, max_hops):
        # A coordinated verb has no subject child of its own: the subject hangs
        # off the verb it is conjoined with, so walk up the conj chain to find it.
        subject = _own_subject(verb)
        current, hops = verb, 0
        while subject is None and current.dep_ == "conj" and hops < max_hops:
            current = current.head
            subject = _own_subject(current)
            hops += 1  # bounded, so a malformed parse can never loop forever
        return subject

    def _phrase(token):
        # text_with_ws keeps the source spacing, so punctuation, possessives and
        # hyphens are not surrounded by artificial spaces ("earth's", not "earth 's").
        return clean_entity("".join(w.text_with_ws for w in token.subtree))

    tokens = list(sent)
    triplets = []
    for token in tokens:
        if token.pos_ != "VERB" or token.dep_ not in ("ROOT", "conj"):
            continue

        subj = _resolve_subject(token, max_hops=len(tokens))
        # First object-like child attached directly to this verb.
        obj = next((child for child in token.children if child.dep_ in object_deps), None)

        if subj is not None and obj is not None:
            triplets.append((_phrase(subj), token.lemma_, _phrase(obj)))

    return triplets

def extract_triplets_from_text(text: str) -> List[Tuple[str, str, str]]:
    """Parse `text` with the shared spaCy pipeline and collect triples from every sentence.

    Returns the per-sentence results of `extract_triplets_from_sentence`,
    concatenated in sentence order.
    """
    return [
        triple
        for sentence in nlp(text).sents
        for triple in extract_triplets_from_sentence(sentence)
    ]

# Build KG
# Module-level knowledge graph: filled by add_triplet_to_graph below and read by
# the retrieval code in later cells.
G = nx.MultiDiGraph()  # allows multiple labeled edges between same nodes

def add_triplet_to_graph(G, h: str, r: str, t: str, meta: Dict[str, Any] = None):
    """Insert one (head, relation, tail) fact into graph `G`.

    Head and tail are normalised with `clean_entity`; the relation is stripped.
    Nothing is added if any of the three ends up empty. Endpoints that are not
    yet in the graph are created with `type="entity"`, then a new edge carrying
    `label` (the relation) and `meta` (an empty dict when none is given) is added.
    """
    head, relation, tail = clean_entity(h), r.strip(), clean_entity(t)
    if not (head and relation and tail):
        return

    # Only create missing endpoints, so existing nodes are left as they are.
    for entity in (head, tail):
        if not G.has_node(entity):
            G.add_node(entity, type="entity")

    edge_meta = meta if meta else {}
    G.add_edge(head, tail, label=relation, meta=edge_meta)

# Run triple extraction over every chunk, feeding the knowledge graph as we go.
all_triplets = []
progress = tqdm(enumerate(chunks), total=len(chunks), desc="Extracting triples")
for chunk_idx, chunk in progress:
    chunk_meta = chunk.metadata.copy()
    chunk_triples = extract_triplets_from_text(chunk.page_content)
    for subj, verb, obj in chunk_triples:
        # Each edge gets its own meta dict; chunk metadata is spread last, so it
        # takes precedence if it ever contains a "chunk_id" key of its own.
        add_triplet_to_graph(G, subj, verb, obj, meta={"chunk_id": chunk_idx, **chunk_meta})
    all_triplets.extend(chunk_triples)

print(f"✅ Extracted {len(all_triplets)} triples")
print(f"✅ KG nodes: {G.number_of_nodes()}, edges: {G.number_of_edges()}")

Extracting triples: 100%|██████████| 253/253 [00:08<00:00, 28.49it/s]

✅ Extracted 268 triples
✅ KG nodes: 368, edges: 268





In [None]:
# Preview the first ten edges as "head --[relation]--> tail" lines.
sample_edges = list(G.edges(data=True))[:10]
for head, tail, attrs in sample_edges:
    relation = attrs.get('label')
    print(f"  • {head} --[{relation}]--> {tail}")

  • INSAT3DR --[monitor]--> the earth ’s surface , oceanic observations
  • It --[provide]--> Broadcast Satellite Services ( BSS )
  • It --[provide]--> Broadcast Satellite Services ( BSS )
  • It --[provide]--> Broadcast Satellite Services ( BSS )
  • It --[have]--> a Data Relay Transponder and Satellite based Search & Rescue Payload
  • It --[have]--> a Data Relay Transponder and Satellite based Search & Rescue Payload
  • It --[have]--> a Data Relay Transponder and Satellite based Search & Rescue Payload
  • It --[carry]--> twenty four transponders - twelve
  • It --[provide]--> communication , weather and search & rescue services
  • It --[carry]--> three payloads : Ocean Colour Monitor ( OCM ) Ku - band Pencil Beam scatterometer ( SCAT ) developed by ISRO Radio Occultation Sounder for Atmosphere


In [None]:
GEN_MODEL_NAME = "google/flan-t5-base"
GEN_MAX_NEW_TOKENS = 256

# Text-to-text generation pipeline used to turn a prompt into an answer.
# NOTE(review): truncation=True presumably clips over-long prompts to the
# model's input limit rather than raising — confirm against the pipeline docs.
gen = pipeline(
    task="text2text-generation",
    model=GEN_MODEL_NAME,
    device_map="auto",
    max_new_tokens=GEN_MAX_NEW_TOKENS,
    truncation=True,
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
def semantic_retrieve(query: str, k: int = 6):
    """Return the `k` entries of the vector store most similar to `query`."""
    hits = vectorstore.similarity_search(query, k=k)
    return hits

def kg_retrieve(query: str, top_k_paths: int = 6) -> List[str]:
    """Collect relational facts from the knowledge graph that relate to `query`.

    Anchor terms are the query's nouns and proper nouns longer than two
    characters; if there are none, every token longer than three characters is
    used instead. A node matches when any anchor term occurs in its name as a
    case-insensitive substring. For each matching node its outgoing edges, then
    its incoming edges, are rendered as "head — label → tail" strings.

    Args:
        query: Free-text user question.
        top_k_paths: Maximum number of distinct facts to return. Zero or a
            negative value yields an empty list.

    Returns:
        Up to `top_k_paths` unique fact strings, in graph iteration order.
    """
    if top_k_paths <= 0:
        return []

    doc = nlp(query)
    # Lower-case the anchor terms once instead of once per graph node.
    keys = {w.text.lower() for w in doc if w.pos_ in ("NOUN", "PROPN") and len(w.text) > 2}
    if not keys:
        # fallback to all tokens longer than 3 chars
        keys = {w.text.lower() for w in doc if len(w.text) > 3}
    if not keys:
        return []

    uniq: List[str] = []
    seen = set()

    def _keep(fact: str) -> bool:
        """Record `fact` if it is new; return True once the limit is reached."""
        if fact not in seen:
            seen.add(fact)
            uniq.append(fact)
        return len(uniq) >= top_k_paths

    # naive term match on node names
    for node in G.nodes():
        lower_node = node.lower()
        if not any(key in lower_node for key in keys):
            continue
        # Stop scanning as soon as enough facts were gathered.
        for _, tail, data in G.out_edges(node, data=True):
            if _keep(f"{node} — {data.get('label')} → {tail}"):
                return uniq
        for head, _, data in G.in_edges(node, data=True):
            if _keep(f"{head} — {data.get('label')} → {node}"):
                return uniq

    return uniq

In [None]:
def build_context(query: str, k_sem: int = 6, k_kg: int = 8) -> Dict[str, Any]:
    """Gather both kinds of retrieval context for `query`.

    Args:
        query: The user question.
        k_sem: Number of semantic (vector-store) hits to request.
        k_kg: Maximum number of knowledge-graph facts to request.

    Returns:
        A dict with "sem_snippets" (one formatted string per hit: a 1-based
        rank, the stripped text, then a line with source/page when present)
        and "kg_facts" (the fact strings from `kg_retrieve`).
    """
    def _format_snippet(rank: int, doc) -> str:
        meta = doc.metadata
        # Only the keys that are actually present contribute to the location line.
        location = " | ".join(
            f"{key}={meta[key]}" for key in ("source", "page") if key in meta
        )
        return f"[{rank}] {doc.page_content.strip()}\n{location}"

    retrieved = semantic_retrieve(query, k=k_sem)
    facts = kg_retrieve(query, top_k_paths=k_kg)

    snippets = [_format_snippet(rank, doc) for rank, doc in enumerate(retrieved, start=1)]
    return {"sem_snippets": snippets, "kg_facts": facts}

def make_prompt(query: str, ctx: Dict[str, Any]) -> str:
    """Render the grounded-answer prompt for `query` from retrieved context.

    `ctx` must provide "kg_facts" and "sem_snippets" (lists of strings). Each
    list is written one item per line under its own heading; an empty list is
    replaced by a "(no ... found)" placeholder. The prompt ends with a newline.
    """
    kg_facts = ctx['kg_facts']
    sem_snippets = ctx['sem_snippets']

    kg_section = "\n".join(kg_facts) if kg_facts else '(no KG facts found)'
    sem_section = "\n".join(sem_snippets) if sem_snippets else '(no semantic snippets found)'

    lines = [
        "You are a helpful assistant answering ONLY using the provided MOSDAC context.",
        "If the answer is not clearly in context, say what is known and state any gaps.",
        "",
        f"Question: {query}",
        "",
        "=== KNOWLEDGE GRAPH FACTS ===",
        kg_section,
        "",
        "=== SEMANTIC CONTEXT (Top snippets) ===",
        sem_section,
        "",
        "Answer clearly, with specifics (names, numbers, bands, payloads, steps) when present.",
        "",  # trailing empty item gives the final newline
    ]
    return "\n".join(lines)

def answer_query(query: str) -> str:
    """Answer `query` with the generator, grounded in retrieved context.

    The prompt is built from `build_context` / `make_prompt`. If the
    generator exposes a tokenizer with a `model_max_length`, the lowest-ranked
    semantic snippets are dropped one at a time until the prompt fits. Without
    this, an over-long prompt is cut at the tail by the pipeline's own
    truncation, which removes the closing instruction and leaves a partial
    snippet. When no tokenizer limit is available the prompt is sent as-is.

    Returns:
        The generated answer text.
    """
    ctx = build_context(query)
    prompt = make_prompt(query, ctx)

    tokenizer = getattr(gen, "tokenizer", None)
    max_tokens = getattr(tokenizer, "model_max_length", None)
    if tokenizer is not None and max_tokens:
        # Work on a copy: snippets are ordered best-first, so trim from the end.
        snippets = list(ctx["sem_snippets"])
        while snippets and len(tokenizer(prompt)["input_ids"]) > max_tokens:
            snippets.pop()
            prompt = make_prompt(query, {**ctx, "sem_snippets": snippets})

    return gen(prompt)[0]["generated_text"]

In [None]:
# Smoke-test questions run through the full retrieve-and-generate path.
sample_questions = [
    "What are the payloads on INSAT-3DR and their purposes?",
    "How is Sea Surface Temperature estimated from INSAT-3D/3DR imagery?",
    "Where is INSAT-3DR positioned and what are its objectives?",
    "What is MOSDAC used for?",
]

for question in sample_questions:
    print("\nQ:", question)
    answer = answer_query(question)
    print("A:", answer)


Q: What are the payloads on INSAT-3DR and their purposes?
A: I -2K bus with Sounder, Imager and Data Relay Transponder (DRT) and Satellite Aided Search and Rescue (SAS&R) payloads

Q: How is Sea Surface Temperature estimated from INSAT-3D/3DR imagery?
A: It — monitor  the earth ’s surface , oceanic observations INSAT3D — monitor  the earth ’s surface , oceanic observations INSAT3S — monitor  the earth ’s surface , oceanic observations It — have  a Data Relay Transponder and Satellite based Search & Rescue Payload The passive cooler — maintain  the sounder filter wheel temperature The satellite — have  3 payloads :  Meteorological ( MET ) - IMAGER and SOUNDER  Data Relay Transponder ( DRT )  Satellite Aided Search and Rescue Payloads The satellite — have  3 payloads :  Meteorological ( MET ) - IMAGER and SOUNDER  Data Relay Transponder ( DRT )  Satellite Aided Search and Rescue Payloads The satellite — have  3 payloads :  Meteorological ( MET ) - IMAGER and SOUNDER  Data Relay Transpon

In [None]:
def chat():
    """Run a simple read-answer-print loop on stdin/stdout.

    Each non-empty line is passed to `answer_query` and the reply is printed.
    The loop ends on an empty line, and also when input is no longer available
    (EOF) or the user interrupts the prompt, instead of raising out of the cell.
    Errors raised while answering are reported as the bot's reply so that one
    bad question does not end the session.
    """
    print("\n=== MOSDAC KG Chatbot ===")
    print("Ask a question about the PDF. Press Enter on empty line to exit.")
    while True:
        try:
            q = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the prompt was interrupted: leave quietly
            print()
            break
        if not q:
            break
        try:
            a = answer_query(q)
        except Exception as e:
            a = f"Error while answering: {e}"
        print("\nBot:", a)

# Start the interactive session; this cell blocks on input() until an empty line is entered.
chat()


=== MOSDAC KG Chatbot ===
Ask a question about the PDF. Press Enter on empty line to exit.

You: What is INSAT-3DR?

Bot: The INSAT-3DR spacecraft — incorporate  advanced Imager and Sounder instruments The INSAT-3DR imager — provide  imaging capability of the earth disc from geostationary altitude 10 bit / sample Downlink data rate 4.0 Mbit / s SOUNDER The INSAT-3DR sounder — have  18 infrared channels distributed over longwave and shortwave bands alongwith one visible band

You: How to access MOSDAC data?

Bot: Satellite data product/in-situ.

You: What are the various satellite products?

Bot: [2] Relay Transponder, Satellite Aided Search and Rescue (SAS&R) Transponder & S - band Broadcast

You: How is Sea Surface Temperature estimated from INSAT imagery?

Bot: The processing of INSAT-3DR data — take  place It — have  a Data Relay Transponder and Satellite based Search & Rescue Payload The passive cooler — maintain  the sounder filter wheel temperature The satellite — have  3 payloa

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [None]:
# Evaluation set: each item pairs a question ("q") with the reference answer
# ("ans") that the generated answer is compared against by the scoring loop.
# NOTE(review): confirm the reference answers against the source PDF, not
# against earlier model output, before trusting the scores.
EVAL_SET = [
    {
        "q": "What is INSAT-3DR?",
        "ans": "INSAT-3DR is an advanced meteorological satellite carrying an Imager, Sounder, Data Relay Transponder, and Satellite Aided Search and Rescue payloads."
    },
    {
        "q": "What are the various satellite products?",
        "ans": "Satellite products include data relay transponder, satellite aided search and rescue transponder, and S-band broadcast services."
    },
    {
        "q": "What are the payloads on INSAT-3DR?",
        "ans": "INSAT-3DR carries Imager, Sounder, Data Relay Transponder (DRT), and Satellite Aided Search and Rescue (SAS&R)."
    },
    {
        "q": "What is MOSDAC used for?",
        "ans": "MOSDAC is a data repository for meteorological, oceanographic, and land satellite data, supporting research and applications."
    },
]

In [None]:
# Sentence-embedding model used only for scoring predictions against reference answers.
SIM_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
sim_model = SentenceTransformer(SIM_MODEL_NAME)

In [None]:
def semantic_score(gold, pred):
    """Return the cosine similarity between the embeddings of `gold` and `pred`.

    Both texts are encoded separately with `sim_model` (as tensors) and the
    single similarity value is unwrapped to a plain scalar.
    """
    gold_vec, pred_vec = (
        sim_model.encode(text, convert_to_tensor=True) for text in (gold, pred)
    )
    return util.cos_sim(gold_vec, pred_vec).item()

# Score every evaluation question: generate an answer, compare it with the reference.
scores = []
for example in EVAL_SET:
    question, reference = example["q"], example["ans"]
    prediction = answer_query(question)
    similarity = semantic_score(reference, prediction)
    scores.append(similarity)

    print(f"\nQ: {question}")
    print(f"Actual: {reference}")
    print(f"Pred: {prediction}")
    print(f"➡️ Semantic Similarity: {similarity:.2f}")

# Mean similarity over the whole evaluation set.
avg_score = np.mean(scores)
print("\n📊 Final Semantic Evaluation:")
print(f"Average Semantic Similarity: {avg_score:.2f}")


Q: What is INSAT-3DR?
Actual: INSAT-3DR is an advanced meteorological satellite carrying an Imager, Sounder, Data Relay Transponder, and Satellite Aided Search and Rescue payloads.
Pred: The INSAT-3DR spacecraft — incorporate  advanced Imager and Sounder instruments The INSAT-3DR imager — provide  imaging capability of the earth disc from geostationary altitude 10 bit / sample Downlink data rate 4.0 Mbit / s SOUNDER The INSAT-3DR sounder — have  18 infrared channels distributed over longwave and shortwave bands alongwith one visible band
➡️ Semantic Similarity: 0.76

Q: What are the various satellite products?
Actual: Satellite products include data relay transponder, satellite aided search and rescue transponder, and S-band broadcast services.
Pred: [2] Relay Transponder, Satellite Aided Search and Rescue (SAS&R) Transponder & S - band Broadcast
➡️ Semantic Similarity: 0.79

Q: What are the payloads on INSAT-3DR?
Actual: INSAT-3DR carries Imager, Sounder, Data Relay Transponder (DRT)