In [1]:
pip install requests beautifulsoup4 selenium lxml pandas

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m21.7 MB/s[0m eta [3

In [2]:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re

# --------------------------------------
# Clean and extract meaningful text
# --------------------------------------
def clean_text(html_content):
    soup = BeautifulSoup(html_content, 'lxml')
    for script in soup(["script", "style", "noscript"]):
        script.extract()
    text = soup.get_text(separator=' ', strip=True)
    text = re.sub(r'\s+', ' ', text)
    return text

# --------------------------------------
# Save content to .txt file
# --------------------------------------
def save_text(url, text, folder="extracted_texts"):
    os.makedirs(folder, exist_ok=True)
    filename = urlparse(url).path.replace("/", "_")
    if not filename or filename == "_":
        filename = "home"
    with open(os.path.join(folder, f"{filename}.txt"), "w", encoding="utf-8") as f:
        f.write(f"URL: {url}\n\n{text}")

# --------------------------------------
# Crawl function with Requests
# --------------------------------------
def crawl_static(url, visited, depth=0, max_depth=2):
    if url in visited or depth > max_depth:
        return
    visited.add(url)

    try:
        print(f"[Requests] Fetching: {url}")
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: Status code {response.status_code}")
            return
        text = clean_text(response.text)
        save_text(url, text)
        soup = BeautifulSoup(response.text, 'lxml')

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url} with error: {e}")
        return

    # Recursively crawl other internal links
    base = "{0.scheme}://{0.netloc}".format(urlparse(url))
    for link in soup.find_all('a', href=True):
        href = link['href']
        abs_url = urljoin(base, href)
        if urlparse(abs_url).netloc == urlparse(url).netloc and abs_url.startswith("http"):
            crawl_static(abs_url, visited, depth + 1, max_depth)

# --------------------------------------
# MAIN
# --------------------------------------
if __name__ == "__main__":
    start_url = "https://www.mosdac.gov.in"
    visited = set()
    crawl_static(start_url, visited, max_depth=2)  # You can increase depth for more pages
    print("\n✅ Extraction completed. All texts saved in 'extracted_texts/' folder.")

[Requests] Fetching: https://www.mosdac.gov.in
[Requests] Fetching: https://www.mosdac.gov.in#main-content
[Requests] Fetching: https://www.mosdac.gov.in/internal/registration
[Requests] Fetching: https://www.mosdac.gov.in/internal/uops
[Requests] Fetching: https://www.mosdac.gov.in/internal/logout
[Requests] Fetching: https://www.mosdac.gov.in/
[Requests] Fetching: https://www.mosdac.gov.in/insat-3dr
[Requests] Fetching: https://www.mosdac.gov.in/insat-3d
[Requests] Fetching: https://www.mosdac.gov.in/kalpana-1
[Requests] Fetching: https://www.mosdac.gov.in/insat-3a
[Requests] Fetching: https://www.mosdac.gov.in/megha-tropiques
[Requests] Fetching: https://www.mosdac.gov.in/saral-altika
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-2
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-3
[Requests] Fetching: https://www.mosdac.gov.in/insat-3ds
[Requests] Fetching: https://www.mosdac.gov.in/scatsat-1
[Requests] Fetching: https://www.mosdac.gov.in/internal/catalog-satellite


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, 'lxml')

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.text, 'lxml')


[Requests] Fetching: https://www.mosdac.gov.in/docs/STQC.pdf
[Requests] Fetching: https://www.mosdac.gov.in/mosdac-feedback
[Requests] Fetching: https://www.mosdac.gov.in/about-us
[Requests] Fetching: https://www.mosdac.gov.in/contact-us
[Requests] Fetching: https://www.mosdac.gov.in/copyright-policy
[Requests] Fetching: https://www.mosdac.gov.in/data-access-policy
[Requests] Fetching: https://www.mosdac.gov.in/hyperlink-policy
[Requests] Fetching: https://www.mosdac.gov.in/privacy-policy
[Requests] Fetching: https://www.mosdac.gov.in/website-policies
[Requests] Fetching: https://www.mosdac.gov.in/terms-conditions
[Requests] Fetching: https://www.mosdac.gov.in/faq-page

✅ Extraction completed. All texts saved in 'extracted_texts/' folder.


In [18]:
pip install langchain faiss-cpu datasets ragas transformers accelerate sentence-transformers gradio

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting ragas
  Downloading ragas-0.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ragas-0.3.0-py3-none-any.whl (190 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?2

In [19]:
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_chunk(folder="extracted_texts", chunk_size=500, overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    all_chunks = []

    for file_path in Path(folder).glob("*.txt"):
        loader = TextLoader(str(file_path), encoding='utf-8')
        docs = loader.load()
        chunks = splitter.split_documents(docs)
        all_chunks.extend(chunks)

    return all_chunks

docs = load_and_chunk()
print(f"✅ Loaded and chunked {len(docs)} text chunks from {len(list(Path('extracted_texts').glob('*.txt')))} files.")

✅ Loaded and chunked 7160 text chunks from 94 files.


In [20]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_id = "google/flan-t5-base"  # light model, works on CPU
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
llm = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


In [22]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

In [39]:
query = "What is MOSDAC about?"
response = qa_chain(query)

print("Answer:", response['result'])
for doc in response['source_documents']:
    print("\n Source:", doc.metadata['source'])
    print(doc.page_content[:300])

Answer: Meteorological and Oceanographic Satellite Data Archival Centre (MOSDAC) is a Data Centre of Space Applications Centre (SAC) and has facility for satellite data reception, processing, analysis and dissemination. MOSDAC is operationally supplying earth observation data from Indian meteorology and oceanography satellites, to cater to national and international research requirements. Search Search Follow Us Website owned and maintained by MOSDAC, Space Applications Centre (SAC) and has facility for satellite data reception, processing, analysis and dissemination. MOSDAC is operationally supplying earth observation data from Indian meteorology and oceanography satellites, to cater to national and international research requirements. Contact Us: MOSDAC Operator oprmosdac[at]mosdac[dot]gov[dot]in; +91-79-26916207 MOSDAC Space URL: https://www.mosdac.gov.in/mosdac-feedback URL: https://www.mosdac.gov.in/faq-page

 Source: extracted_texts/_about-us.txt
/ mitigation. Meteorological and 

In [51]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

# Load model for semantic comparison
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define evaluation dataset (expected answers)
evaluation_data = pd.DataFrame({
    "question": [
        "What is MOSDAC?",
        "How to access satellite data?",
        "What services are provided by MOSDAC?"
    ],
    "expected_answer": [
        "MOSDAC is a data center under ISRO that provides satellite data for meteorology and oceanography.",
        "You can access satellite data by visiting MOSDAC's portal and logging in with your credentials.",
        "MOSDAC provides data access, visualization, downloads, and weather alerts for satellite data."
    ]
})

# Run model to get answers
predicted_answers = []
for q in evaluation_data['question']:
    result = qa_chain(q)
    predicted_answers.append(result['result'])

evaluation_data['predicted_answer'] = predicted_answers

In [52]:
# Compute cosine similarity between expected and predicted answers
similarities = []
for i, row in evaluation_data.iterrows():
    emb_expected = model.encode(row['expected_answer'], convert_to_tensor=True)
    emb_predicted = model.encode(row['predicted_answer'], convert_to_tensor=True)
    sim = util.cos_sim(emb_expected, emb_predicted).item()
    similarities.append(sim)

evaluation_data['semantic_similarity'] = similarities

# Consider similarity > 0.7 as "correct"
threshold = 0.7
evaluation_data['is_correct'] = evaluation_data['semantic_similarity'] > threshold

# Final Accuracy
accuracy = evaluation_data['is_correct'].mean()
print(f"\n✅ Semantic Accuracy (threshold = {threshold}): {accuracy:.2%}")


✅ Semantic Accuracy (threshold = 0.7): 66.67%


In [54]:
for i, row in evaluation_data.iterrows():
    print(f"\n Q{i+1}: {row['question']}")
    print(f"Expected: {row['expected_answer']}")
    print(f"Predicted: {row['predicted_answer']}")
    print(f"Semantic Similarity: {row['semantic_similarity']:.2f}")
    print("✔️ Correct" if row['is_correct'] else "❌ Incorrect")


 Q1: What is MOSDAC?
Expected: MOSDAC is a data center under ISRO that provides satellite data for meteorology and oceanography.
Predicted: Meteorological and Oceanographic Satellite Data Archival Centre (MOSDAC) is a Data Centre of Space Applications Centre (SAC) and has facility for satellite data reception, processing, analysis and dissemination. MOSDAC is operationally supplying earth observation data from Indian meteorology and oceanography satellites, to cater to national and international research requirements. Contact Us: MOSDAC Operator oprmosdac[at]mosdac[dot]gov[dot]in; +91-79-26916207 MOSDAC Space URL: https://www.mosdac.gov.in/ URL: https://www.mosdac.gov.in/
Semantic Similarity: 0.86
✔️ Correct

 Q2: How to access satellite data?
Expected: You can access satellite data by visiting MOSDAC's portal and logging in with your credentials.
Predicted: Automatic download, analysis and visualization of data Currently SCATSAT-1 High Resolution Products are available, soon more pro