In [None]:
pip install requests beautifulsoup4 selenium lxml pandas

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m31.3 MB/s[0m eta [3

In [None]:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re

# --------------------------------------
# Clean and extract meaningful text
# --------------------------------------
def clean_text(html_content):
    """Return the visible text of an HTML document as one whitespace-normalised string.

    Args:
        html_content: Raw HTML markup to parse.

    Returns:
        The document text with <script>, <style> and <noscript> elements
        removed and every run of whitespace collapsed to a single space.
    """
    parsed = BeautifulSoup(html_content, 'lxml')
    # Drop elements whose contents are not human-readable page text.
    for hidden_tag in parsed.find_all(["script", "style", "noscript"]):
        hidden_tag.extract()
    raw_text = parsed.get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', raw_text)

# --------------------------------------
# Save content to .txt file
# --------------------------------------
def save_text(url, text, folder="extracted_texts"):
    """Write ``text`` to a UTF-8 .txt file named after ``url`` inside ``folder``.

    The file name is built from the URL path (slashes become underscores;
    an empty or root path becomes "home"). The query string, when present,
    is appended so that URLs differing only by query do not overwrite each
    other. Characters that are unsafe in file names are replaced by "_" and
    the name is capped at 200 characters to stay under common file-system
    limits.

    Args:
        url: Source URL of the page; also written as the file's first line.
        text: Extracted page text to store.
        folder: Output directory, created if it does not exist.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(folder, exist_ok=True)
    parsed = urlparse(url)
    filename = parsed.path.replace("/", "_")
    if not filename or filename == "_":
        filename = "home"
    if parsed.query:
        # Keep "?page=1" and "?page=2" in separate files.
        filename = f"{filename}_{parsed.query}"
    # Anything outside word characters, "." and "-" is replaced so the name
    # is valid on every common file system.
    filename = re.sub(r"[^\w.\-]", "_", filename)[:200]
    out_path = os.path.join(folder, f"{filename}.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"URL: {url}\n\n{text}")
    return out_path

# --------------------------------------
# Crawl function with Requests
# --------------------------------------
def crawl_static(url, visited, depth=0, max_depth=2):
    """Recursively fetch ``url`` and the same-host pages it links to, saving each page's text.

    Args:
        url: Absolute URL of the page to fetch.
        visited: Set of URLs already crawled. It is mutated in place and
            holds normalised URLs (no fragment, "/" for an empty path).
        depth: Recursion depth of ``url`` (0 for the start page).
        max_depth: Pages deeper than this are not fetched.

    Returns:
        None. Progress is printed and page text is written via ``save_text``.
    """
    # "page#section" is the same document as "page", and a bare host is the
    # same page as host + "/". Normalise both so each page is fetched once.
    parts = urlparse(url)._replace(fragment="")
    if not parts.path:
        parts = parts._replace(path="/")
    url = parts.geturl()

    if url in visited or depth > max_depth:
        return
    visited.add(url)

    try:
        print(f"[Requests] Fetching: {url}")
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: Status code {response.status_code}")
            return

        # Binary payloads (PDFs, images, archives) decode to garbage text and
        # contain no links, so skip them. A missing header is treated as HTML.
        content_type = response.headers.get("Content-Type", "").lower()
        if content_type and not (
            content_type.startswith("text/")
            or "html" in content_type
            or "xml" in content_type
        ):
            print(f"Skipping non-text content at {url}: {content_type}")
            return

        text = clean_text(response.text)
        save_text(url, text)
        soup = BeautifulSoup(response.text, 'lxml')

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url} with error: {e}")
        return

    # Recursively crawl other internal links
    host = parts.netloc
    for link in soup.find_all('a', href=True):
        # Resolve against the current page, not the site root, so relative
        # links such as "page2" on "/docs/page1" map to "/docs/page2".
        abs_url = urljoin(url, link['href'])
        target = urlparse(abs_url)
        if target.scheme in ("http", "https") and target.netloc == host:
            crawl_static(abs_url, visited, depth + 1, max_depth)

# --------------------------------------
# MAIN
# --------------------------------------
# Entry point: crawl the site starting at start_url, then report completion.
if __name__ == "__main__":
    # Seed page for the crawl.
    start_url = "https://www.mosdac.gov.in"
    # Shared across the whole recursive crawl; crawl_static adds to it.
    visited = set()
    crawl_static(start_url, visited, max_depth=2)  # Raise max_depth to follow links further from the start page
    print("\n✅ Extraction completed. All texts saved in 'extracted_texts/' folder.")

[Requests] Fetching: https://www.mosdac.gov.in
[Requests] Fetching: https://www.mosdac.gov.in#main-content
[Requests] Fetching: https://www.mosdac.gov.in/internal/registration
[Requests] Fetching: https://www.mosdac.gov.in/internal/uops
[Requests] Fetching: https://www.mosdac.gov.in/internal/logout
[Requests] Fetching: https://www.mosdac.gov.in/
[Requests] Fetching: https://www.mosdac.gov.in/insat-3dr
[Requests] Fetching: https://www.mosdac.gov.in/insat-3d
[Requests] Fetching: https://www.mosdac.gov.in/kalpana-1
[Requests] Fetching: https://www.mosdac.gov.in/insat-3a
[Requests] Fetching: https://www.mosdac.gov.in/megha-tropiques
[Requests] Fetching: https://www.mosdac.gov.in/saral-altika
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-2
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-3
[Requests] Fetching: https://www.mosdac.gov.in/insat-3ds
[Requests] Fetching: https://www.mosdac.gov.in/scatsat-1
[Requests] Fetching: https://www.mosdac.gov.in/internal/catalog-satellite


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, 'lxml')

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.text, 'lxml')


[Requests] Fetching: https://www.mosdac.gov.in/docs/STQC.pdf
[Requests] Fetching: https://www.mosdac.gov.in/mosdac-feedback
[Requests] Fetching: https://www.mosdac.gov.in/about-us
[Requests] Fetching: https://www.mosdac.gov.in/contact-us
[Requests] Fetching: https://www.mosdac.gov.in/copyright-policy
[Requests] Fetching: https://www.mosdac.gov.in/data-access-policy
[Requests] Fetching: https://www.mosdac.gov.in/hyperlink-policy
[Requests] Fetching: https://www.mosdac.gov.in/privacy-policy
[Requests] Fetching: https://www.mosdac.gov.in/website-policies
[Requests] Fetching: https://www.mosdac.gov.in/terms-conditions
[Requests] Fetching: https://www.mosdac.gov.in/faq-page

✅ Extraction completed. All texts saved in 'extracted_texts/' folder.


In [None]:
pip install sentence-transformers langchain faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [None]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

In [None]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS # Updated import
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# STEP 1: Load all .txt files
def load_text_from_folder(folder_path):
    """Load every .txt file in ``folder_path`` as a LangChain ``Document``.

    Files are read in sorted name order so the result is reproducible
    (``os.listdir`` order is arbitrary). When a file begins with a
    "URL: <address>" header followed by a blank line, the address is moved
    into ``metadata["url"]`` and only the body is kept as ``page_content``.
    Otherwise the text splitter, which breaks on blank lines, produces
    chunks that contain nothing but the header line.

    Args:
        folder_path: Directory containing the .txt files.

    Returns:
        A list of Documents, one per file, with ``metadata["source"]`` set
        to the file name and ``metadata["url"]`` set when a header was found.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    url_prefix = "URL: "
    docs = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
            content = f.read()

        metadata = {"source": filename}
        header, separator, body = content.partition("\n\n")
        if separator and header.startswith(url_prefix):
            metadata["url"] = header[len(url_prefix):].strip()
            content = body
        docs.append(Document(page_content=content, metadata=metadata))
    return docs

# Read back the crawler's output: "extracted_texts" is the default folder
# that save_text writes to. Each .txt file becomes one Document.
folder_path = "extracted_texts"
documents = load_text_from_folder(folder_path)
print(f"✅ Loaded {len(documents)} text documents.")

✅ Loaded 94 text documents.


In [None]:
# Split each document into overlapping chunks with LangChain's recursive splitter.
# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) — confirm before tuning.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

# One input Document may yield many chunk Documents.
chunks = text_splitter.split_documents(documents)
print(f"✅ Chunked into {len(chunks)} smaller pieces.")

✅ Chunked into 7160 smaller pieces.


In [None]:
# Load the sentence-transformers embedding model by its Hugging Face model name.
# NOTE(review): the model files appear to be downloaded on first use (see the
# progress bars in the recorded output) — confirm network access is available.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build a FAISS vector store from the chunks using the embedding model.
# This runs over every chunk, so expect it to be the slow step of the notebook.
vectorstore = FAISS.from_documents(chunks, embedding_model)

# Persist the index to the local "mosdac_faiss_index" path so later sessions
# can load it instead of rebuilding.
vectorstore.save_local("mosdac_faiss_index")
print("✅ FAISS index saved as 'mosdac_faiss_index'")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ FAISS index saved as 'mosdac_faiss_index'


In [None]:
# Load the index saved under "mosdac_faiss_index", using the same embedding model.
# SECURITY: allow_dangerous_deserialization=True opts in to loading serialized
# (presumably pickled) data, which can execute arbitrary code. Only use it on
# index files created by this notebook, never on files from an untrusted source.
vectorstore = FAISS.load_local("mosdac_faiss_index", embedding_model, allow_dangerous_deserialization=True)

# Example search: fetch the k=3 chunks most similar to the query.
query = "How can I register in MOSDAC?"
results = vectorstore.similarity_search(query, k=3)

# Print each hit with a 1-based rank followed by the chunk text.
print("🔍 Retrieved Contexts:")
for i, doc in enumerate(results):
    print(f"{i+1}:")
    print(doc.page_content)

🔍 Retrieved Contexts:
1:
URL: https://www.mosdac.gov.in/internal/registration
2:
Satellite Data Archival Center. It is a ISRO data portal which provides data through its web based service https://mosdac.gov.in How to be a registered user of MOSDAC? There is "SignUp" form available on MOSDAC portal. Pl fill up the form and submit. You will be intimated through e-mail about the approval. I have registered on MOSDAC . i have received an email for email verification. But when I click the link, I get error message. Please copy the Email verification Hyperlink sent on your
3:
URL: https://www.mosdac.gov.in/faq-page
