In [2]:
pip install requests beautifulsoup4 selenium lxml pandas

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m45.9 MB/s[0m eta [3

In [3]:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re

# --------------------------------------
# Clean and extract meaningful text
# --------------------------------------
def clean_text(html_content):
    soup = BeautifulSoup(html_content, 'lxml')
    for script in soup(["script", "style", "noscript"]):
        script.extract()
    text = soup.get_text(separator=' ', strip=True)
    text = re.sub(r'\s+', ' ', text)
    return text

# --------------------------------------
# Save content to .txt file
# --------------------------------------
def save_text(url, text, folder="extracted_texts"):
    os.makedirs(folder, exist_ok=True)
    filename = urlparse(url).path.replace("/", "_")
    if not filename or filename == "_":
        filename = "home"
    with open(os.path.join(folder, f"{filename}.txt"), "w", encoding="utf-8") as f:
        f.write(f"URL: {url}\n\n{text}")

# --------------------------------------
# Crawl function with Requests
# --------------------------------------
def crawl_static(url, visited, depth=0, max_depth=2):
    if url in visited or depth > max_depth:
        return
    visited.add(url)

    try:
        print(f"[Requests] Fetching: {url}")
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: Status code {response.status_code}")
            return
        text = clean_text(response.text)
        save_text(url, text)
        soup = BeautifulSoup(response.text, 'lxml')

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url} with error: {e}")
        return

    # Recursively crawl other internal links
    base = "{0.scheme}://{0.netloc}".format(urlparse(url))
    for link in soup.find_all('a', href=True):
        href = link['href']
        abs_url = urljoin(base, href)
        if urlparse(abs_url).netloc == urlparse(url).netloc and abs_url.startswith("http"):
            crawl_static(abs_url, visited, depth + 1, max_depth)

# --------------------------------------
# MAIN
# --------------------------------------
if __name__ == "__main__":
    start_url = "https://www.mosdac.gov.in"
    visited = set()
    crawl_static(start_url, visited, max_depth=2)  # You can increase depth for more pages
    print("\n✅ Extraction completed. All texts saved in 'extracted_texts/' folder.")

[Requests] Fetching: https://www.mosdac.gov.in
[Requests] Fetching: https://www.mosdac.gov.in#main-content
[Requests] Fetching: https://www.mosdac.gov.in/internal/registration
[Requests] Fetching: https://www.mosdac.gov.in/internal/uops
[Requests] Fetching: https://www.mosdac.gov.in/internal/logout
[Requests] Fetching: https://www.mosdac.gov.in/
[Requests] Fetching: https://www.mosdac.gov.in/insat-3dr
[Requests] Fetching: https://www.mosdac.gov.in/insat-3d
[Requests] Fetching: https://www.mosdac.gov.in/kalpana-1
[Requests] Fetching: https://www.mosdac.gov.in/insat-3a
[Requests] Fetching: https://www.mosdac.gov.in/megha-tropiques
[Requests] Fetching: https://www.mosdac.gov.in/saral-altika
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-2
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-3
[Requests] Fetching: https://www.mosdac.gov.in/insat-3ds
[Requests] Fetching: https://www.mosdac.gov.in/scatsat-1
[Requests] Fetching: https://www.mosdac.gov.in/internal/catalog-satellite


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, 'lxml')

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.text, 'lxml')


[Requests] Fetching: https://www.mosdac.gov.in/docs/STQC.pdf
[Requests] Fetching: https://www.mosdac.gov.in/mosdac-feedback
[Requests] Fetching: https://www.mosdac.gov.in/about-us
[Requests] Fetching: https://www.mosdac.gov.in/contact-us
[Requests] Fetching: https://www.mosdac.gov.in/copyright-policy
[Requests] Fetching: https://www.mosdac.gov.in/data-access-policy
[Requests] Fetching: https://www.mosdac.gov.in/hyperlink-policy
[Requests] Fetching: https://www.mosdac.gov.in/privacy-policy
[Requests] Fetching: https://www.mosdac.gov.in/website-policies
[Requests] Fetching: https://www.mosdac.gov.in/terms-conditions
[Requests] Fetching: https://www.mosdac.gov.in/faq-page

✅ Extraction completed. All texts saved in 'extracted_texts/' folder.


In [6]:
import os
import spacy
import pandas as pd
from bs4 import BeautifulSoup

# Load spaCy NLP model
nlp = spacy.load("en_core_web_sm")

# Input folders
text_folder = "extracted_texts"

# Store all extracted triples
triples = []

# --------------------- Extract Clean Text from HTML --------------------- #
def extract_text_from_html(filepath):
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "lxml")
            for tag in soup(["script", "style", "noscript"]):
                tag.extract()
            return soup.get_text(separator=' ', strip=True)
    except Exception as e:
        print(f"❌ Error reading {filepath}: {e}")
        return ""

# --------------------- Extract Triples Using spaCy --------------------- #
def extract_triples_from_text(text):
    doc = nlp(text)
    for sent in doc.sents:
        subject = predicate = obj = ""
        for token in sent:
            if "subj" in token.dep_:
                subject = token.text
            elif token.dep_ == "ROOT":
                predicate = token.lemma_
            elif "obj" in token.dep_:
                obj = token.text
        if subject and predicate and obj:
            yield (subject, predicate, obj)

# --------------------- Process All Text Files --------------------- #
def process_text_files(folder_path):
    print(f"🔍 Processing text files in {folder_path}")
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            try:
                with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
                    text = f.read()
                    for triple in extract_triples_from_text(text[:3000]):  # Limit per file
                        yield triple
            except Exception as e:
                print(f"⚠️ Skipped {filename}: {e}")

# --------------------- Process All HTML Files --------------------- #
def process_html_files(folder_path):
    print(f"🔍 Processing HTML files in {folder_path}")
    for filename in os.listdir(folder_path):
        if filename.endswith(".html"):
            filepath = os.path.join(folder_path, filename)
            text = extract_text_from_html(filepath)
            for triple in extract_triples_from_text(text[:3000]):  # Limit per file
                yield triple

# --------------------- Main Execution --------------------- #
if __name__ == "__main__":
    seen_triples = set()

    for triple in process_text_files(text_folder):
        if triple not in seen_triples:
            triples.append(triple)
            seen_triples.add(triple)

    # Save to CSV
    df = pd.DataFrame(triples, columns=["Subject", "Predicate", "Object"])
    df.to_csv("knowledge_graph_triples.csv", index=False)
    print(f"\n✅ Saved {len(triples)} unique triples to 'knowledge_graph_triples.csv'")

🔍 Processing text files in extracted_texts

✅ Saved 352 unique triples to 'knowledge_graph_triples.csv'


In [7]:
import pandas as pd

# Load your triples
df = pd.read_csv("knowledge_graph_triples.csv")

# Create searchable sentences from triples
df["sentence"] = df["Subject"] + " " + df["Predicate"] + " " + df["Object"]
df.to_csv("kg_sentences.csv", index=False)
print("✅ Created searchable sentence dataset: 'kg_sentences.csv'")

✅ Created searchable sentence dataset: 'kg_sentences.csv'


In [8]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load pre-trained semantic model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load sentences
df = pd.read_csv("kg_sentences.csv")
sentences = df["sentence"].tolist()

# Encode all sentences
embeddings = model.encode(sentences, convert_to_numpy=True)

# Create FAISS index for fast similarity search
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("✅ Semantic index created with FAISS.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Semantic index created with FAISS.


In [10]:
def get_top_k_matches(query, k=3):
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, k)
    results = []
    for i in indices[0]:
        results.append(df.iloc[i]["sentence"])
    return results

# Example Query
query = "Which satellite gives rainfall data?"
results = get_top_k_matches(query)

print("🔍 Top Matches:")
for r in results:
    print("-", r)

🔍 Top Matches:
- satellite commission data
- information be weather
- Bayesian base rainfall


In [11]:
import pandas as pd

# Load the CSV containing Subject, Predicate, Object, and Sentence
df = pd.read_csv("kg_sentences.csv")

# If 'sentence' column doesn't exist, create it from Subject–Predicate–Object
if "sentence" not in df.columns:
    df["sentence"] = df["Subject"] + " " + df["Predicate"] + " " + df["Object"]

In [12]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load a compact sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all sentences into vectors
sentences = df["sentence"].tolist()
embeddings = model.encode(sentences, convert_to_numpy=True)

# Create a FAISS index (flat L2 distance)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

print(f"✅ Knowledge base built with {len(sentences)} entries.")

✅ Knowledge base built with 352 entries.


In [13]:
def search_knowledge_graph(query, k=3):
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, k)
    results = []
    for i in indices[0]:
        results.append(df.iloc[i]["sentence"])
    return results

In [14]:
query = "How can I access satellite rainfall data?"
answers = search_knowledge_graph(query, k=3)

print(f"🔍 Query: {query}\n💡 Top Matches:")
for ans in answers:
    print("→", ans)

🔍 Query: How can I access satellite rainfall data?
💡 Top Matches:
→ satellite commission data
→ information be weather
→ Bayesian base rainfall
