In [None]:
# Crawler: requests, beautifulsoup4, lxml. NLP / knowledge graph: spacy, networkx, tqdm.
# The spaCy model loaded by a later cell must also be installed once, e.g.:
#   python -m spacy download en_core_web_sm
%pip install requests beautifulsoup4 selenium lxml pandas spacy networkx tqdm



In [None]:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re

# --------------------------------------
# Clean and extract meaningful text
# --------------------------------------
def clean_text(html_content):
    """Return the text of an HTML document as one whitespace-normalised string.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before
    the text is extracted, and every run of whitespace is collapsed to a
    single space.
    """
    parsed = BeautifulSoup(html_content, 'lxml')
    # Drop the elements whose contents should not end up in the extracted text.
    for hidden in parsed.find_all(["script", "style", "noscript"]):
        hidden.extract()
    raw_text = parsed.get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', raw_text)

# --------------------------------------
# Save content to .txt file
# --------------------------------------
def save_text(url, text, folder="extracted_texts"):
    """Write the extracted text of one page to ``<folder>/<name>.txt``.

    The file name is the URL path with ``/`` replaced by ``_``; an empty or
    root path becomes ``home``. When the URL carries a query string, a
    sanitised copy of it is appended so that pages which differ only in
    their query (``/page?id=1`` vs ``/page?id=2``) no longer overwrite each
    other's file. URLs without a query keep exactly the same file name as
    before.

    Args:
        url: Address the text was fetched from; written as the first line.
        text: Cleaned page text to store.
        folder: Output directory, created if it does not exist.
    """
    os.makedirs(folder, exist_ok=True)
    parts = urlparse(url)
    filename = parts.path.replace("/", "_")
    if not filename or filename == "_":
        filename = "home"
    if parts.query:
        # Keep only filesystem-safe characters from the query string.
        filename += "_" + re.sub(r"[^A-Za-z0-9._-]+", "_", parts.query)
    with open(os.path.join(folder, f"{filename}.txt"), "w", encoding="utf-8") as f:
        f.write(f"URL: {url}\n\n{text}")

# --------------------------------------
# Crawl function with Requests
# --------------------------------------
def _normalize_url(url):
    """Drop the fragment and give a bare host the path "/" so equal pages compare equal."""
    parts = urlparse(url)._replace(fragment="")
    if parts.netloc and not parts.path:
        parts = parts._replace(path="/")
    return parts.geturl()


def crawl_static(url, visited, depth=0, max_depth=2):
    """Recursively fetch ``url`` and same-host pages it links to, saving their text.

    Each page is downloaded with ``requests``, cleaned with ``clean_text`` and
    written with ``save_text``. Links are then followed depth-first until
    ``max_depth`` is exceeded.

    URLs are normalised before the ``visited`` check: the ``#fragment`` is
    removed and a bare host gets the path ``/``. Without this the same page
    is fetched again for every in-page anchor and for ``host`` vs ``host/``.

    Args:
        url: Page to fetch.
        visited: Set of already-crawled (normalised) URLs; updated in place.
        depth: Current recursion depth (0 for the seed page).
        max_depth: Deepest level that is still fetched.
    """
    url = _normalize_url(url)
    if url in visited or depth > max_depth:
        return
    visited.add(url)

    # Only the network call can raise RequestException, so only it is guarded.
    try:
        print(f"[Requests] Fetching: {url}")
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url} with error: {e}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch {url}: Status code {response.status_code}")
        return

    # Binary payloads (PDFs, images, ...) decode to garbage text; skip them.
    # A missing Content-Type header is treated as text, as before.
    content_type = response.headers.get("Content-Type", "").lower()
    if content_type and not any(kind in content_type for kind in ("html", "xml", "text/")):
        print(f"Skipping non-text content at {url}: {content_type}")
        return

    html = response.text
    save_text(url, clean_text(html))
    soup = BeautifulSoup(html, 'lxml')

    # Recursively crawl other internal links. Relative hrefs are resolved
    # against the page that was actually served (after redirects), not the
    # site root, so "sub/page" on "/docs/" becomes "/docs/sub/page".
    site = urlparse(url).netloc
    for link in soup.find_all('a', href=True):
        abs_url = _normalize_url(urljoin(response.url, link['href']))
        if abs_url.startswith("http") and urlparse(abs_url).netloc == site:
            crawl_static(abs_url, visited, depth + 1, max_depth)

# --------------------------------------
# MAIN
# --------------------------------------
if __name__ == "__main__":
    # Shared record of pages already crawled, filled in by crawl_static.
    visited = set()
    start_url = "https://www.mosdac.gov.in"
    # Raise max_depth to follow links further away from the start page.
    crawl_static(url=start_url, visited=visited, max_depth=2)
    print("\n✅ Extraction completed. All texts saved in 'extracted_texts/' folder.")

[Requests] Fetching: https://www.mosdac.gov.in
[Requests] Fetching: https://www.mosdac.gov.in#main-content
[Requests] Fetching: https://www.mosdac.gov.in/internal/registration
[Requests] Fetching: https://www.mosdac.gov.in/internal/uops
[Requests] Fetching: https://www.mosdac.gov.in/internal/logout
[Requests] Fetching: https://www.mosdac.gov.in/
[Requests] Fetching: https://www.mosdac.gov.in/insat-3dr
[Requests] Fetching: https://www.mosdac.gov.in/insat-3d
[Requests] Fetching: https://www.mosdac.gov.in/kalpana-1
[Requests] Fetching: https://www.mosdac.gov.in/insat-3a
[Requests] Fetching: https://www.mosdac.gov.in/megha-tropiques
[Requests] Fetching: https://www.mosdac.gov.in/saral-altika
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-2
[Requests] Fetching: https://www.mosdac.gov.in/oceansat-3
[Requests] Fetching: https://www.mosdac.gov.in/insat-3ds
[Requests] Fetching: https://www.mosdac.gov.in/scatsat-1
[Requests] Fetching: https://www.mosdac.gov.in/internal/catalog-satellite


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, 'lxml')

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.text, 'lxml')


[Requests] Fetching: https://www.mosdac.gov.in/docs/STQC.pdf
[Requests] Fetching: https://www.mosdac.gov.in/mosdac-feedback
[Requests] Fetching: https://www.mosdac.gov.in/about-us
[Requests] Fetching: https://www.mosdac.gov.in/contact-us
[Requests] Fetching: https://www.mosdac.gov.in/copyright-policy
[Requests] Fetching: https://www.mosdac.gov.in/data-access-policy
[Requests] Fetching: https://www.mosdac.gov.in/hyperlink-policy
[Requests] Fetching: https://www.mosdac.gov.in/privacy-policy
[Requests] Fetching: https://www.mosdac.gov.in/website-policies
[Requests] Fetching: https://www.mosdac.gov.in/terms-conditions
[Requests] Fetching: https://www.mosdac.gov.in/faq-page

✅ Extraction completed. All texts saved in 'extracted_texts/' folder.


In [None]:
import os

def load_all_texts(folder="extracted_texts"):
    """Read every ``.txt`` file in ``folder`` and return their contents.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the document order (and everything
    derived from it) could differ between runs and machines.

    Args:
        folder: Directory to scan (non-recursively).

    Returns:
        list[str]: UTF-8 decoded contents, one item per file.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
    """
    all_docs = []
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Skip other extensions and any directory that happens to end in ".txt".
        if not name.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as f:
            all_docs.append(f.read())
    return all_docs

# Load the saved page texts from the default "extracted_texts" folder.
documents = load_all_texts()

In [None]:
import spacy

# Shared spaCy pipeline; read as a global by the triple-extraction and
# question-answering functions below.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1500000  # Max document length (presumably in characters — confirm in spaCy docs)

from tqdm import tqdm

def extract_triplets_from_chunk(chunk):
    """Extract at most one (subject, verb, object) triple per sentence of ``chunk``.

    The text is parsed with the module-level ``nlp`` pipeline. Within each
    sentence the last token whose dependency label contains "subj", the last
    whose label contains "obj", and the lemma of the last token tagged
    ``VERB`` are kept; a triple is emitted only when all three are present.

    Returns:
        list[tuple[str, str, str]]: ``(subject_text, verb_lemma, object_text)``
        in sentence order.
    """
    triples = []
    for sentence in nlp(chunk).sents:
        subject = relation = target = ""
        for tok in sentence:
            dep = tok.dep_
            # Later tokens overwrite earlier ones, so the last match wins.
            if "subj" in dep:
                subject = tok.text
            if "obj" in dep:
                target = tok.text
            if tok.pos_ == "VERB":
                relation = tok.lemma_
        if all((subject, target, relation)):
            triples.append((subject, relation, target))
    return triples

# Accumulates the triples from every document.
all_triplets = []

# Feed each document to the extractor in consecutive 1000-character slices.
for document in tqdm(documents):
    for start in range(0, len(document), 1000):
        piece = document[start:start + 1000]
        try:
            all_triplets.extend(extract_triplets_from_chunk(piece))
        except Exception as e:
            # Best effort: report the failing slice and carry on with the next.
            print("Skipping a chunk due to error:", e)

100%|██████████| 94/94 [05:02<00:00,  3.22s/it]


In [None]:
# Sanity check: report how many triples were extracted and show the first ten.
print(f"Total triplets extracted: {len(all_triplets)}")
print("Sample triplets:", all_triplets[:10])

Total triplets extracted: 1001
Sample triplets: [('Alerts', 'IST', 'NOWCAST'), ('Legends', 'use', 'satellites'), ('alerts', 'use', 'events'), ('Heavyrain', 'attach', 'document'), ('information', 'use', 'interpretation'), ('MOSDAC', 'incur', 'loss'), ('representations', 'mention', 'purpose'), ('we', 'include', 'limitation'), ('effort', 'run', 'website'), ('website', 'take', 'control')]


In [None]:
import networkx as nx

def build_kg(triplets):
    """Build a directed graph with one edge per (subject, relation, object) triple.

    Each triple adds an edge subject -> object whose ``label`` attribute is
    the relation.

    NOTE(review): a ``DiGraph`` presumably keeps a single edge per
    (subject, object) pair, so a later triple for the same pair would replace
    the earlier label — confirm against the networkx documentation.
    """
    graph = nx.DiGraph()
    for triple in triplets:
        head, relation, tail = triple
        graph.add_edge(head, tail, label=relation)
    return graph

# Knowledge graph over all extracted triples; read as a global by answer_question.
G = build_kg(all_triplets)

In [None]:
# Report the size of the knowledge graph.
print("Nodes in KG:", G.number_of_nodes())
print("Edges in KG:", G.number_of_edges())

# Optional: show the first ten edges together with their attribute dicts
print("Sample edges:", list(G.edges(data=True))[:10])

Nodes in KG: 693
Edges in KG: 660
Sample edges: [('Alerts', 'NOWCAST', {'label': 'IST'}), ('Alerts', 'grid', {'label': 'update'}), ('Legends', 'satellites', {'label': 'use'}), ('alerts', 'events', {'label': 'use'}), ('Heavyrain', 'document', {'label': 'attach'}), ('information', 'interpretation', {'label': 'use'}), ('information', 'development', {'label': 'contain'}), ('information', 'weather', {'label': 'use'}), ('information', 'Chanderet', {'label': 'find'}), ('information', 'pulse', {'label': 'base'})]


In [None]:
def answer_question(question):
    """Answer a "what does <subject> <verb>?" style question from the graph ``G``.

    The question is parsed with the module-level ``nlp`` pipeline. The last
    token whose dependency label contains "subj" is taken as the subject and
    the last non-subject token tagged ``VERB`` supplies the relation lemma.
    Every edge of ``G`` whose source matches the subject and whose ``label``
    equals the verb lemma (both case-insensitively) contributes its target.

    Graph nodes are stored as surface forms (``token.text``), so the subject
    is matched on both its surface form and its lemma. Matching on the lemma
    alone cannot find plural nodes such as "alerts" or "Legends", whose
    lemmas are typically the singular form.

    Args:
        question: Natural-language question string.

    Returns:
        list[str]: Target nodes of the matching edges, or a one-element list
        holding an apology message when nothing matches.
    """
    subject_forms = set()
    verb = None

    for token in nlp(question):
        if "subj" in token.dep_:
            subject_forms = {token.text.lower(), token.lemma_.lower()}
        elif token.pos_ == "VERB":
            verb = token.lemma_.lower()

    answers = [
        v
        for u, v, d in G.edges(data=True)
        if u.lower() in subject_forms and d['label'].lower() == verb
    ]

    return answers if answers else ["Sorry, I couldn't find the answer."]

In [None]:
def evaluate_qa(test_questions):
    """Run ``answer_question`` on each test question and print a per-item report.

    A question counts as correct when at least one expected answer appears
    (case-insensitively) among the predicted answers. Failures are marked
    with a cross rather than the check mark used for successes, and an empty
    question list prints a notice instead of dividing by zero.

    Args:
        test_questions: Sequence of ``(question, expected_answers)`` pairs,
            where ``expected_answers`` is a list of acceptable answers.
    """
    correct = 0
    for q, expected_answers in test_questions:
        pred = answer_question(q)
        # Lower-case the predictions once instead of once per expected answer.
        predicted = {p.lower() for p in pred}
        match = any(ans.lower() in predicted for ans in expected_answers)

        print(f"\n Question: {q}")
        print(f"   ➤ Expected: {expected_answers}")
        print(f"   ➤ Got: {pred}")
        print(f"   {'✅ Correct' if match else '❌ Incorrect'}")

        if match:
            correct += 1

    total = len(test_questions)
    if total == 0:
        print("\n📊 QA Accuracy: n/a (no test questions)")
        return
    accuracy = correct / total * 100
    print(f"\n📊 QA Accuracy: {accuracy:.2f}%")

In [None]:
# Evaluation set: each entry is (question, list of acceptable answers).
# evaluate_qa counts a question as correct when any one listed answer is predicted.
test_questions = [
    ("What does information contain?", ["development"]),
    ("What does information use?", ["weather", "interpretation"]),
    ("What does alerts update?", ["grid"]),
    ("What does alerts use?", ["events"]),
    ("What does legends use?", ["satellites"]),
    ("What does heavyrain attach?", ["document"]),
]

In [None]:
# Run the evaluation; per-question results and the overall accuracy are printed.
evaluate_qa(test_questions)


❓ Question: What does information contain?
   ➤ Expected: ['development']
   ➤ Got: ['development']
   ✅ Correct

❓ Question: What does information use?
   ➤ Expected: ['weather', 'interpretation']
   ➤ Got: ["Sorry, I couldn't find the answer."]
   ✅ Incorrect

❓ Question: What does alerts update?
   ➤ Expected: ['grid']
   ➤ Got: ["Sorry, I couldn't find the answer."]
   ✅ Incorrect

❓ Question: What does alerts use?
   ➤ Expected: ['events']
   ➤ Got: ["Sorry, I couldn't find the answer."]
   ✅ Incorrect

❓ Question: What does legends use?
   ➤ Expected: ['satellites']
   ➤ Got: ["Sorry, I couldn't find the answer."]
   ✅ Incorrect

❓ Question: What does heavyrain attach?
   ➤ Expected: ['document']
   ➤ Got: ['document']
   ✅ Correct

📊 QA Accuracy: 33.33%
