In [None]:
!pip install -q sentence-transformers
!pip install -q faiss-cpu
!pip install -q scikit-learn
!pip install -q nltk
!pip install -q beautifulsoup4
!pip install -q requests
!pip install -q lxml

print("✅ All packages installed successfully!")

✅ All packages installed successfully!


In [None]:
import os
import json
import re
import time
import requests
import numpy as np
import pickle
from typing import List, Dict, Tuple, Optional
from datetime import datetime
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

In [None]:
pip install neo4j langchain openai sentence-transformers faiss-cpu

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from t

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used below: sentence/word tokenizer models,
# the English stop-word list and WordNet (for WordNetLemmatizer).
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package, so both are requested; a package that is not
# available is skipped (download() returns False instead of raising).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

print("✅ All libraries imported and NLTK data downloaded!")

✅ All libraries imported and NLTK data downloaded!


In [None]:
def clean_text(html_content):
    """Return the visible text of an HTML document as one whitespace-normalised string.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before
    the text is extracted; text nodes are joined with single spaces.
    """
    parsed = BeautifulSoup(html_content, 'lxml')

    # Drop non-visible elements so their contents do not leak into the text.
    for hidden_tag in parsed(["script", "style", "noscript"]):
        hidden_tag.extract()

    visible_text = parsed.get_text(separator=' ', strip=True)
    # Collapse any remaining whitespace runs (newlines, tabs, ...) to one space.
    return re.sub(r'\s+', ' ', visible_text)

def save_text(url, text, folder="extracted_texts"):
    """Write ``text`` to ``<folder>/<name>.txt`` with a ``URL:`` header line.

    The file name is the URL path with ``/`` replaced by ``_`` (``home`` for
    the site root). When the URL has a query string, a sanitised copy of it is
    appended so that pages differing only in their query (``?id=1`` vs
    ``?id=2``) no longer overwrite each other. URLs without a query keep
    exactly the same file name as before.

    Args:
        url: Address the text was fetched from; stored in the file header.
        text: Extracted page text.
        folder: Output directory, created if missing.
    """
    os.makedirs(folder, exist_ok=True)

    parsed = urlparse(url)
    filename = parsed.path.replace("/", "_")
    if not filename or filename == "_":
        filename = "home"

    if parsed.query:
        # Keep only characters that are safe in file names on every platform.
        safe_query = re.sub(r'[^A-Za-z0-9._-]+', '_', parsed.query)
        filename = f"{filename}_{safe_query}"

    with open(os.path.join(folder, f"{filename}.txt"), "w", encoding="utf-8") as f:
        f.write(f"URL: {url}\n\n{text}")

def crawl_static(url, visited, depth=0, max_depth=2):
    """Recursively fetch ``url`` and same-host pages linked from it, saving their text.

    Each successfully fetched page (HTTP 200) is cleaned with ``clean_text``
    and stored with ``save_text``. Links are followed depth-first while
    ``depth <= max_depth``.

    Args:
        url: Page to fetch.
        visited: Set of URLs already attempted; updated in place.
        depth: Current recursion depth (0 for the start page).
        max_depth: Deepest level that is still fetched.
    """
    if url in visited or depth > max_depth:
        return
    visited.add(url)

    try:
        print(f"[Requests] Fetching: {url}")
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: Status code {response.status_code}")
            return
        html = response.text
        save_text(url, clean_text(html))

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url} with error: {e}")
        return

    # Links found at the deepest level would be rejected by the depth guard
    # above, so skip parsing them altogether.
    if depth >= max_depth:
        return

    # Recursively crawl other internal links
    soup = BeautifulSoup(html, 'lxml')
    site = urlparse(url).netloc
    for link in soup.find_all('a', href=True):
        # Resolve against the current page (not just scheme://host) so that
        # relative links such as "page2.html" keep their directory.
        target = urlparse(urljoin(url, link['href']))
        if target.scheme not in ("http", "https") or target.netloc != site:
            continue
        # Drop "#fragment" so in-page anchors do not count as new pages.
        crawl_static(target._replace(fragment="").geturl(), visited, depth + 1, max_depth)

print("✅ Web scraping functions defined!")

# Report whether an earlier crawl already left extracted pages on disk.
if not os.path.exists("extracted_texts"):
    print("❌ extracted_texts folder not found. Please run web scraping first.")
else:
    files = os.listdir("extracted_texts")
    print(f"✅ Found {len(files)} files in extracted_texts folder")

✅ Web scraping functions defined!
✅ Found 94 files in extracted_texts folder


In [None]:
class TextProcessor:
    """Text clean-up, sentence-based chunking and query normalisation helpers."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text: str) -> str:
        """Collapse whitespace and drop characters other than word characters,
        whitespace and the punctuation ``. , ! ? -``.

        Note that this also removes ``:`` and ``/``, so URLs embedded in the
        text lose their separators.
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\.\,\!\?\-]', '', text)
        return text.strip()

    @staticmethod
    def _overlap_tail(sentences: List[str], budget: int) -> List[str]:
        """Return the longest run of trailing ``sentences`` whose space-joined length is <= ``budget``."""
        carried: List[str] = []
        used = 0
        for sentence in reversed(sentences):
            extra = len(sentence) + (1 if carried else 0)  # +1 for the joining space
            if used + extra > budget:
                break
            carried.append(sentence)
            used += extra
        carried.reverse()
        return carried

    def chunk_text(self, text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """Split text into sentence-aligned, overlapping chunks for retrieval.

        Sentences are packed greedily until adding the next one would exceed
        ``chunk_size`` characters. When a chunk is closed, the trailing whole
        sentences of that chunk that fit within ``overlap`` characters are
        repeated at the start of the next chunk, as long as the next chunk
        still fits in ``chunk_size``. ``overlap=0`` disables the carry-over.

        Args:
            text: Text to split.
            chunk_size: Target maximum chunk length in characters. A single
                sentence longer than this still becomes its own chunk.
            overlap: Maximum number of characters repeated between
                consecutive chunks.

        Returns:
            The chunks, in document order, excluding chunks of 50 characters
            or fewer.
        """
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""
        current_sentences: List[str] = []

        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += " " + sentence
                current_sentences.append(sentence)
                continue

            if current_chunk:
                chunks.append(current_chunk.strip())

            # Carry over only as much as keeps the new chunk within chunk_size.
            budget = min(overlap, chunk_size - len(sentence) - 1)
            current_sentences = self._overlap_tail(current_sentences, budget) + [sentence]
            current_chunk = " ".join(current_sentences)

        if current_chunk:
            chunks.append(current_chunk.strip())

        # Filter out very short chunks
        return [chunk for chunk in chunks if len(chunk) > 50]

    def preprocess_query(self, query: str) -> str:
        """Lower-case the query, drop stop words and non-alphabetic tokens, and lemmatize the rest."""
        query = query.lower()
        words = word_tokenize(query)
        words = [self.lemmatizer.lemmatize(word) for word in words
                 if word not in self.stop_words and word.isalpha()]
        return " ".join(words)

# Initialize text processor
# This module-level instance is the one load_and_process_documents() uses
# to clean and chunk every file.
text_processor = TextProcessor()
print("✅ Text processor initialized!")

✅ Text processor initialized!


In [None]:
def load_and_process_documents(folder_path="extracted_texts"):
    """Load every ``.txt`` file in ``folder_path`` and split it into chunks.

    Files are expected in the layout written by ``save_text``: a first line
    ``URL: <address>``, a blank line, then the page text. Files without that
    header are treated as plain text and use the file name as their URL.
    Files are processed in sorted name order so the resulting chunk order
    (and therefore any index built from it) is reproducible across runs.

    Uses the module-level ``text_processor`` for cleaning and chunking.

    Args:
        folder_path: Directory containing the extracted text files.

    Returns:
        ``(documents, all_chunks, chunk_metadata)`` where ``documents`` is a
        list of per-file dicts (``url``, ``filename``, ``text``, ``title``,
        ``chunks``), ``all_chunks`` is the flat list of chunk strings and
        ``chunk_metadata[i]`` describes ``all_chunks[i]``. All three are
        empty when the folder does not exist.
    """
    documents = []
    all_chunks = []
    chunk_metadata = []

    if not os.path.exists(folder_path):
        print(f"❌ Folder {folder_path} not found!")
        return documents, all_chunks, chunk_metadata

    print(f"Loading and processing documents from {folder_path}...")

    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(folder_path, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            # Extract URL and text
            header, separator, body = content.partition('\n\n')
            if header.startswith('URL: '):
                url = header[len('URL: '):].strip()
            else:
                url = filename
            text = body if separator else content

            # Clean text
            cleaned_text = text_processor.clean_text(text)

            if len(cleaned_text) <= 100:  # Only keep substantial content
                continue

            # Create chunks
            chunks = text_processor.chunk_text(cleaned_text)

            # File names start with "_" (from the URL path), which would
            # otherwise leave a leading space in the title.
            title = " ".join(filename.replace('.txt', '').replace('_', ' ').title().split())

            documents.append({
                'url': url,
                'filename': filename,
                'text': cleaned_text,
                'title': title,
                'chunks': chunks
            })

            # Add chunks with metadata
            for chunk in chunks:
                all_chunks.append(chunk)
                chunk_metadata.append({
                    'url': url,
                    'title': title,
                    'filename': filename
                })

        except Exception as e:
            # Best effort: report the file and keep processing the others.
            print(f"Error processing {filename}: {e}")

    print(f"✅ Processed {len(documents)} documents into {len(all_chunks)} chunks")
    return documents, all_chunks, chunk_metadata

In [None]:
# Read the default "extracted_texts" folder; the three results are reused by
# later cells (index building, chat statistics, saved bot info).
documents, all_chunks, chunk_metadata = load_and_process_documents()

Loading and processing documents from extracted_texts...
✅ Processed 82 documents into 2227 chunks


In [None]:
class VectorDatabase:
    """Sentence-embedding store with a FAISS inner-product index over text chunks."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the SentenceTransformer model named ``model_name``."""
        print("Loading SentenceTransformer model...")
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.chunks = []
        self.metadata = []
        self.embeddings = None
        self.index = None
        print("✅ Model loaded successfully!")

    def build_index(self, chunks: List[str], metadata: List[Dict]):
        """Embed ``chunks`` and build a FAISS index over them.

        Args:
            chunks: Texts to index.
            metadata: One metadata dict per chunk, in the same order.

        Raises:
            ValueError: If ``chunks`` and ``metadata`` differ in length.
        """
        if len(chunks) != len(metadata):
            raise ValueError(
                f"chunks and metadata must have the same length "
                f"(got {len(chunks)} and {len(metadata)})"
            )

        self.chunks = chunks
        self.metadata = metadata

        if not chunks:
            # Drop any previous index so it cannot be searched against the
            # (now empty) chunk list.
            self.embeddings = None
            self.index = None
            print("❌ No chunks to process!")
            return

        print(f"Creating embeddings for {len(chunks)} chunks...")
        # Create embeddings with progress bar
        embeddings = self.model.encode(chunks, show_progress_bar=True)
        # FAISS operates on contiguous float32 arrays.
        self.embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Create FAISS index for fast similarity search
        dimension = self.embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity

        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(self.embeddings)
        index.add(self.embeddings)
        self.index = index

        print(f"✅ FAISS index created with {len(chunks)} vectors (dimension: {dimension})")

    def search(self, query: str, k: int = 5, threshold: float = 0.6) -> List[Tuple[str, float, Dict]]:
        """Return up to ``k`` chunks whose cosine similarity to ``query`` is >= ``threshold``.

        Returns:
            ``(chunk, score, metadata)`` tuples in the order FAISS returns
            them (best match first); empty when no index has been built.
        """
        if self.index is None:
            print("❌ Index not built yet!")
            return []

        # Create query embedding
        query_embedding = np.ascontiguousarray(self.model.encode([query]), dtype=np.float32)
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads with label -1 when fewer than k vectors exist; a
            # negative index would silently pick the last chunk.
            if idx < 0:
                continue
            if score >= threshold:
                position = int(idx)
                results.append((
                    self.chunks[position],
                    float(score),
                    self.metadata[position]
                ))

        return results

    def save_index(self, path="vector_db"):
        """Write the FAISS index plus pickled chunks and metadata into ``path``."""
        os.makedirs(path, exist_ok=True)

        if self.index is None:
            print("❌ Index not built yet!")
            return

        faiss.write_index(self.index, os.path.join(path, "index.faiss"))

        with open(os.path.join(path, "chunks.pkl"), 'wb') as f:
            pickle.dump(self.chunks, f)

        with open(os.path.join(path, "metadata.pkl"), 'wb') as f:
            pickle.dump(self.metadata, f)

        print(f"✅ Vector database saved to {path}")

In [None]:
# Load the default embedding model, then embed and index every chunk
# produced by load_and_process_documents().
vector_db = VectorDatabase()
vector_db.build_index(all_chunks, chunk_metadata)

Loading SentenceTransformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully!
Creating embeddings for 2227 chunks...


Batches:   0%|          | 0/70 [00:00<?, ?it/s]

✅ FAISS index created with 2227 vectors (dimension: 384)


In [None]:
class MOSDACBot:
    """Retrieval-based question answering over the indexed MOSDAC pages.

    A query is expanded with related terms, matched against the vector
    database, re-scored with simple keyword/title heuristics and answered by
    stitching together the most relevant retrieved sentences.
    """

    # Score added per distinct domain keyword mentioned in a chunk.
    DOMAIN_KEYWORD_BOOST = 0.05
    # Score added per query word that also occurs in the source title.
    TITLE_WORD_BOOST = 0.03

    def __init__(self, vector_db: "VectorDatabase"):
        self.vector_db = vector_db
        self.text_processor = TextProcessor()
        self.conversation_history = []

        # MOSDAC-specific domain knowledge
        self.domain_keywords = [
            "ISRO", "INSAT", "IRS", "Kalpana", "INSAT-3D", "SCATSAT", "OCEANSAT",
            "meteorological", "oceanographic", "land", "atmosphere", "satellite",
            "DPS", "VEDAS", "Bhuvan", "imagery", "weather", "climate", "data",
            "download", "registration", "portal", "dataset", "product", "MOSDAC"
        ]

        # Query expansion dictionary
        self.query_expansions = {
            "satellite": ["remote sensing", "earth observation", "imagery", "data"],
            "weather": ["meteorological", "climate", "atmospheric", "temperature"],
            "data": ["dataset", "information", "records", "files", "products"],
            "download": ["access", "retrieve", "obtain", "get"],
            "registration": ["signup", "account", "login", "user", "register"],
            "MOSDAC": ["ISRO", "satellite", "data", "portal", "meteorological"]
        }

    @staticmethod
    def _tokens(text: str) -> set:
        """Return the set of lower-cased word tokens in ``text`` (punctuation removed)."""
        return set(re.findall(r"\w+", text.lower()))

    def _content_words(self, text: str) -> set:
        """Return the tokens of ``text`` without stop words.

        Falls back to all tokens when the text consists only of stop words,
        so a query never ends up with nothing to match on.
        """
        words = self._tokens(text)
        stop_words = getattr(getattr(self, "text_processor", None), "stop_words", None) or set()
        return (words - set(stop_words)) or words

    @staticmethod
    def _mentions(text_lower: str, keyword: str) -> bool:
        """True when ``keyword`` (or its plain plural) occurs in ``text_lower`` as a whole word."""
        pattern = r"\b" + re.escape(keyword.lower()) + r"(?:s|es)?\b"
        return re.search(pattern, text_lower) is not None

    def expand_query(self, query: str) -> str:
        """Append up to two related terms for every expansion key contained in the query."""
        expanded_terms = []
        query_lower = query.lower()

        for key, expansions in self.query_expansions.items():
            if key.lower() in query_lower:
                expanded_terms.extend(expansions[:2])  # Add top 2 related terms

        if expanded_terms:
            return f"{query} {' '.join(expanded_terms)}"
        return query

    def enhance_results_with_domain_knowledge(self, results: List[Tuple[str, float, Dict]],
                                            query: str) -> List[Tuple[str, float, Dict]]:
        """Re-score search results with domain-keyword and title heuristics.

        Each distinct domain keyword mentioned in a chunk adds 0.05 and each
        non-stop-word shared by the query and the source title adds 0.03;
        the total is capped at 1.0. Keywords are matched as whole words, so
        "IRS" no longer matches "first" and "land" no longer matches "island".

        Returns:
            The results with adjusted scores, sorted best first.
        """
        enhanced_results = []
        query_words = self._content_words(query)

        for chunk, score, metadata in results:
            chunk_lower = chunk.lower()

            # Boost score for domain-specific keywords
            matched = sum(1 for keyword in self.domain_keywords
                          if self._mentions(chunk_lower, keyword))
            domain_boost = matched * self.DOMAIN_KEYWORD_BOOST

            # Boost for title relevance
            title_boost = 0.0
            title = metadata.get('title')
            if title:
                overlap = len(self._tokens(title) & query_words)
                title_boost = overlap * self.TITLE_WORD_BOOST

            enhanced_score = min(score + domain_boost + title_boost, 1.0)
            enhanced_results.append((chunk, enhanced_score, metadata))

        # Sort by enhanced score
        return sorted(enhanced_results, key=lambda x: x[1], reverse=True)

    def generate_answer(self, query: str, results: List[Tuple[str, float, Dict]]) -> str:
        """Build an answer from the sentences of the top three results.

        Sentences longer than 30 characters are ranked by how many query
        content words (punctuation stripped, stop words ignored) they
        contain; the best four distinct sentences are joined. When no
        sentence shares a word with the query, the first 300 characters of
        the top chunk are returned instead.
        """
        if not results:
            return ("I couldn't find relevant information about your query in the MOSDAC "
                   "website content. Please try rephrasing your question or visit "
                   "www.mosdac.gov.in for more information.")

        # Extract most relevant sentences
        top_chunks = [chunk for chunk, _score, _metadata in results[:3]]
        query_words = self._content_words(query)
        scored_sentences = []
        seen = set()

        for chunk in top_chunks:
            for sentence in sent_tokenize(chunk):
                if len(sentence.strip()) <= 30:
                    continue

                # The same sentence often appears in several chunks/pages.
                fingerprint = " ".join(sentence.lower().split())
                if fingerprint in seen:
                    continue

                # Calculate sentence relevance
                overlap = len(query_words & self._tokens(sentence))
                if overlap > 0:
                    seen.add(fingerprint)
                    scored_sentences.append((sentence, overlap))

        if scored_sentences:
            # Sort by relevance and create answer (stable: ties keep chunk order)
            scored_sentences.sort(key=lambda x: x[1], reverse=True)
            top_sentences = [sent for sent, _ in scored_sentences[:4]]
            return " ".join(top_sentences)
        else:
            return f"Based on the available information: {top_chunks[0][:300]}..."

    def chat(self, query: str, max_results: int = 5) -> Dict:
        """Answer ``query`` and record the exchange in ``conversation_history``.

        Returns:
            A dict with the keys ``query``, ``answer``, ``sources``,
            ``confidence`` (mean enhanced score of the sources),
            ``response_time`` (seconds), ``sources_found``,
            ``query_expanded`` and ``timestamp``.
        """
        start_time = time.time()

        # Expand query for better search
        expanded_query = self.expand_query(query)

        # Search vector database
        results = self.vector_db.search(expanded_query, k=max_results, threshold=0.5)

        # Enhance results with domain knowledge
        enhanced_results = self.enhance_results_with_domain_knowledge(results, query)

        # Generate answer
        answer = self.generate_answer(query, enhanced_results)

        # Prepare sources
        sources = []
        for chunk, score, metadata in enhanced_results:
            sources.append({
                'title': metadata['title'],
                'url': metadata['url'],
                'confidence': round(score, 3),
                'preview': chunk[:150] + "..." if len(chunk) > 150 else chunk
            })

        # Calculate metrics
        avg_confidence = sum(score for _, score, _ in enhanced_results) / len(enhanced_results) if enhanced_results else 0.0
        response_time = time.time() - start_time

        response = {
            'query': query,
            'answer': answer,
            'sources': sources,
            'confidence': round(avg_confidence, 3),
            'response_time': round(response_time, 3),
            'sources_found': len(sources),
            'query_expanded': expanded_query != query,
            'timestamp': datetime.now().isoformat()
        }

        # Add to conversation history
        self.conversation_history.append(response)

        return response

In [None]:
# Module-level bot instance; comprehensive_evaluation(), interactive_chat()
# and quick_search() all read this global rather than taking a parameter.
bot = MOSDACBot(vector_db)
print("✅ MOSDAC Bot initialized and ready!")

✅ MOSDAC Bot initialized and ready!


In [None]:
def comprehensive_evaluation():
    """Ask the global ``bot`` a fixed set of questions and print summary metrics.

    Prints per-query confidence, latency, source count and expansion flag,
    followed by the averages, the share of queries with confidence above 0.7
    and an overall rating based on the mean confidence.

    Returns:
        The list of response dicts returned by ``bot.chat``, one per query.
    """
    test_queries = [
        "What is MOSDAC?",
        "How to access satellite data?",
        "What are the available datasets?",
        "How to register for data access?",
        "How to download meteorological data?",
        "What is INSAT satellite?",
        "How to get weather information?",
        "How to contact MOSDAC support?",
        "What data products are available?",
        "How to access oceanographic data?",
    ]

    print("🔍 Running Comprehensive Bot Evaluation")
    print("=" * 50)

    all_results = []
    for number, question in enumerate(test_queries, start=1):
        print(f"\n{number:2d}. Testing: {question}")

        reply = bot.chat(question)
        all_results.append(reply)

        expanded_flag = 'Yes' if reply['query_expanded'] else 'No'
        print(f"    ✓ Confidence: {reply['confidence']:.3f}")
        print(f"    ✓ Response Time: {reply['response_time']:.3f}s")
        print(f"    ✓ Sources: {reply['sources_found']}")
        print(f"    ✓ Query Expanded: {expanded_flag}")

    # Aggregate once all queries have been answered.
    query_count = len(test_queries)
    total_time = sum(reply['response_time'] for reply in all_results)
    high_confidence_count = sum(1 for reply in all_results if reply['confidence'] > 0.7)

    avg_confidence = np.mean([reply['confidence'] for reply in all_results])
    avg_response_time = np.mean([reply['response_time'] for reply in all_results])
    avg_sources = np.mean([reply['sources_found'] for reply in all_results])
    success_rate = (high_confidence_count / query_count) * 100

    print("\n📊 EVALUATION RESULTS")
    print("=" * 30)
    print(f"Total Queries Tested: {query_count}")
    print(f"Average Confidence: {avg_confidence:.3f}")
    print(f"Average Response Time: {avg_response_time:.3f}s")
    print(f"Average Sources per Query: {avg_sources:.1f}")
    print(f"High Confidence Queries (>0.7): {high_confidence_count}/{query_count} ({success_rate:.1f}%)")
    print(f"Total Evaluation Time: {total_time:.2f}s")

    # Rating bands, checked from the highest floor down.
    performance = "🔴 NEEDS IMPROVEMENT"
    for floor, label in ((0.8, "🟢 EXCELLENT"), (0.6, "🟡 GOOD")):
        if avg_confidence >= floor:
            performance = label
            break

    print(f"Overall Performance: {performance}")

    return all_results

In [None]:
# Keep the per-query responses; a later cell writes them to evaluation_results.json.
evaluation_results = comprehensive_evaluation()

🔍 Running Comprehensive Bot Evaluation

 1. Testing: What is MOSDAC?
    ✓ Confidence: 0.925
    ✓ Response Time: 0.024s
    ✓ Sources: 5
    ✓ Query Expanded: Yes

 2. Testing: How to access satellite data?
    ✓ Confidence: 0.826
    ✓ Response Time: 0.020s
    ✓ Sources: 5
    ✓ Query Expanded: Yes

 3. Testing: What are the available datasets?
    ✓ Confidence: 0.746
    ✓ Response Time: 0.019s
    ✓ Sources: 2
    ✓ Query Expanded: Yes

 4. Testing: How to register for data access?
    ✓ Confidence: 0.727
    ✓ Response Time: 0.018s
    ✓ Sources: 1
    ✓ Query Expanded: Yes

 5. Testing: How to download meteorological data?
    ✓ Confidence: 0.753
    ✓ Response Time: 0.019s
    ✓ Sources: 5
    ✓ Query Expanded: Yes

 6. Testing: What is INSAT satellite?
    ✓ Confidence: 0.918
    ✓ Response Time: 0.018s
    ✓ Sources: 5
    ✓ Query Expanded: Yes

 7. Testing: How to get weather information?
    ✓ Confidence: 0.691
    ✓ Response Time: 0.017s
    ✓ Sources: 5
    ✓ Query Expand

In [None]:
def interactive_chat():
    """Run a console question/answer loop against the global ``bot``.

    Commands: ``quit``/``exit``/``q`` leave the loop, ``help`` lists sample
    questions and ``stats`` prints index and conversation statistics (read
    from the globals ``documents``, ``all_chunks`` and ``bot``). Any other
    non-empty input is sent to ``bot.chat`` and the answer, metrics and up to
    three sources are printed.

    The loop ends on Ctrl-C, on end of input, or when ``input()`` itself
    fails (for example when the kernel has no stdin); errors while answering
    a question are reported and the loop continues.
    """
    print("\n🤖 MOSDAC Bot - Interactive Chat")
    print("=" * 40)
    print("Ask me anything about MOSDAC!")
    print("Commands:")
    print("  'quit' or 'exit' - Exit chat")
    print("  'help' - Show sample questions")
    print("  'stats' - Show bot statistics")
    print("-" * 40)

    sample_questions = [
        "What is MOSDAC?",
        "How do I download satellite data?",
        "What datasets are available?",
        "How to register for access?",
        "What is INSAT satellite?",
        "How to access weather data?",
        "What is the data policy?",
        "How to use VEDAS portal?"
    ]

    while True:
        # Reading input is handled separately: if it fails once it will fail
        # on every retry, so "report and continue" would loop forever.
        try:
            user_input = input("\n🔍 Your question: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\n\n👋 Chat interrupted. Goodbye!")
            break
        except Exception as e:
            print(f"\n❌ Error: {e}")
            break

        try:
            command = user_input.lower()

            if command in ['quit', 'exit', 'q']:
                print("\n👋 Thank you for using MOSDAC Bot!")
                print(f"💬 Total questions asked: {len(bot.conversation_history)}")
                break

            if command == 'help':
                print("\n💡 Sample questions you can ask:")
                for i, q in enumerate(sample_questions, 1):
                    print(f"   {i}. {q}")
                continue

            if command == 'stats':
                print(f"\n📈 Bot Statistics:")
                print(f"   Documents indexed: {len(documents)}")
                print(f"   Text chunks: {len(all_chunks)}")
                print(f"   Conversations: {len(bot.conversation_history)}")
                if bot.conversation_history:
                    avg_conf = np.mean([r['confidence'] for r in bot.conversation_history])
                    print(f"   Average confidence: {avg_conf:.3f}")
                continue

            if not user_input:
                print("Please enter a question.")
                continue

            # Get bot response
            print("\n🔄 Searching MOSDAC knowledge base...")
            response = bot.chat(user_input)

            # Display response
            print(f"\n💡 Answer:")
            print(f"{response['answer']}")

            print(f"\n📊 Response Metrics:")
            print(f"   Confidence: {response['confidence']:.3f}")
            print(f"   Response Time: {response['response_time']:.3f}s")
            print(f"   Sources Found: {response['sources_found']}")

            if response['sources']:
                print(f"\n📚 Top Sources:")
                for i, source in enumerate(response['sources'][:3], 1):
                    print(f"   {i}. {source['title']} (confidence: {source['confidence']:.3f})")
                    if source['url'].startswith('http'):
                        print(f"      URL: {source['url']}")

        except KeyboardInterrupt:
            print("\n\n👋 Chat interrupted. Goodbye!")
            break
        except Exception as e:
            # A failure while answering one question should not end the session.
            print(f"\n❌ Error: {e}")
            continue

print("✅ Interactive chat ready!")

✅ Interactive chat ready!


In [None]:
# Persist the FAISS index together with the pickled chunks and metadata.
vector_db.save_index("mosdac_vector_db")

✅ Vector database saved to mosdac_vector_db


In [None]:
# Save evaluation results
# `evaluation_results` is the list returned by comprehensive_evaluation().
with open("evaluation_results.json", "w") as f:
    json.dump(evaluation_results, f, indent=2)

# Save bot configuration and stats
# NOTE(review): model_name, chunk_size and chunk_overlap are hard-coded here and
# mirror the defaults of VectorDatabase.__init__ and TextProcessor.chunk_text;
# they are not read from the live objects, so keep them in sync by hand.
bot_info = {
    "total_documents": len(documents),
    "total_chunks": len(all_chunks),
    "vector_dimension": vector_db.embeddings.shape[1] if vector_db.embeddings is not None else 0,
    "model_name": "all-MiniLM-L6-v2",
    "chunk_size": 512,
    "chunk_overlap": 50,
    "domain_keywords": bot.domain_keywords,
    "creation_date": datetime.now().isoformat()
}

with open("bot_info.json", "w") as f:
    json.dump(bot_info, f, indent=2)

print("✅ All data saved!")

# Utility functions
def quick_search(query: str, show_details: bool = True):
    """Send one query to the global ``bot`` and return its response dict.

    When ``show_details`` is true, the query, answer, confidence and number
    of sources are printed before returning.
    """
    response = bot.chat(query)

    if not show_details:
        return response

    summary_lines = [
        f"Query: {query}",
        f"Answer: {response['answer']}",
        f"Confidence: {response['confidence']:.3f}",
        f"Sources: {response['sources_found']}",
    ]
    print("\n".join(summary_lines))

    return response

✅ All data saved!


In [None]:
# Starts the blocking console loop; enter 'quit', 'exit' or 'q' to leave it.
interactive_chat()


🤖 MOSDAC Bot - Interactive Chat
Ask me anything about MOSDAC!
Commands:
  'quit' or 'exit' - Exit chat
  'help' - Show sample questions
  'stats' - Show bot statistics
----------------------------------------

🔄 Searching MOSDAC knowledge base...

💡 Answer:
It is a ISRO data portal which provides data through its web based service httpsmosdac.gov.in How to be a registered user of MOSDAC? MOSDAC is the short form of Meteorological and Oceanographic Satellite Data Archival Center. There is SignUp form available on MOSDAC portal. SAC is responsible for the development, realization and qualification of communication, navigation, earth observation and planetary payloads and related data processing and ground systems in the areas of communications, broadcasting, remote sensing and disaster monitoring  mitigation.

📊 Response Metrics:
   Confidence: 0.925
   Response Time: 0.032s
   Sources Found: 5

📚 Top Sources:
   1.  Faq-Page (confidence: 0.987)
      URL: https://www.mosdac.gov.in/faq-