<a href="https://colab.research.google.com/github/Cloud-Course-Group-Phoenix/Project-Pheonix/blob/main/Logic/SearchService.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup for the search-service notebook (IPython/Colab cell).
# Installs importnb, makes sure a checkout of the project exists under
# /content, switches it to the 'main' branch, and imports the sibling
# notebooks plus the third-party libraries used by the cells below.
import os, sys
%pip install -q importnb

# Everything below is best-effort: any exception is printed by the handler at
# the bottom and then swallowed, so the names imported inside this block may
# be undefined afterwards if setup failed part-way through.
try:
    # Clone the GitHub repository if not already present
    if not os.path.exists("/content/Project-Pheonix"):
        !git clone https://github.com/Cloud-Course-Group-Phoenix/Project-Pheonix.git /content/Project-Pheonix

    # Change directory to project root
    %cd /content/Project-Pheonix

    # Fetch from origin, then check out the 'main' branch
    !git fetch origin -q
    !git checkout main -q

    # Add the project's Logic directory to the Python import path
    sys.path.append("/content/Project-Pheonix/Logic")

    # Import the sibling notebooks as modules; the imports are done inside
    # importnb's Notebook() context (presumably resolving to Admin.ipynb and
    # CloudDB.ipynb in the Logic directory added above - verify).
    from importnb import Notebook
    with Notebook():
        import Admin as admin
        import CloudDB as dbService
    # Third-party / stdlib helpers used for parsing and stemming
    from bs4 import BeautifulSoup
    import nltk
    import re
    from urllib.parse import urljoin, urlparse
    from nltk.stem import PorterStemmer
except Exception as e:
    print("❌ Setup failed:", str(e))

# Site to index
url = 'https://mqtt.org/'


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCloning into '/content/Project-Pheonix'...
remote: Enumerating objects: 515, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 515 (delta 55), reused 13 (delta 13), pack-reused 425 (from 1)[K
Receiving objects: 100% (515/515), 1.89 MiB | 5.80 MiB/s, done.
Resolving deltas: 100% (282/282), done.
/content/Project-Pheonix
/content/Project-Pheonix
/content/Project-Pheonix


In [2]:
def search_word(query):
    """Search the stored term index for *query* and format the outcome.

    The index is read with ``dbService.get_from_db('terms')`` and is used as a
    mapping ``stem -> original word form -> {"Appearances": int,
    "DocIDs": [url, ...]}``.  Each query word is stemmed with a Porter
    stemmer; every URL listed under any word form sharing that stem counts as
    a match for the query word.

    NOTE(review): the "Appearances" value of a word form is added unchanged to
    every URL in its "DocIDs"; if the indexer stores a global total rather
    than a per-document count, the per-URL scores are approximations.

    Args:
        query: Raw search string entered by the user.

    Returns:
        A plain-text message when the query is empty or has no word
        characters, when no index is stored, or when nothing matches (the
        latter may include up to five "did you mean" suggestions).  Otherwise
        an HTML fragment listing every matching URL, ordered by relevance
        score, with a title/snippet preview fetched for the ten best results.
    """
    # Local import keeps the block self-contained; only the standard library.
    from html import escape

    if not query:
        return "🔎 Enter your search terms above\n\nSearch for MQTT related terms like 'broker', 'publish', 'subscribe', etc."
    index = dbService.get_from_db('terms')
    if not index:
        return "⚠️ Index not found\n\nNo index found in the database. Please run the indexing process first from the Admin Dashboard."

    # Track search terms for analytics
    admin.track_search_terms(query)

    # Split into lowercase word tokens.  dict.fromkeys drops repeated words
    # (keeping first-seen order) so a repeated term is not counted twice in
    # the per-URL scores or in the "Found X of Y search terms" summary.
    words = list(dict.fromkeys(re.findall(r'\w+', query.lower())))
    if not words:
        return "⚠️ Invalid search\n\nPlease enter valid search terms."

    stemmer = PorterStemmer()

    all_results = {}               # url -> query words that matched it
    word_exact_appearances = {}    # query word -> appearances of that exact form
    stemmed_word_appearances = {}  # query word -> appearances of all forms of its stem
    words_not_found = []
    url_to_word_appearances = {}   # url -> query word -> {"exact": n, "stemmed": n}

    def counts_for(doc_id, term):
        # Fetch (creating on first use) the counter pair for a url/word.
        per_url = url_to_word_appearances.setdefault(doc_id, {})
        return per_url.setdefault(term, {"exact": 0, "stemmed": 0})

    for word in words:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in index:
            words_not_found.append(word)
            continue
        forms = index[stemmed_word]

        # Appearances of the word exactly as typed, if that form is indexed.
        exact_appearances = 0
        if word in forms:
            exact_appearances = forms[word].get("Appearances", 0)
            for doc_id in forms[word].get("DocIDs", []):
                counts_for(doc_id, word)["exact"] += exact_appearances

        # Every form sharing the stem contributes URLs and appearances.
        # A dict is used as an ordered set so that equally-scored URLs keep a
        # deterministic (index) order instead of depending on set hashing.
        matched_urls = {}
        total_appearances = 0
        for data in forms.values():
            appearances = data.get("Appearances", 0)
            total_appearances += appearances
            for doc_id in data.get("DocIDs", []):
                matched_urls[doc_id] = None
                counts_for(doc_id, word)["stemmed"] += appearances

        word_exact_appearances[word] = exact_appearances
        stemmed_word_appearances[word] = total_appearances
        for url in matched_urls:
            all_results.setdefault(url, []).append(word)

    if not all_results:
        no_results = f"😕 No results found\n\nNo results found for: {', '.join(words)}"
        related_terms = _suggest_terms(index, words)
        if related_terms:
            no_results += "\n\nDid you mean one of these terms?\n"
            no_results += "".join(f"• {term}\n" for term in related_terms)
        return no_results

    total_urls = len(all_results)
    total_exact_appearances = sum(word_exact_appearances.values())
    total_stemmed_appearances = sum(stemmed_word_appearances.values())

    # Relevance: 50 per matched query word, 3 per exact appearance and 1 per
    # appearance of a different form of the same stem.
    url_relevance_scores = {}
    for url, matched_words in all_results.items():
        score = len(matched_words) * 50
        for word in matched_words:
            counts = url_to_word_appearances.get(url, {}).get(word)
            if counts is not None:
                score += counts["exact"] * 3
                score += counts["stemmed"] - counts["exact"]
        url_relevance_scores[url] = score

    sorted_results = sorted(url_relevance_scores.items(), key=lambda x: x[1], reverse=True)

    # Fetch previews for the top results only (limited to 10 for performance).
    url_content_previews = {
        url: _fetch_preview(url, words) for url, _ in sorted_results[:10]
    }

    # Build the search results as HTML.  Titles and snippets come from fetched
    # pages and URLs from the index, so all three are escaped before being
    # interpolated.  Query words are \w+ tokens and cannot contain markup.
    parts = [
        "<h3>🔍 Search Results</h3>",
        f"<p>Found {len(words) - len(words_not_found)} of {len(words)} search terms across {total_urls} pages<br>",
        f"Found {total_exact_appearances} exact matches and {total_stemmed_appearances - total_exact_appearances} related word forms</p>",
    ]
    for i, (url, _) in enumerate(sorted_results, 1):
        preview = url_content_previews.get(url)
        if preview is None:
            # Results beyond the top 10 never had a preview fetched.
            preview = {"title": _fallback_title(url), "snippet": "No preview available"}
        safe_url = escape(str(url), quote=True)

        parts.append("<div style='margin-bottom: 15px;'>")
        parts.append(f"<h4>{i}. {escape(str(preview['title']))}</h4>")
        parts.append(f"<a href='{safe_url}' target='_blank' style='color: #1a0dab;'>{safe_url}</a><br>")
        parts.append(f"<p>{escape(str(preview['snippet']))}</p>")
        parts.append(f"<p><b>Matching terms:</b> {', '.join(all_results[url])}</p>")
        parts.append("</div>")

    if words_not_found:
        parts.append(f"<p><b>Terms not found:</b> {', '.join(words_not_found)}</p>")

    return "".join(parts)


def _suggest_terms(index, words, limit=5):
    """Return up to *limit* indexed word forms whose stem shares the first two
    letters of a query word longer than two characters."""
    suggestions = []
    for word in words:
        if len(word) <= 2:
            continue
        prefix = word[:2]
        for stemmed_word, forms in index.items():
            if not stemmed_word.startswith(prefix):
                continue
            for original_form in forms:
                if original_form not in suggestions:
                    suggestions.append(original_form)
                    if len(suggestions) >= limit:
                        # Stop scanning entirely once enough were collected.
                        return suggestions
    return suggestions


def _fallback_title(url):
    """Return the last non-empty path segment of *url*, or *url* itself."""
    text = str(url)
    return text.rstrip('/').rsplit('/', 1)[-1] or text


def _fetch_preview(url, words, timeout=2):
    """Download *url* and return ``{"title": ..., "snippet": ...}``.

    The snippet is the first paragraph/div/heading text containing one of the
    (lowercase) *words*, else the first such text longer than 30 characters,
    truncated to 200 characters.  This is best-effort: a non-200 response or
    any error yields a placeholder preview instead of raising.
    """
    fallback = {"title": _fallback_title(url), "snippet": "No preview available"}
    try:
        # Imported here because the notebook's setup cell does not import
        # requests; without this the lookup below raised NameError, which the
        # broad handler turned into a placeholder for every single result.
        import requests

        response = requests.get(url, timeout=timeout)  # Short timeout to prevent hanging
        if response.status_code != 200:
            return fallback
        soup = BeautifulSoup(response.text, 'html.parser')

        # <title>.string is None when the tag is empty or has nested markup.
        raw_title = soup.title.string if soup.title else None
        title = (raw_title or "").strip() or fallback["title"]

        nodes = soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        texts = [text for text in (node.get_text(strip=True) for node in nodes) if text]

        snippet = ""
        for text in texts:
            lowered = text.lower()
            if any(word in lowered for word in words):
                snippet = text
                break
        if not snippet:
            # No text mentions a search term: take the first substantial one.
            snippet = next((text for text in texts if len(text) > 30), "")

        if len(snippet) > 200:
            snippet = snippet[:200] + "..."
        return {"title": title, "snippet": snippet or "No preview available"}
    except Exception:
        # Deliberate catch-all: network, parsing or import problems must not
        # break the search itself, only degrade the preview.
        return fallback
