<a href="https://colab.research.google.com/github/Cloud-Course-Group-Phoenix/Project-Pheonix/blob/main/Logic/Indexmqtt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, sys

# Notebook bootstrap: fetch the project sources, make the Logic directory
# importable, and pull in every dependency used by the cells below.
# The `!` and `%` lines are IPython magics, so this cell only runs inside a
# notebook environment.
try:
    #Clone the GitHub repository if not already present
    if not os.path.exists("/content/Project-Pheonix"):
        !git clone https://github.com/Cloud-Course-Group-Phoenix/Project-Pheonix.git /content/Project-Pheonix

    # Change directory to project root
    %cd /content/Project-Pheonix

    # Checkout the 'main' branch
    !git fetch origin -q
    !git checkout main -q

    # Add project directory to Python path
    sys.path.append("/content/Project-Pheonix/Logic")
    # importnb provides the Notebook() context manager used for the CloudDB
    # import below; presumably CloudDB is a notebook under Logic/ — verify.
    %pip install -q importnb
    from importnb import Notebook
    with Notebook():
        import CloudDB as dbService
    from bs4 import BeautifulSoup
    import nltk
    import requests
    import re
    from urllib.parse import urljoin, urlparse
    from nltk.stem import PorterStemmer
    from nltk.corpus import stopwords
except Exception as e:
    # NOTE(review): a failure is only printed here; execution continues, so
    # the statements below raise NameError if the imports never completed.
    print("❌ Setup failed:", str(e))

# Download Stop words
nltk.download('stopwords', quiet=True)
# Site to index
# Module-level setting read by index_mqtt_website().
url = 'https://mqtt.org/'

Cloning into '/content/Project-Pheonix'...
remote: Enumerating objects: 511, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 511 (delta 52), reused 13 (delta 13), pack-reused 425 (from 1)[K
Receiving objects: 100% (511/511), 1.89 MiB | 13.99 MiB/s, done.
Resolving deltas: 100% (279/279), done.
/content/Project-Pheonix
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
class QueryService:
    """Fetch one page and maintain a stemmed inverted index of its words.

    The index is a nested dict shaped as
    ``{stem: {original_word: {"Appearances": int, "DocIDs": [url, ...]}}}``.
    """

    # Seconds to wait for the server, so a stalled connection cannot hang an
    # indexing run forever (requests has no timeout unless one is given).
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Remember the page *url* and create the stemmer used for indexing."""
        self.url = url
        self.stemmer = PorterStemmer()

    def fetch_page(self):
        """Download ``self.url`` and parse it as HTML.

        Returns:
            The parsed ``BeautifulSoup`` tree when the response status is
            200, otherwise ``None``.

        Raises:
            requests.RequestException: propagated from ``requests.get``
                (connection failures, timeouts).
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def index_words(self, soup, index=None, url=''):
        """Add every word found in *soup* to *index*.

        Words are the ``\\w+`` runs of ``soup.get_text()``, lower-cased and
        grouped under their stem.

        Args:
            soup: Object exposing ``get_text()`` (a parsed page).
            index: Existing index to extend in place. A fresh dict is
                created when omitted, so separate calls never share state.
            url: Document identifier recorded in ``"DocIDs"``; an empty
                value records no document.

        Returns:
            The updated index (the same object that was passed in, if any).
        """
        # A literal ``{}`` default would be created once and shared by every
        # call that omits *index*, leaking counts between unrelated calls.
        if index is None:
            index = {}

        for raw_word in re.findall(r'\w+', soup.get_text()):
            word = raw_word.lower()
            stemmed_word = self.stemmer.stem(word)

            # All original spellings that reduce to this stem.
            word_forms = index.setdefault(stemmed_word, {})
            entry = word_forms.get(word)
            if entry is None:
                word_forms[word] = {
                    "Appearances": 1,
                    "DocIDs": [url] if url else [],
                }
                continue

            entry["Appearances"] += 1
            if url and url not in entry["DocIDs"]:
                entry["DocIDs"].append(url)

        return index

    def remove_stop_words(self, index):
        """Drop every stem of an English stop word from *index*.

        The index is modified in place and also returned.
        """
        # Index keys are stems, so the stop words must be stemmed the same way.
        stemmed_stop_words = {
            self.stemmer.stem(stop_word)
            for stop_word in stopwords.words('english')
        }
        for stemmed_stop_word in stemmed_stop_words:
            index.pop(stemmed_stop_word, None)

        return index

class Crawler:
    """Discover the pages located underneath a root URL by following links."""

    # Seconds to wait for each response, so a stalled server cannot hang the
    # crawl (requests applies no timeout unless one is given).
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Store the root *url* of the site to crawl."""
        self.url = url

    # Fetches all sub urls from a given url
    def get_sub_urls(self, url):
        """Return the URLs under *url* reachable through ``<a href>`` links.

        Args:
            url: Root URL. Only links whose absolute form starts with this
                prefix are followed; the root itself is not in the result.

        Returns:
            The discovered URLs in discovery order, without duplicates and
            without ``#fragment`` suffixes.

        Raises:
            requests.RequestException: if a request fails, times out, or a
                page answers with an error status (``raise_for_status``).
        """
        # Keep the root under its own name: the scope check must compare
        # against the crawl root, not against whichever page is being visited.
        root = url
        sub_urls = []
        seen = {root}  # O(1) duplicate check; also keeps the root out of the result
        stack = [root]
        while stack:
            page_url = stack.pop()
            response = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()  # Raise an exception for bad responses
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                # Resolve relative links against the page they appear on and
                # drop the fragment: "page#a" and "page#b" are one document.
                absolute_url = urljoin(page_url, link['href']).partition('#')[0]

                if absolute_url in seen or not absolute_url.startswith(root):
                    continue
                seen.add(absolute_url)
                sub_urls.append(absolute_url)
                stack.append(absolute_url)

        return sub_urls


def index_mqtt_website():
    """Crawl the site at the module-level ``url``, index it and store the index.

    Every sub-URL found by ``Crawler`` is fetched and its words are merged
    into one index; stop words are removed once at the end and the result is
    handed to ``dbService.insert_to_db_index`` together with the page count.

    Returns:
        A human-readable summary containing the number of index entries,
        the number of pages indexed and the value returned by the database
        call.
    """
    print("Starting the indexing process...")
    crawler = Crawler(url)
    sub_urls = crawler.get_sub_urls(url)
    print(f"📄 Found {len(sub_urls)} pages to index")
    index = {}
    page_count = 0
    for sub_url in sub_urls:
        print(f"Indexing: {sub_url}")
        page_service = QueryService(sub_url)
        soup = page_service.fetch_page()
        if soup:
            index = page_service.index_words(soup, index, sub_url)
            page_count += 1

    # Remove stop words after building the whole index.
    # Use a dedicated instance rather than the loop variable: when the crawl
    # finds no pages the loop never runs and that name would be unbound.
    # An empty crawl now follows the same path as a crawl whose fetches all
    # failed: an empty index with page_count 0 is passed to the database.
    index = QueryService(url).remove_stop_words(index)

    # Named completed_at instead of `time` to avoid shadowing the stdlib module.
    completed_at = dbService.insert_to_db_index(index, page_count)
    print("Index saved to database.")
    success_message = f"✅ Re-indexing complete!\n📊 Indexed {len(index)} unique words from {page_count} pages\n🕒 Completed at: {completed_at}"
    return success_message




