<a href="https://colab.research.google.com/github/Cloud-Course-Group-Phoenix/Project-Pheonix/blob/main/Logic/Indexmqtt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pip installs
!pip install firebase

In [None]:
from firebase import firebase
from bs4 import BeautifulSoup
import nltk
import requests
import re
from urllib.parse import urljoin, urlparse
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from datetime import datetime

# Download required NLTK data
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Base URL of the Firebase Realtime Database; handed to DbService below.
DBLink = "https://couldproject-a621d-default-rtdb.europe-west1.firebasedatabase.app/"
# Root URL of the site to crawl; read as a global by index_mqtt_website().
url = "https://mqtt.org/"

class DbService:
    """Reads and writes the word index in a Firebase Realtime Database."""

    def __init__(self, DbLink):
        # Only the database URL is kept; a client object is created per call.
        self.db_link = DbLink

    def _connect(self):
        """Build a FirebaseApplication client for ``self.db_link``."""
        return firebase.FirebaseApplication(self.db_link, None)

    def insert_to_db(self, results, page_count):
        """Write the index to ``terms`` and refresh the ``indexStats`` record.

        Args:
            results: index written as-is under ``terms``; its length is
                recorded as ``word_count``.
            page_count: value recorded as ``page_count`` in the stats.

        Returns:
            The ``last_indexed`` timestamp string (``%Y-%m-%d %H:%M:%S``).
        """
        connection = self._connect()
        connection.put('/', 'terms', results)

        # Start from the stored stats record (if any) so other keys survive.
        index_stats = connection.get('/', 'indexStats') or {}
        index_stats["word_count"] = len(results)
        index_stats["page_count"] = page_count
        finished_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        index_stats["last_indexed"] = finished_at
        connection.put('/', 'indexStats', index_stats)

        return index_stats["last_indexed"]

    def get_from_db(self):  # change into a more general statement
        """Return whatever is currently stored under ``terms``."""
        return self._connect().get('/', 'terms')

# Shared DbService instance; index_mqtt_website() saves the finished index through it.
dbService = DbService(DBLink)

In [None]:
class QueryService:
    """Fetches one page and maintains a stemmed word index.

    Index layout::

        {stem: {word: {"Appearances": int, "DocIDs": [url, ...]}}}
    """

    def __init__(self, url, timeout=10):
        """
        Args:
            url: page that ``fetch_page`` downloads.
            timeout: seconds passed to ``requests.get`` so that a stalled
                server cannot block the notebook indefinitely.
        """
        self.url = url
        self.timeout = timeout
        self.stemmer = PorterStemmer()

    def fetch_page(self):
        """Download ``self.url`` and parse it as HTML.

        Returns:
            A ``BeautifulSoup`` tree when the response status is 200,
            otherwise ``None``.

        Raises:
            requests.exceptions.RequestException: propagated unchanged from
                ``requests.get`` (connection failures, timeouts).
        """
        response = requests.get(self.url, timeout=self.timeout)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def index_words(self, soup, index=None, url=''):
        """Add every word of ``soup``'s text to ``index``.

        Words are the ``\\w+`` matches of ``soup.get_text()``, lower-cased and
        grouped under their stem.

        Args:
            soup: object exposing ``get_text()``.
            index: index to update in place. A fresh dict is created when
                omitted (a shared ``{}`` default would leak entries between
                unrelated calls).
            url: document id recorded in ``DocIDs``; an empty string records
                nothing.

        Returns:
            The updated index (the same object that was passed in, if any).
        """
        if index is None:
            index = {}

        for raw_word in re.findall(r'\w+', soup.get_text()):
            word = raw_word.lower()
            stem = self.stemmer.stem(word)

            # All surface forms sharing a stem live under one top-level key.
            word_forms = index.setdefault(stem, {})
            entry = word_forms.get(word)
            if entry is None:
                word_forms[word] = {
                    "Appearances": 1,
                    "DocIDs": [url] if url else []
                }
                continue

            entry["Appearances"] += 1
            if url and url not in entry["DocIDs"]:
                entry["DocIDs"].append(url)

        return index

    def remove_stop_words(self, index):
        """Delete every stem that equals a stemmed English stop word.

        Uses NLTK's ``stopwords`` corpus. The index is modified in place and
        also returned.
        """
        # Several stop words can share a stem, so collect the stems in a set.
        stemmed_stop_words = {
            self.stemmer.stem(stop_word)
            for stop_word in stopwords.words('english')
        }
        for stem in stemmed_stop_words:
            index.pop(stem, None)

        return index

class Crawler:
    """Discovers pages below a starting URL by following ``<a href>`` links."""

    def __init__(self, url, timeout=10):
        """
        Args:
            url: starting URL (stored; ``get_sub_urls`` takes its own argument).
            timeout: seconds passed to ``requests.get`` so that a stalled
                server cannot block the crawl indefinitely.
        """
        self.url = url
        self.timeout = timeout

    # Fetches all sub urls from a given url
    def get_sub_urls(self, url):
        """Return the sub-URLs discovered by crawling from ``url``.

        Each fetched page is scanned for links; a link is kept when, after
        being made absolute and stripped of its ``#fragment``, it starts with
        the URL of the page it was found on, differs from that page and has
        not been recorded before. Kept links are crawled in turn.

        Fragments are dropped because ``page#a`` and ``page#b`` are the same
        document; keeping them made callers fetch and index one page
        several times.

        Args:
            url: URL to start from. It is not included in the result.

        Returns:
            List of absolute URLs in discovery order, without duplicates.

        Raises:
            requests.exceptions.RequestException: for network failures,
                timeouts, or (via ``raise_for_status``) HTTP error statuses.
        """
        sub_urls = []
        seen = set()  # O(1) membership test; sub_urls keeps discovery order
        stack = [url]
        while stack:
            page_url = stack.pop()
            response = requests.get(page_url, timeout=self.timeout)
            response.raise_for_status()  # Raise an exception for bad responses
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(page_url, link['href'])  # Make URL absolute
                # Same document regardless of fragment -> normalise it away.
                absolute_url = urlparse(absolute_url)._replace(fragment='').geturl()

                # NOTE(review): the prefix test is against the page being
                # scanned, not the crawl root, as in the original code --
                # confirm this narrowing is intended.
                if not absolute_url.startswith(page_url):
                    continue
                if absolute_url == page_url or absolute_url in seen:
                    continue

                seen.add(absolute_url)
                sub_urls.append(absolute_url)
                stack.append(absolute_url)

        return sub_urls


def index_mqtt_website():
    """Crawl the site at the module-level ``url``, index it and store the index.

    Steps: collect sub-URLs with ``Crawler``, index every page that
    ``QueryService.fetch_page`` returns, drop stop words, then save the
    result through the module-level ``dbService``.

    Returns:
        A human-readable summary with the number of indexed stems, the number
        of indexed pages and the completion timestamp.
    """
    print("Starting the indexing process...")
    crawler = Crawler(url)
    sub_urls = crawler.get_sub_urls(url)
    print(f"📄 Found {len(sub_urls)} pages to index")

    index = {}
    page_count = 0
    for sub_url in sub_urls:
        print(f"Indexing: {sub_url}")
        page_service = QueryService(sub_url)
        soup = page_service.fetch_page()
        if soup:
            index = page_service.index_words(soup, index, sub_url)
            page_count += 1

    # Remove stop words after building the whole index. A dedicated service
    # is used instead of the loop's last one, which is unbound (and raised
    # UnboundLocalError) when the crawl found no sub-URLs.
    index = QueryService(url).remove_stop_words(index)

    completed_at = dbService.insert_to_db(index, page_count)
    print("Index saved to database.")
    success_message = f"✅ Re-indexing complete!\n📊 Indexed {len(index)} unique words from {page_count} pages\n🕒 Completed at: {completed_at}"
    return success_message







Starting the indexing process...
Indexing: https://mqtt.org/getting-started/
Indexing: https://mqtt.org/mqtt-specification/
Indexing: https://mqtt.org/software/
Indexing: https://mqtt.org/use-cases/
Indexing: https://mqtt.org/faq/
Indexing: https://mqtt.org/use-cases#automotive
Indexing: https://mqtt.org/use-cases#logistics
Indexing: https://mqtt.org/use-cases#manufacturing
Indexing: https://mqtt.org/use-cases#smarthome
Indexing: https://mqtt.org/use-cases#consumer-products
Indexing: https://mqtt.org/use-cases#transportation
Indexing: https://mqtt.org/legal
Indexing: https://mqtt.org/software/#shell-script
Indexing complete. Saving 1156 stemmed words to database.
Index saved to database.
