<a href="https://colab.research.google.com/github/Cloud-Course-Group-Phoenix/Project-Pheonix/blob/main/Indexmqtt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# pip installs
!pip install firebase

#================================= make sure all pip installs are above this line ============================================

# import to clear the installation code output
from IPython.display import clear_output
clear_output()

In [None]:
from firebase import firebase
from bs4 import BeautifulSoup
import nltk
import requests
import re
from urllib.parse import urljoin, urlparse
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from datetime import datetime



# Download required NLTK data
try:
    nltk.download('stopwords', quiet=True)
except:
    pass

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
DBLink = "https://couldproject-a621d-default-rtdb.europe-west1.firebasedatabase.app/"
url = "https://mqtt.org/"

class DbService:
    def __init__(self, DbLink):
        self.db_link = DbLink

    def insert_to_db(self, results, page_count):
        try:
            print(f"🔄 Connecting to database...")
            FBconn = firebase.FirebaseApplication(self.db_link, None)
            
            print(f"📝 Saving {len(results)} terms to database...")
            FBconn.put('/','terms', results)
            
            print(f"📊 Updating indexing statistics...")
            stats = FBconn.get('/', 'indexStats') or {}
            stats["word_count"] = len(results)
            stats["page_count"] = page_count
            stats["last_indexed"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            FBconn.put('/','indexStats', stats)
            
            print(f"✅ Database operations completed successfully!")
            return stats["last_indexed"]
        except Exception as e:
            print(f"❌ Error saving to database: {str(e)}")
            print(f"🔍 Debug info - Results count: {len(results) if results else 0}, Page count: {page_count}")
            return None

    def get_stats(self):
        try:
            FBconn = firebase.FirebaseApplication(self.db_link, None)
            stats = FBconn.get('/', 'indexStats')
            return stats
        except Exception as e:
            print(f"❌ Error retrieving stats from database: {str(e)}")
            return None

    def get_from_db(self): # change into a more general statement
        try:
            FBconn = firebase.FirebaseApplication(self.db_link,None)
            results = FBconn.get('/','terms')
            return results
        except Exception as e:
            print(f"❌ Error retrieving data from database: {str(e)}")
            return None

dbService = DbService(DBLink)

In [None]:
class QueryService:
    def __init__(self,url):
        self.url = url
        self.stemmer = PorterStemmer()

    def fetch_page(self):
        try:
            response = requests.get(self.url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                return soup
            else:
                print(f"⚠️ Warning: HTTP {response.status_code} for {self.url}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"❌ Error fetching {self.url}: {str(e)}")
            return None
        except Exception as e:
            print(f"❌ Unexpected error fetching {self.url}: {str(e)}")
            return None

    def index_words(self, soup, index = {}, url = ''):
        try:
            words = re.findall(r'\w+', soup.get_text())
            for word in words:
                word = word.lower()
                # Apply stemming
                stemmed_word = self.stemmer.stem(word)

                # Check if stemmed word already exists in the index
                if stemmed_word in index:
                    # Check if the original word is already in the index under this stem
                    if word in index[stemmed_word]:
                        # Increment the appearances count for this specific word
                        index[stemmed_word][word]["Appearances"] += 1
                        # Add URL to DocIDs if it's not already there
                        if url and url not in index[stemmed_word][word]["DocIDs"]:
                            index[stemmed_word][word]["DocIDs"].append(url)
                    else:
                        # Add this original word form to the stemmed word entry
                        index[stemmed_word][word] = {
                            "Appearances": 1,
                            "DocIDs": [url] if url else []
                        }
                else:
                    # Initialize a new entry for this stemmed word
                    index[stemmed_word] = {
                        word: {
                            "Appearances": 1,
                            "DocIDs": [url] if url else []
                        }
                    }
            return index
        except Exception as e:
            print(f"❌ Error indexing words from {url}: {str(e)}")
            return index

    def remove_stop_words(self, index):
        try:
            stop_words = set(stopwords.words('english'))
            # Create a list of stemmed stop words
            stemmed_stop_words = [self.stemmer.stem(stop_word) for stop_word in stop_words]

            # Remove all stemmed stop words from the index
            for stemmed_stop_word in stemmed_stop_words:
                if stemmed_stop_word in index:
                    del index[stemmed_stop_word]

            return index
        except Exception as e:
            print(f"❌ Error removing stop words: {str(e)}")
            return index

class Crawler:
    def __init__(self, url):
        self.url = url

    #Fetches all sub urls from a given url
    def get_sub_urls(self, url):
        sub_urls = []
        stack = [url]
        processed_urls = set()  # Track processed URLs to avoid infinite loops
        
        while stack:
            current_url = stack.pop()
            
            # Skip if already processed
            if current_url in processed_urls:
                continue
                
            processed_urls.add(current_url)
            
            try:
                response = requests.get(current_url, timeout=10)
                response.raise_for_status()  # Raise an exception for bad responses
                soup = BeautifulSoup(response.content, 'html.parser')
                
                for link in soup.find_all('a', href=True):
                    try:
                        href = link['href']
                        absolute_url = urljoin(current_url, href)  # Make URL absolute

                        if (absolute_url.startswith(url)) and (absolute_url != url) and (absolute_url not in sub_urls) and (absolute_url not in processed_urls):
                            sub_urls.append(absolute_url)
                            # Limit stack size to prevent excessive crawling
                            if len(stack) < 100:
                                stack.append(absolute_url)
                    except Exception as e:
                        print(f"⚠️ Warning: Error processing link {href}: {str(e)}")
                        continue
                        
            except requests.exceptions.RequestException as e:
                print(f"❌ Error crawling {current_url}: {str(e)}")
                continue
            except Exception as e:
                print(f"❌ Unexpected error crawling {current_url}: {str(e)}")
                continue

        return sub_urls


def index_mqtt_website():
    try:
        print("🚀 Starting the indexing process...")
        crawler = Crawler(url)
        
        print("🔍 Crawling website for pages...")
        sub_urls = crawler.get_sub_urls(url)
        print(f"📄 Found {len(sub_urls)} pages to index")
        
        if not sub_urls:
            print("⚠️ Warning: No pages found to index")
            return "❌ No pages found to index"
        
        index = {}
        page_count = 0
        successful_pages = 0
        
        for sub_url in sub_urls:
            try:
                print(f"📖 Indexing: {sub_url}")
                queryService = QueryService(sub_url)
                soup = queryService.fetch_page()
                if soup:
                    index = queryService.index_words(soup, index, sub_url)
                    page_count += 1
                    successful_pages += 1
                else:
                    print(f"⚠️ Skipping {sub_url} - failed to fetch")
            except Exception as e:
                print(f"❌ Error processing {sub_url}: {str(e)}")
                continue

        if not index:
            print("❌ No content was indexed")
            return "❌ Failed to index any content"

        print(f"🧹 Removing stop words from {len(index)} terms...")
        # Remove stop words after building the whole index
        queryService = QueryService(url)  # Create instance for stop word removal
        index = queryService.remove_stop_words(index)

        print(f"💾 Saving index to database...")
        time = dbService.insert_to_db(index, page_count)
        
        if time:
            print("✅ Index saved to database successfully.")
            success_message = f"✅ Re-indexing complete!\n📊 Indexed {len(index)} unique words from {successful_pages}/{len(sub_urls)} pages\n🕒 Completed at: {time}"
        else:
            success_message = f"⚠️ Indexing completed with database errors\n📊 Processed {len(index)} unique words from {successful_pages}/{len(sub_urls)} pages"
        
        return success_message
        
    except Exception as e:
        error_message = f"❌ Critical error during indexing: {str(e)}"
        print(error_message)
        return error_message







Starting the indexing process...
Indexing: https://mqtt.org/getting-started/
Indexing: https://mqtt.org/mqtt-specification/
Indexing: https://mqtt.org/software/
Indexing: https://mqtt.org/use-cases/
Indexing: https://mqtt.org/faq/
Indexing: https://mqtt.org/use-cases#automotive
Indexing: https://mqtt.org/use-cases#logistics
Indexing: https://mqtt.org/use-cases#manufacturing
Indexing: https://mqtt.org/use-cases#smarthome
Indexing: https://mqtt.org/use-cases#consumer-products
Indexing: https://mqtt.org/use-cases#transportation
Indexing: https://mqtt.org/legal
Indexing: https://mqtt.org/software/#shell-script
Indexing complete. Saving 1156 stemmed words to database.
Index saved to database.
