<a href="https://colab.research.google.com/github/DvirHayat/IntroToCloud/blob/main/HW2/index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from nltk.stem import PorterStemmer

# Fetch the URL and parse the response as HTML
def fetch_page(url, timeout=10):
    """Download *url* and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on a stalled server.

    Returns:
        A ``BeautifulSoup`` object built with the ``'html.parser'`` backend
        when the server answers with status 200; ``None`` when the request
        fails (connection error, timeout, invalid URL) or the status code is
        anything other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network-level failures the same way as a bad status code so
        # callers only need to check for None.
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')

# Get the text from the soup and count how many times each word appears
def index_words(soup):
    """Build a word-frequency table from the text of a parsed page.

    The text returned by ``soup.get_text()`` is split into runs of word
    characters (``\\w+``), each run is lower-cased, and its occurrences are
    tallied.

    Args:
        soup: Object exposing ``get_text()`` that returns the page text.

    Returns:
        A dict mapping each lower-cased word to its number of occurrences.
    """
    counts = {}
    for token in re.findall(r'\w+', soup.get_text()):
        key = token.lower()
        # Missing keys start from zero, so first sightings end up at 1.
        counts[key] = counts.get(key, 0) + 1
    return counts

# Removes stop words from all words
def remove_stop_words(index):
    """Drop stop-word entries from a word-frequency dict.

    The dict is modified in place; the same object is returned for
    convenience.

    Args:
        index: Dict mapping words to counts.

    Returns:
        The same ``index`` dict, with the stop-word keys removed.
    """
    # Earlier, shorter list: {'a', 'an', 'the', 'and', 'or', 'in', 'on', 'at'}
    # Extended list of stop words to exclude from counting.
    stop_words = {'a', 'an', 'the', 'and', 'or', 'in', 'on', 'at', 'to', 'see', 'also', 'all', 'n', 'of'}
    for term in stop_words:
        # pop with a default is a no-op when the word is absent.
        index.pop(term, None)
    return index

# Stem the words
def apply_stemming(index):
    """Collapse a word-frequency dict onto Porter stems.

    Each word is reduced with ``PorterStemmer.stem``; words that share a stem
    have their counts added together.

    Args:
        index: Dict mapping words to counts.

    Returns:
        A new dict mapping each stem to the summed count of the words that
        reduce to it. The input dict is not modified.
    """
    porter = PorterStemmer()
    merged = {}
    for term, freq in index.items():
        root = porter.stem(term)
        # Accumulate onto any count already recorded for this stem.
        merged[root] = merged.get(root, 0) + freq
    return merged

# URL of the glossary page
url = "https://cad.onshape.com/help/Content/Glossary/glossary.htm"

# Download the page; fetch_page yields None when the fetch does not succeed.
soup = fetch_page(url)
if not soup:
    print("Failed to fetch the webpage content.")
else:
    # Word -> occurrence count for the page text.
    index = index_words(soup)

    # Strip the stop words out of the counts.
    cleaned_index = remove_stop_words(index)

    # Merge counts of words that share a stem.
    stemmed_index = apply_stemming(cleaned_index)

    # Rank stems by descending count and keep the first ten.
    top_10_words = sorted(
        stemmed_index.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )[:10]
    print(top_10_words)


[('context', 676), ('type', 656), ('keyboard', 631), ('shortcut', 631), ('plan', 626), ('part', 524), ('studio', 369), ('assembl', 279), ('draw', 226), ('sketch', 220)]
