In [18]:
# Install the required packages for our script
!pip install requests beautifulsoup4 nltk python-docx

# Import the necessary libraries for web scraping and text processing
import requests
from bs4 import BeautifulSoup
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from docx import Document

# Function to fetch the webpage content
def fetch_page(url, timeout=10):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``BeautifulSoup`` object for the page, or ``None`` when the request
        fails (network error, timeout, or a status code other than 200).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported the same way as a bad
        # status code, so callers only have to check for None.
        return None

    if response.status_code != 200:
        return None

    # The request succeeded: parse the page content.
    return BeautifulSoup(response.text, 'html.parser')

# Function to index the words in the webpage content
def index_words(soup):
    """Build a word-frequency dictionary from the text of a parsed page.

    The page text is obtained with ``soup.get_text()``, split into ``\\w+``
    tokens, and each token is lower-cased before being counted.

    Returns:
        A dict mapping each lower-cased token to its number of occurrences,
        in order of first appearance.
    """
    counts = {}
    for token in re.findall(r'\w+', soup.get_text()):
        key = token.lower()
        # Start from zero the first time a token is seen.
        counts[key] = counts.get(key, 0) + 1
    return counts

# Function to remove common stop words from the index
def remove_stop_words(index):
    """Delete common stop words from *index* and return it.

    The dictionary is modified in place; the returned object is the same
    dictionary that was passed in. Keys are compared exactly, so only
    lower-case keys can match the stop-word list.
    """
    # Words considered too common to be worth indexing.
    stop_words = {
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
        'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', "can't", 'cannot',
        'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few',
        'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll",
        "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll",
        "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', 'let', "let's", 'me', 'more',
        'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought',
        'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
        "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves',
        'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through',
        'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were',
        "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom',
        'why', "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours',
        'yourself', 'yourselves', 'n', 'also', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
        'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'zero'
    }
    # The intersection is a fresh set, so deleting from the dictionary while
    # looping over it is safe.
    for term in stop_words.intersection(index):
        del index[term]
    return index

# Function to apply stemming to the words in the index
def apply_stemming(index):
    """Merge the counts of words that share the same Porter stem.

    Args:
        index: Dict mapping words to their frequency.

    Returns:
        A new dict mapping each stem to the summed frequency of every word
        that reduces to it. The input dict is not modified.
    """
    # PorterStemmer is not among this file's top-level imports, so it is
    # imported here to keep the function from failing with a NameError.
    from nltk.stem import PorterStemmer

    stemmer = PorterStemmer()
    stemmed_index = {}
    for word, count in index.items():
        stem = stemmer.stem(word)
        # Several words can collapse to one stem; accumulate their counts.
        stemmed_index[stem] = stemmed_index.get(stem, 0) + count
    return stemmed_index

# Function to sort the index by frequency
def sort_index_by_frequency(index, frequency_threshold):
    """Return the index entries at or above a threshold, most frequent first.

    Args:
        index: Dict mapping words to their frequency.
        frequency_threshold: Minimum frequency an entry needs to be kept.

    Returns:
        A list of ``(word, frequency)`` tuples sorted by descending frequency.
        Entries with equal frequency keep the order they had in *index*.
    """
    kept = []
    for term, count in index.items():
        if count >= frequency_threshold:
            kept.append((term, count))
    # list.sort is stable, including with reverse=True.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return kept

# Function to create a Word document with the index table
def create_word_table(sorted_index, file_name):
    """Write the index to a Word document as a two-column table.

    Args:
        sorted_index: Iterable of ``(word, frequency)`` pairs, written in the
            order given.
        file_name: Path the document is saved to.
    """
    document = Document()
    document.add_heading('Index of Significant Words', level=1)

    # Start with a single header row; data rows are appended below.
    grid = document.add_table(rows=1, cols=2)
    header = grid.rows[0].cells
    header[0].text = 'Term'
    header[1].text = 'Frequency'

    for word, frequency in sorted_index:
        cells = grid.add_row().cells
        cells[0].text = word
        # Frequencies are converted to text before being stored in a cell.
        cells[1].text = str(frequency)

    # Announce the save, then write the file.
    print("Saving")
    document.save(file_name)

# Main function to create the index for a webpage
def create_index(url, frequency_threshold=50):
    """Build a frequency-sorted index of stemmed words for a web page.

    The page is fetched, its words are counted, stop words are removed, the
    remaining words are stemmed, and the result is filtered and sorted.

    Args:
        url: Address of the page to index.
        frequency_threshold: Minimum frequency a stem needs to appear in the
            result. Defaults to 50.

    Returns:
        A list of ``(stem, frequency)`` tuples sorted by descending
        frequency, or ``None`` if the page could not be fetched.
    """
    soup = fetch_page(url)
    if soup is None:
        # Nothing to index when the page couldn't be fetched.
        return None

    index = index_words(soup)
    index = remove_stop_words(index)
    index = apply_stemming(index)
    return sort_index_by_frequency(index, frequency_threshold=frequency_threshold)

# URL of the OnShape glossary page
url = 'https://cad.onshape.com/help/Content/Glossary/glossary.htm?tocpath=_____19'

# Create the index for the given URL
index = create_index(url)

if index is None:
    # create_index returns None when the page could not be fetched; writing
    # the table or calling len() on None would raise a TypeError.
    print("Failed to fetch page:", url)
else:
    create_word_table(index, 'index_table.docx')

    # Print the resulting index
    print(index)
    print(len(index))


Saving
[('context', 676), ('type', 656), ('see', 638), ('keyboard', 631), ('shortcut', 631), ('plan', 626), ('part', 524), ('studio', 369), ('assembl', 279), ('draw', 226), ('sketch', 220), ('select', 185), ('creat', 174), ('enterpris', 166), ('document', 164), ('render', 159), ('view', 151), ('simul', 149), ('profession', 145), ('use', 139), ('tool', 130), ('featur', 118), ('mate', 109), ('onshap', 98), ('menu', 98), ('face', 94), ('option', 86), ('edg', 86), ('panel', 77), ('appear', 76), ('surfac', 75), ('point', 74), ('sheet', 70), ('allow', 67), ('plane', 67), ('model', 63), ('open', 61), ('list', 58), ('enabl', 55), ('curv', 55), ('display', 53), ('properti', 52), ('tabl', 52)]
43
