In [1]:
# Semantic Web Retrieval Project

## 1. Importing Libraries
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

## 2. Downloading NLTK Resources
# Fetch the NLTK data packages that the tokenizer, stop-word list and
# WordNet lookups further down rely on (same order as listed here).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# The three downloads above always run. Uncomment a line below only to fetch the
# additional 'omw-1.4' resource or to re-run one of the downloads manually.
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

## 3. Fetching Website Content
# Define the URL of the page to analyse
url = "https://mec.edu.om/en/"

# Fetch the HTML content.
# - timeout: requests waits indefinitely by default, so an unresponsive server
#   would hang the script; give up after 30 seconds instead.
# - raise_for_status(): raise requests.HTTPError on a 4xx/5xx response rather
#   than silently parsing an error page as if it were the real content.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Print the raw HTML content (optional)
# print(html_content)

## 4. Parsing the HTML Content
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(markup=html_content, features="lxml")

# Dump the parsed document as an indented tree for inspection
print(soup.prettify())

## 5. Extracting Text and Preprocessing

# Pull the text out of the parsed document as a single string
text = soup.get_text()

# Preprocess the text
def preprocess_text(text):
    """Turn raw text into a list of cleaned, lemmatized word tokens.

    Steps, in order:
      1. Tokenize ``text`` with NLTK's ``word_tokenize``.
      2. Keep only purely alphabetic tokens and lowercase them.
      3. Drop tokens found in NLTK's English stop-word list.
      4. Lemmatize every remaining token with ``WordNetLemmatizer``
         (called with its default arguments).

    Args:
        text: The string to process.

    Returns:
        A list of lowercase, lemmatized tokens in their original order.
        Duplicates are kept.
    """
    # Steps 1-2: split into tokens, discard anything containing digits or
    # punctuation, and normalise case.
    raw_tokens = word_tokenize(text)
    lowered = [word.lower() for word in raw_tokens if word.isalpha()]

    # Step 3: a set gives constant-time membership checks while filtering.
    english_stops = set(stopwords.words('english'))
    content_words = [word for word in lowered if word not in english_stops]

    # Step 4: reduce each surviving word to its lemma.
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for word in content_words:
        lemmas.append(lemmatize(word))

    return lemmas

# Run the extracted page text through the preprocessing pipeline
tokens = preprocess_text(text)

# Show the resulting token list
print(tokens)

## 6. Applying Semantic Analysis

# Look up the WordNet synsets of every token and print them next to the token.
# map() is lazy, so each lookup still happens right before its own print.
for token, synsets in zip(tokens, map(wordnet.synsets, tokens)):
    print(f"{token}: {synsets}")
    
    

[nltk_data] Downloading package punkt to /Users/skylor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/skylor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/skylor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<!DOCTYPE html>
<html lang="en-US">
 <head>
  <script data-pagespeed-no-defer="" data-two-no-delay="" type="text/javascript">
   var two_worker_data_critical_data = {"critical_data":{"critical_css":false,"critical_fonts":false}}
  </script>
  <script data-pagespeed-no-defer="" data-two-no-delay="" type="text/javascript">
   var two_worker_data_font = {"font":[]}
  </script>
  <script data-pagespeed-no-defer="" data-two-no-delay="" type="text/javascript">
   var two_worker_data_excluded_js = {"js":[]}
  </script>
  <script data-pagespeed-no-defer="" data-two-no-delay="" type="text/javascript">
   var two_worker_data_js = {"js":[{"inline":true,"code":"JTBBJTA5dmFyJTIwZ3RtNHdwX2RhdGFsYXllcl9uYW1lJTIwJTNEJTIwJTIyZGF0YUxheWVyJTIyJTNCJTBBJTA5dmFyJTIwZGF0YUxheWVyJTIwJTNEJTIwZGF0YUxheWVyJTIwJTdDJTdDJTIwJTVCJTVEJTNCJTBB","id":"","uid":"two_670e4c94ad5b0","exclude_blob":false,"excluded_from_delay":false},{"inline":true,"code":"JTBBJTA5dmFyJTIwZGF0YUxheWVyX2NvbnRlbnQlMjAlM0QlMjAlN0IlMjJ2aXNpdG9yT