In [None]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Step 1: URL to extract HTML from
url = "https://en.wikipedia.org/wiki/Mohamed_Salah"

# Step 2: Extract HTML from URL
# A timeout keeps the cell from hanging forever on a stalled connection
# (requests applies no timeout by default).
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx so an error page is never parsed as the article.
response.raise_for_status()
html = response.text

In [None]:
# Step 3: Pull the readable text (paragraphs and headings) out of the HTML
soup = BeautifulSoup(html, "html.parser")

# Body text: every <p> element, reduced to its plain text
paragraphs = soup.find_all("p")
paragraph_text = list(map(lambda node: node.get_text(), paragraphs))

# Section titles: every <h1> .. <h6> element, reduced to its plain text
headings = soup.find_all([f"h{level}" for level in range(1, 7)])
heading_text = list(map(lambda node: node.get_text(), headings))

# All paragraph strings first, followed by all heading strings
combined_text = [*paragraph_text, *heading_text]

# Show each extracted string on its own line
for text in combined_text:
    print(text)

In [None]:
# Download the NLTK data used by the preprocessing cell:
#   punkt / punkt_tab - tokenizer models for word_tokenize (recent NLTK
#                       releases look up 'punkt_tab'; older ones use 'punkt')
#   wordnet / omw-1.4 - data behind WordNetLemmatizer (some NLTK releases
#                       also require 'omw-1.4' when loading WordNet)
#   stopwords         - the English stop-word list
# nltk.download reports a failure for an identifier the installed index
# does not know instead of raising, so the extra names are harmless on
# versions that do not need them.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

In [None]:
# Step 1: Cleaning data
# Delete every character that is not an ASCII letter, a digit or whitespace.
cleaned_text = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in combined_text]

# Step 2: Normalization
# Lowercase so tokens match the lowercase NLTK stop-word list.
normalized_text = [text.lower() for text in cleaned_text]

# Step 3: Tokenization
# One list of tokens per input string.
tokenized_text = [word_tokenize(text) for text in normalized_text]

# Step 4: Lemmatization
# lemmatize() is called without a POS tag, so every token is treated as a noun.
lemmatizer = WordNetLemmatizer()
lemmatized_text = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in tokenized_text]

# Step 5: Remove stop words
# The stop-word test must look at the ORIGINAL token as well as the lemma:
# noun lemmatization strips the trailing "s" from words such as "was", "has"
# and "as" (giving "wa", "ha", "a"), and those mangled forms are no longer
# recognised as stop words if only the lemma is checked.
stop_words = set(stopwords.words('english'))
filtered_text = [
    [
        lemma
        for token, lemma in zip(tokens, lemmas)
        if token not in stop_words and lemma not in stop_words
    ]
    for tokens, lemmas in zip(tokenized_text, lemmatized_text)
]

# Print the processed text
for text in filtered_text:
    print(text)

In [None]:
# Concatenate the per-text token lists into one flat list of words
all_words = []
for token_list in filtered_text:
    all_words.extend(token_list)

# Collapse duplicates to get the vocabulary
unique_words = set(all_words)

# Show the vocabulary, one word per line, under a header
print("Unique Words:")
for word in unique_words:
    print(word)

In [None]:
# Vocabulary entries shorter than three characters
less_three = list(filter(lambda candidate: len(candidate) < 3, unique_words))
print(less_three)
