In [None]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Step 1: URL to extract HTML from
url = "https://en.wikipedia.org/wiki/Mohamed_Salah"

# Step 2: Extract HTML from URL
response = requests.get(url)
html = response.text

In [None]:
# Step 3: Extract text from HTML page (paragraphs and headings)
soup = BeautifulSoup(html, "html.parser")

# Extract paragraphs (p tags)
paragraphs = soup.find_all("p")
paragraph_text = [p.get_text() for p in paragraphs]

# Extract headings (h1, h2, h3, etc. tags)
headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
heading_text = [heading.get_text() for heading in headings]

# Combine paragraph and heading text
combined_text = paragraph_text + heading_text

# Print the combined text
for text in combined_text:
    print(text)



Mohamed Salah Hamed Mahrous Ghaly (Arabic: محمد صلاح حامد محروس غالي, Egyptian Arabic pronunciation: [mæˈħam.mæd sˤɑˈlɑːħ ˈɣæːli];[5] born 15 June 1992), known as Mohamed Salah or Mo Salah, is an Egyptian professional footballer who plays as a right winger or forward for Premier League club Liverpool and captains the Egypt national team. Regarded as one of the best players of his generation and among the greatest African players of all time, he is known for his clinical finishing, dribbling and speed.[6][7][8]

Salah started his senior career in 2010 playing for Al Mokawloon Al Arab, departing in 2012 to join Basel, where he won two Swiss Super League titles. In 2014, Salah joined Chelsea for a reported fee of £11 million, but limited gametime led to successive loans to Fiorentina and Roma, who later signed him permanently for €15 million. In the 2016–17 season, Salah was a key figure in Roma's unsuccessful title bid, reaching double figures in both goals and assists. In 2017, Salah 

In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Step 1: Cleaning data
cleaned_text = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in combined_text]

# Step 2: Normalization
normalized_text = [text.lower() for text in cleaned_text]

# Step 3: Tokenization
tokenized_text = [word_tokenize(text) for text in normalized_text]

# Step 4: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_text = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in tokenized_text]

# Step 5: Remove stop words
stop_words = set(stopwords.words('english'))
filtered_text = [[word for word in tokens if word not in stop_words] for tokens in lemmatized_text]

# Print the processed text
for text in filtered_text:
    print(text)

[]
['mohamed', 'salah', 'hamed', 'mahrous', 'ghaly', 'arabic', 'egyptian', 'arabic', 'pronunciation', 'mammd', 'sl', 'li5', 'born', '15', 'june', '1992', 'known', 'mohamed', 'salah', 'mo', 'salah', 'egyptian', 'professional', 'footballer', 'play', 'right', 'winger', 'forward', 'premier', 'league', 'club', 'liverpool', 'captain', 'egypt', 'national', 'team', 'regarded', 'one', 'best', 'player', 'generation', 'among', 'greatest', 'african', 'player', 'time', 'known', 'clinical', 'finishing', 'dribbling', 'speed678']
['salah', 'started', 'senior', 'career', '2010', 'playing', 'al', 'mokawloon', 'al', 'arab', 'departing', '2012', 'join', 'basel', 'two', 'swiss', 'super', 'league', 'title', '2014', 'salah', 'joined', 'chelsea', 'reported', 'fee', '11', 'million', 'limited', 'gametime', 'led', 'successive', 'loan', 'fiorentina', 'rom', 'later', 'signed', 'permanently', '15', 'million', '201617', 'season', 'salah', 'wa', 'key', 'figure', 'roma', 'unsuccessful', 'title', 'bid', 'reaching', 'do

In [None]:
# Flatten the list of lists into a single list of words
all_words = [word for sublist in filtered_text for word in sublist]

# Get unique words
unique_words = set(all_words)

# Print unique words
print("Unique Words:")
for word in unique_words:
    print(word)

Unique Words:
house330
humanitarian
rose
93rd
overtaking
2010
century148
haaland219
20
revealed
deal
2013
endured
dr
preliminary
opportunity
159th
known
aviv
tear
klopp
exchange
port
humble
house
drop
personal
david
january
assist
450
sent
razgrad
season2425
losing
2014
forward
raising
title
muscle
project310
place138
flight171
29
across
francesco
stadium221
unstoppable
figure
gordon
performing
years154
jos
assisted
missed
midfielder
united178
american
shearer
brace
stronger
suggested
kept
advance123
another
shootout
strip
origin
joining
club148
pusks
nominated
earning
april
9
olympics237
controversially
final
derby
uruguay
allowance
advocate
saint
unbelievable
media314
service
68
22
jakobpark
oscar
allowed
filed
model
worldwide
128
squad
presidential
it331332
draw38
globally
help
edged
approval
52m
profile
sparta
cleared
robbed
delivered
rankhof
arising
leftback
forcing
mecca288
world339
newcastle
hoffenheim
child
hardworking
goals120
d254
sport
everton
greatest
201617
decision146147


In [None]:
less_three = [word for word in unique_words if len(word) < 3]
print(less_three)


['20', 'dr', '29', '9', '68', '22', '76', 'en', '12', '16', 'x', '1', '71', 'st', 'sl', '21', '36', '19', '26', '47', '23', '44', '3', '33', 'ac', '30', '32', '10', '4', '54', '40', 'wa', '43', 'el', '24', '42', 'fa', 'u', '8', '2', '63', '18', '52', '72', '13', '41', '61', '6', '7', 'al', '15', '65', '5', 'mo', '74', '53', '11', 'ha', '51', '55', 'go', '60', '66', '31', '17', '70', '38', 'gq', '95', '14', '50', '25', '28']
