In [6]:
# import requests
# from bs4 import BeautifulSoup

# def extract_html_text(url):
#     # Step 1: Fetch HTML from URL
#     response = requests.get(url)

#     if response.status_code == 200:
#         # Step 2: Extract text from HTML
#         html_content = response.text

#         # Step 3: Use BeautifulSoup to parse HTML and extract text
#         soup = BeautifulSoup(html_content, 'html.parser')

#         # Extracting text from paragraph tags (you can modify this according to your requirement)
#         text = '\n'.join([p.get_text() for p in soup.find_all('p')])

#         return text
#     else:
#         print("Failed to fetch URL:", url)
#         return None

# # Example usage:
# url = "https://www.healthline.com/nutrition/fasting-benefits"
# text = extract_html_text(url)
# if text:
#     print(text)


In [7]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages the pipeline below depends on:
#   punkt / punkt_tab -> tokenizer models behind word_tokenize(); recent NLTK
#                        releases load 'punkt_tab' instead of the older
#                        'punkt' package, so request both to work on either.
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def extract_html_text(url, timeout=10):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The text of every <p> tag joined with newlines, or None when the
        request fails or the server answers with a status other than 200.
    """
    # Step 1: Fetch HTML from URL. Network-level failures (DNS, refused
    # connection, timeout, malformed URL) are reported the same way as a
    # bad status code instead of propagating as an exception.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to fetch URL:", url, f"({exc})")
        return None

    if response.status_code != 200:
        print("Failed to fetch URL:", url)
        return None

    # Step 2: Parse the HTML and keep only paragraph text
    # (adjust the tag selection here if other elements are needed).
    soup = BeautifulSoup(response.text, 'html.parser')
    return '\n'.join(p.get_text() for p in soup.find_all('p'))

def preprocess_text(text):
    """Convert raw text into cleaned, lemmatized tokens without stop words.

    Pipeline: strip punctuation, lower-case, tokenize, drop English stop
    words, lemmatize.

    Args:
        text: The raw string to process.

    Returns:
        A list of lower-case lemmas in their original order; duplicates are
        kept.
    """
    # Step 1: Cleaning - remove every character that is neither a word
    # character nor whitespace.
    cleaned_text = re.sub(r'[^\w\s]', '', text)

    # Step 2: Normalization - lower-case everything.
    cleaned_text = cleaned_text.lower()

    # Step 3: Tokenization - split the text into words.
    tokens = word_tokenize(cleaned_text)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Steps 4 and 5: Stop-word removal and lemmatization.
    filtered_tokens = []
    for token in tokens:
        # Test the surface form BEFORE lemmatizing: the default (noun)
        # lemmatizer mangles some stop words ('was' -> 'wa', 'has' -> 'ha'),
        # and the mangled forms would no longer match the stop-word list.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Also drop tokens whose lemma is itself a stop word.
        if lemma not in stop_words:
            filtered_tokens.append(lemma)

    return filtered_tokens

# Example usage: fetch a page, preprocess its paragraph text, and print the
# distinct tokens that remain.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
text = extract_html_text(url)
# Skip processing when nothing usable came back (None or an empty string).
if text:
    preprocessed_tokens = preprocess_text(text)
    # A set removes duplicate tokens; its printed order is arbitrary.
    unique_words = set(preprocessed_tokens)
    print("Unique words:", unique_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


