In [7]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
# Small HTML fixture fed through the preprocessing steps in the cells below.
# The empty first and last entries reproduce the leading and trailing newline.
html_text = "\n".join([
    "",
    "<html>",
    "<head><title>Sample HTML Document</title></head>",
    "<body>",
    "<p>This is a sample HTML document containing <b>bold</b> and <i>italic</i> text.</p>",
    "<p>We'll use NLTK to preprocess this HTML text.</p>",
    "</body>",
    "</html>",
    "",
])

In [3]:
def process_html(html_text):
    """Strip HTML tags from ``html_text`` and return single-line plain text.

    Parameters
    ----------
    html_text : str
        Markup to clean.

    Returns
    -------
    str
        The text with every ``<...>`` tag removed, each run of whitespace
        (spaces, tabs, newlines) collapsed to one space, and leading and
        trailing whitespace stripped.

    Notes
    -----
    Tags are removed with a regular expression, not an HTML parser: HTML
    entities such as ``&amp;`` are left as-is, and the text content of
    elements like ``<script>`` is kept.
    """
    # Non-greedy match from '<' to the nearest '>' that contains no other '<'.
    without_tags = re.sub('<[^<]+?>', '', html_text)

    # `\s+` (not `\s`) so that consecutive newlines/blank lines left behind by
    # removed tags become a single space instead of several.
    result = re.sub(r'\s+', ' ', without_tags).strip()

    return result

In [4]:
# Remove the markup from the sample document and print the resulting plain text.
processed_text = process_html(html_text)
print(processed_text)

Sample HTML Document  This is a sample HTML document containing bold and italic text. We'll use NLTK to preprocess this HTML text.


In [5]:
# Split the cleaned text into tokens. As the output below shows, punctuation
# becomes its own token ('.') and the contraction "We'll" is split into
# 'We' and "'ll".
# NOTE(review): word_tokenize presumably requires NLTK's tokenizer data to be
# downloaded beforehand; no nltk.download(...) call is visible here — confirm.
tokens = word_tokenize(processed_text)
tokens

['Sample',
 'HTML',
 'Document',
 'This',
 'is',
 'a',
 'sample',
 'HTML',
 'document',
 'containing',
 'bold',
 'and',
 'italic',
 'text',
 '.',
 'We',
 "'ll",
 'use',
 'NLTK',
 'to',
 'preprocess',
 'this',
 'HTML',
 'text',
 '.']

In [6]:
# Drop English stop words (compared case-insensitively) and tokens that are a
# single punctuation character.
#
# The stop-word set is bound to `stop_words`, NOT `stopwords`: reusing the
# name would overwrite the imported `nltk.corpus.stopwords` reader with a set,
# so re-running this cell would fail with
# "AttributeError: 'set' object has no attribute 'words'".
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# The (misspelled) name `filterd_tokens` is kept because later cells read it.
filterd_tokens = [
    word
    for word in tokens
    if word.lower() not in stop_words and word not in punctuation
]
filterd_tokens

['Sample',
 'HTML',
 'Document',
 'sample',
 'HTML',
 'document',
 'containing',
 'bold',
 'italic',
 'text',
 "'ll",
 'use',
 'NLTK',
 'preprocess',
 'HTML',
 'text']

In [8]:
# Reduce every remaining token to its Porter stem. The output below shows the
# stems come back lower-cased (e.g. 'Sample' -> 'sampl').
porter_stemmer = PorterStemmer()
stemmed_tokens = list(map(porter_stemmer.stem, filterd_tokens))
stemmed_tokens

['sampl',
 'html',
 'document',
 'sampl',
 'html',
 'document',
 'contain',
 'bold',
 'ital',
 'text',
 "'ll",
 'use',
 'nltk',
 'preprocess',
 'html',
 'text']