In [None]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from word2number import w2n
from contractions import contractions_dict

# Fetch the NLTK data packages used below: 'punkt' (tokenizer data) and
# 'stopwords' (stopword lists).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def preprocess_text(text):
    """Clean raw (possibly HTML) text and return sentences, bigrams and tokens.

    Steps, in order: strip HTML tags, collapse whitespace, split into
    sentences, expand contractions, drop punctuation, lowercase, convert
    number words to digits, remove digits, remove English stopwords,
    extract frequent bigrams, and tokenize each cleaned sentence.

    Args:
        text: Raw input string; may contain HTML markup.

    Returns:
        dict with three keys:
            'sentences': list of cleaned sentence strings.
            'bigrams': at most 10 bigrams ranked by PMI, restricted to
                bigrams that pass ``apply_freq_filter(3)``.
            'tokens': list of token lists, one per cleaned sentence.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Collapse every run of whitespace (including newlines) to one space
    text = re.sub(r'\s+', ' ', text).strip()

    # Sentence boundary detection
    sentences = sent_tokenize(text)

    # Expand contractions.
    # The pattern is built once (not once per sentence). Keys are escaped so
    # they are matched literally, and the longest keys come first so that a
    # short key can never pre-empt a longer one that starts the same way.
    lowercase_lookup = {key.lower(): value
                        for key, value in contractions_dict.items()}
    contractions_pattern = re.compile(
        '|'.join(re.escape(key)
                 for key in sorted(contractions_dict, key=len, reverse=True)),
        flags=re.IGNORECASE)

    def expand_match(contraction):
        matched = contraction.group(0)
        # Exact spelling first, then a case-insensitive lookup; if neither
        # knows the text, keep it unchanged rather than returning None.
        return (contractions_dict.get(matched)
                or lowercase_lookup.get(matched.lower())
                or matched)

    def expand_contractions(sentence):
        expanded = contractions_pattern.sub(expand_match, sentence)
        # Drop apostrophes left over from unexpanded forms (e.g. possessives)
        return expanded.replace("'", "")

    sentences = [expand_contractions(sentence) for sentence in sentences]

    # Remove special characters. Hyphens become spaces first so compound
    # number words such as "twenty-three" stay as two parseable words
    # instead of fusing into "twentythree".
    sentences = [re.sub(r'[^\w\s]', '', sentence.replace('-', ' '))
                 for sentence in sentences]

    # Lowercase all texts
    sentences = [sentence.lower() for sentence in sentences]

    # Convert number words to numeric form, word by word. The result is
    # always a string so the regex substitution below never receives an int.
    def convert_number_words(sentence):
        converted = []
        for word in sentence.split():
            if word == 'point':
                # NOTE(review): word2number appears to read a lone "point"
                # as a decimal marker and return 0 for it; keep it as a
                # plain word. Confirm against the installed version.
                converted.append(word)
                continue
            try:
                converted.append(str(w2n.word_to_num(word)))
            except ValueError:
                # Not a number word: keep it unchanged
                converted.append(word)
        return ' '.join(converted)

    sentences = [convert_number_words(sentence) for sentence in sentences]

    # Remove numbers
    sentences = [re.sub(r'\d+', '', sentence) for sentence in sentences]

    # Remove stopwords (split() also swallows the gaps left by removed digits)
    stop_words = set(stopwords.words('english'))
    sentences = [' '.join(word for word in sentence.split()
                          if word not in stop_words)
                 for sentence in sentences]

    # Phrase extraction (Example using NLTK's collocations)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(
        word_tokenize(' '.join(sentences)))
    finder.apply_freq_filter(3)
    bigrams = finder.nbest(bigram_measures.pmi, 10)

    # Tokenization
    tokens = [word_tokenize(sentence) for sentence in sentences]

    return {
        'sentences': sentences,
        'bigrams': bigrams,
        'tokens': tokens
    }

# Small HTML fixture: markup, a number word, digits, punctuation and a few
# contractions.
dummy_data = """
<html>
    <head><title>Test Data</title></head>
    <body>
        This is a test sentence. It contains some HTML tags, numbers like fifty and 123, and special characters! Let's test contractions too: won't, haven't, they're.
    </body>
</html>
"""

# Run the pipeline once, then report each field of the result in turn.
preprocessed_data = preprocess_text(dummy_data)

for heading, field in (
    ("Sentences after preprocessing:", 'sentences'),
    ("Extracted Bigrams:", 'bigrams'),
    ("Tokenized Sentences:", 'tokens'),
):
    print(heading, preprocessed_data[field])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Sentences after preprocessing: ['test data test sentence', 'contains html tags numbers like fifty special characters', 'let us test contractions']
Extracted Bigrams: []
Tokenized Sentences: [['test', 'data', 'test', 'sentence'], ['contains', 'html', 'tags', 'numbers', 'like', 'fifty', 'special', 'characters'], ['let', 'us', 'test', 'contractions']]


In [None]:
# Install the third-party `contractions` package, which supplies the
# `contractions_dict` imported by the preprocessing cells.
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


In [None]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from word2number import w2n
from contractions import contractions_dict

# Make sure the NLTK data needed by this cell is present: 'punkt' for
# tokenization and 'stopwords' for stopword filtering.
for package_name in ('punkt', 'stopwords'):
    nltk.download(package_name)

def preprocess_text(text, *, verbose=True):
    """Clean raw (possibly HTML) text, optionally printing every stage.

    Steps, in order: strip HTML tags, collapse whitespace, split into
    sentences, expand contractions, drop punctuation, lowercase, convert
    number words to digits, remove digits, remove English stopwords,
    extract frequent bigrams, and tokenize each cleaned sentence.

    Args:
        text: Raw input string; may contain HTML markup.
        verbose: When true (the default), print the intermediate result of
            every stage. Pass ``verbose=False`` to run silently.

    Returns:
        dict with three keys:
            'sentences': list of cleaned sentence strings.
            'bigrams': at most 10 bigrams ranked by PMI, restricted to
                bigrams that pass ``apply_freq_filter(3)``.
            'tokens': list of token lists, one per cleaned sentence.
    """
    def trace(label, value):
        # Every stage report shares one layout: label, colon, value, blank line.
        if verbose:
            print(f"{label}:\n{value}\n")

    # Initial Text
    trace("Original Text", text)

    # Remove HTML tags
    text_no_html = BeautifulSoup(text, "html.parser").get_text()
    trace("Text after removing HTML tags", text_no_html)

    # Remove extra whitespaces
    text_no_whitespace = re.sub(r'\s+', ' ', text_no_html).strip()
    trace("Text after removing extra whitespaces", text_no_whitespace)

    # Sentence boundary detection
    sentences = sent_tokenize(text_no_whitespace)
    trace("Sentences after boundary detection", sentences)

    # Expand contractions.
    # The pattern is built once (not once per sentence). Keys are escaped so
    # they are matched literally, and the longest keys come first so that a
    # short key can never pre-empt a longer one that starts the same way.
    lowercase_lookup = {key.lower(): value
                        for key, value in contractions_dict.items()}
    contractions_pattern = re.compile(
        '|'.join(re.escape(key)
                 for key in sorted(contractions_dict, key=len, reverse=True)),
        flags=re.IGNORECASE)

    def expand_match(contraction):
        matched = contraction.group(0)
        # Exact spelling first, then a case-insensitive lookup; if neither
        # knows the text, keep it unchanged rather than returning None.
        return (contractions_dict.get(matched)
                or lowercase_lookup.get(matched.lower())
                or matched)

    def expand_contractions(sentence):
        expanded = contractions_pattern.sub(expand_match, sentence)
        # Drop apostrophes left over from unexpanded forms (e.g. possessives)
        return expanded.replace("'", "")

    sentences_expanded = [expand_contractions(sentence) for sentence in sentences]
    trace("Sentences after expanding contractions", sentences_expanded)

    # Remove special characters. Hyphens become spaces first so compound
    # number words such as "twenty-three" stay as two parseable words
    # instead of fusing into "twentythree".
    sentences_no_special_chars = [
        re.sub(r'[^\w\s]', '', sentence.replace('-', ' '))
        for sentence in sentences_expanded
    ]
    trace("Sentences after removing special characters", sentences_no_special_chars)

    # Lowercase all texts
    sentences_lowercase = [sentence.lower() for sentence in sentences_no_special_chars]
    trace("Sentences after converting to lowercase", sentences_lowercase)

    # Convert number words to numeric form
    def convert_number_words(sentence):
        new_words = []
        for word in sentence.split():
            if word == 'point':
                # NOTE(review): word2number appears to read a lone "point"
                # as a decimal marker and return 0 for it; keep it as a
                # plain word. Confirm against the installed version.
                new_words.append(word)
                continue
            try:
                # Convert number words to numbers if possible
                new_words.append(str(w2n.word_to_num(word)))
            except ValueError:
                # Keep the word if it's not a number word
                new_words.append(word)
        return ' '.join(new_words)

    sentences_numbers = [convert_number_words(sentence) for sentence in sentences_lowercase]
    trace("Sentences after converting number words to numeric form", sentences_numbers)

    # Remove numbers
    sentences_no_numbers = [re.sub(r'\d+', '', sentence) for sentence in sentences_numbers]
    trace("Sentences after removing numbers", sentences_no_numbers)

    # Remove stopwords (split() also swallows the gaps left by removed digits)
    stop_words = set(stopwords.words('english'))
    sentences_no_stopwords = [
        ' '.join(word for word in sentence.split() if word not in stop_words)
        for sentence in sentences_no_numbers
    ]
    trace("Sentences after removing stopwords", sentences_no_stopwords)

    # Phrase extraction (Example using NLTK's collocations)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(
        word_tokenize(' '.join(sentences_no_stopwords)))
    finder.apply_freq_filter(3)
    bigrams = finder.nbest(bigram_measures.pmi, 10)
    trace("Extracted Bigrams", bigrams)

    # Tokenization
    tokens = [word_tokenize(sentence) for sentence in sentences_no_stopwords]
    trace("Tokenized Sentences", tokens)

    return {
        'sentences': sentences_no_stopwords,
        'bigrams': bigrams,
        'tokens': tokens
    }

# Sample HTML document used as input: headings and paragraphs containing
# number words, digits, punctuation/symbols, contractions and a few
# two-word phrases.
dummy_data = """<html>
    <head>
        <title>Sample HTML Document</title>
    </head>
    <body>
        <h1>Welcome to NLP Processing</h1>
        <p>This is a sample document with various HTML    elements. It includes numbers like one hundred and twenty-three, and also special characters: @, #, $. Furthermore, let's test contractions such as won't, can't, and doesn't. Lastly, check out some common phrases: data science, machine learning, and artificial intelligence!</p>
        <p>Another paragraph with different content. Numbers such as 2024, 15.6, and twenty-five should be handled. Also, more HTML tags and additional special characters like & and * should be removed.</p>
    </body>
</html>"""


# Run the pipeline on the sample document; the result dict is kept in
# `preprocessed_data`.
preprocessed_data = preprocess_text(dummy_data)


Original Text:
<html>
    <head>
        <title>Sample HTML Document</title>
    </head>
    <body>
        <h1>Welcome to NLP Processing</h1>
        <p>This is a sample document with various HTML    elements. It includes numbers like one hundred and twenty-three, and also special characters: @, #, $. Furthermore, let's test contractions such as won't, can't, and doesn't. Lastly, check out some common phrases: data science, machine learning, and artificial intelligence!</p>
        <p>Another paragraph with different content. Numbers such as 2024, 15.6, and twenty-five should be handled. Also, more HTML tags and additional special characters like & and * should be removed.</p>
    </body>
</html>

Text after removing HTML tags:


Sample HTML Document


Welcome to NLP Processing
This is a sample document with various HTML    elements. It includes numbers like one hundred and twenty-three, and also special characters: @, #, $. Furthermore, let's test contractions such as won't, can't, a

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, RegexpParser
from word2number import w2n
from contractions import contractions_dict


# Download the NLTK data packages for this cell. 'punkt' and 'stopwords'
# serve tokenization and stopword filtering, and
# 'averaged_perceptron_tagger' serves POS tagging.
# NOTE(review): 'maxent_ne_chunker' and 'words' are not referenced by the
# code visible in this cell; presumably kept for later use.
for nltk_package in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(nltk_package)

def preprocess_text(text, *, verbose=True):
    """Clean raw (possibly HTML) text and extract bigrams, phrases and tokens.

    Steps, in order: strip HTML tags, collapse whitespace, split into
    sentences, expand contractions, drop punctuation, lowercase, convert
    number words to digits, remove digits, remove English stopwords,
    extract frequent bigrams, chunk noun phrases, and tokenize each
    cleaned sentence.

    Args:
        text: Raw input string; may contain HTML markup.
        verbose: When true (the default), print the intermediate result of
            every stage. Pass ``verbose=False`` to run silently.

    Returns:
        dict with four keys:
            'sentences': list of cleaned sentence strings.
            'bigrams': at most 10 bigrams ranked by PMI, restricted to
                bigrams that pass ``apply_freq_filter(3)``.
            'phrases': unique noun-phrase strings in first-seen order.
            'tokens': list of token lists, one per cleaned sentence.
    """
    def trace(label, value):
        # Every stage report shares one layout: label, colon, value, blank line.
        if verbose:
            print(f"{label}:\n{value}\n")

    trace("Original Text", text)

    # Strip HTML markup, keeping only the text nodes
    text_no_html = BeautifulSoup(text, "html.parser").get_text()
    trace("Text after removing HTML tags", text_no_html)

    # Collapse every run of whitespace (including newlines) to one space
    text_no_whitespace = re.sub(r'\s+', ' ', text_no_html).strip()
    trace("Text after removing extra whitespaces", text_no_whitespace)

    # Sentence boundary detection
    sentences = sent_tokenize(text_no_whitespace)
    trace("Sentences after boundary detection", sentences)

    # Expand contractions.
    # The pattern is built once (not once per sentence). Keys are escaped so
    # they are matched literally, and the longest keys come first so that a
    # short key can never pre-empt a longer one that starts the same way.
    lowercase_lookup = {key.lower(): value
                        for key, value in contractions_dict.items()}
    contractions_pattern = re.compile(
        '|'.join(re.escape(key)
                 for key in sorted(contractions_dict, key=len, reverse=True)),
        flags=re.IGNORECASE)

    def expand_match(contraction):
        matched = contraction.group(0)
        # Exact spelling first, then a case-insensitive lookup; if neither
        # knows the text, keep it unchanged rather than returning None.
        return (contractions_dict.get(matched)
                or lowercase_lookup.get(matched.lower())
                or matched)

    def expand_contractions(sentence):
        expanded = contractions_pattern.sub(expand_match, sentence)
        # Drop apostrophes left over from unexpanded forms (e.g. possessives)
        return expanded.replace("'", "")

    sentences_expanded = [expand_contractions(sentence) for sentence in sentences]
    trace("Sentences after expanding contractions", sentences_expanded)

    # Remove special characters. Hyphens become spaces first so compound
    # number words such as "twenty-three" stay as two parseable words
    # instead of fusing into "twentythree".
    sentences_no_special_chars = [
        re.sub(r'[^\w\s]', '', sentence.replace('-', ' '))
        for sentence in sentences_expanded
    ]
    trace("Sentences after removing special characters", sentences_no_special_chars)

    sentences_lowercase = [sentence.lower() for sentence in sentences_no_special_chars]
    trace("Sentences after converting to lowercase", sentences_lowercase)

    # Convert number words to digit strings, one word at a time
    def convert_number_words(sentence):
        new_words = []
        for word in sentence.split():
            if word == 'point':
                # NOTE(review): word2number appears to read a lone "point"
                # as a decimal marker and return 0 for it; keep it as a
                # plain word. Confirm against the installed version.
                new_words.append(word)
                continue
            try:
                new_words.append(str(w2n.word_to_num(word)))
            except ValueError:
                # Not a number word: keep it unchanged
                new_words.append(word)
        return ' '.join(new_words)

    sentences_numbers = [convert_number_words(sentence) for sentence in sentences_lowercase]
    trace("Sentences after converting number words to numeric form", sentences_numbers)

    sentences_no_numbers = [re.sub(r'\d+', '', sentence) for sentence in sentences_numbers]
    trace("Sentences after removing numbers", sentences_no_numbers)

    # Remove stopwords (split() also swallows the gaps left by removed digits)
    stop_words = set(stopwords.words('english'))
    sentences_no_stopwords = [
        ' '.join(word for word in sentence.split() if word not in stop_words)
        for sentence in sentences_no_numbers
    ]
    trace("Sentences after removing stopwords", sentences_no_stopwords)

    # Frequent bigrams over the whole cleaned text, ranked by PMI
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(
        word_tokenize(' '.join(sentences_no_stopwords)))
    finder.apply_freq_filter(3)
    bigrams = finder.nbest(bigram_measures.pmi, 10)
    trace("Extracted Bigrams", bigrams)

    # Noun-phrase chunking: optional determiner, any adjectives, then nouns.
    # NOTE(review): the input here is already lowercased and stopword-free,
    # so <DT> presumably never matches and tagging quality may suffer;
    # <NN> also does not cover plural/proper noun tags. Confirm intent.
    def extract_phrases(joined_text):
        tagged = pos_tag(word_tokenize(joined_text))
        tree = RegexpParser("NP: {<DT>?<JJ>*<NN>+}").parse(tagged)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is stable across runs (a set's order is not).
        return list(dict.fromkeys(
            " ".join(word for word, _tag in subtree.leaves())
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP')
        ))

    all_sentences_text = ' '.join(sentences_no_stopwords)
    phrases = extract_phrases(all_sentences_text)
    trace("Extracted Phrases", phrases)

    # Per-sentence tokenization of the cleaned sentences
    tokens = [word_tokenize(sentence) for sentence in sentences_no_stopwords]
    trace("Tokenized Sentences", tokens)

    return {
        'sentences': sentences_no_stopwords,
        'bigrams': bigrams,
        'phrases': phrases,
        'tokens': tokens
    }


# Sample HTML document used as input: headings and paragraphs containing
# number words, digits, punctuation/symbols, contractions and a few
# two-word phrases.
html_data = """
<html>
    <head>
        <title>Sample HTML Document</title>
    </head>
    <body>
        <h1>Welcome to NLP Processing</h1>
        <p>This is a sample document with various HTML elements. It includes numbers like one hundred and twenty-three, and also special characters: @, #, $. Furthermore, let's test contractions such as won't, can't, and doesn't. Lastly, check out some common phrases: data science, machine learning, and artificial intelligence!</p>
        <p>Another paragraph with different content. Numbers such as 2024, 15.6, and twenty-five should be handled. Also, more HTML tags and additional special characters like & and * should be removed.</p>
    </body>
</html>
"""


# Run the pipeline on the sample document; the result dict is kept in
# `preprocessed_data`.
preprocessed_data = preprocess_text(html_data)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Original Text:

<html>
    <head>
        <title>Sample HTML Document</title>
    </head>
    <body>
        <h1>Welcome to NLP Processing</h1>
        <p>This is a sample document with various HTML elements. It includes numbers like one hundred and twenty-three, and also special characters: @, #, $. Furthermore, let's test contractions such as won't, can't, and doesn't. Lastly, check out some common phrases: data science, machine learning, and artificial intelligence!</p>
        <p>Another paragraph with different content. Numbers such as 2024, 15.6, and twenty-five should be handled. Also, more HTML tags and additional special characters like & and * should be removed.</p>
    </body>
</html>


Text after removing HTML tags:



Sample HTML Document


Welcome to NLP Processing
This is a sample document with various HTML elements. It includes numbers like one hundred and twenty-three, and also special characters: @, #, $. Furthermore, let's test contractions such as won't, can't, and 