In [11]:
import re
import unicodedata
from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from langdetect import detect
from nltk import pos_tag

def clean_html_tags(text):
    """Strip markup tags from ``text`` and return what is left.

    Everything from a ``<`` up to the next ``>`` is removed, including tags
    whose attributes wrap over several lines. Text between tags is left
    untouched; HTML entities such as ``&amp;`` are not decoded.

    Args:
        text: Markup string to clean.

    Returns:
        ``text`` with every ``<...>`` span removed.
    """
    # ``[^>]*`` also consumes newlines, unlike the ``.`` in ``<.*?>``, so a
    # tag split across lines is removed instead of leaking into the output.
    return re.sub(r'<[^>]*>', '', text)

def normalize_unicode(text):
    """Return ``text`` in NFC form with characters above U+FFFF dropped.

    The string is first composed with ``unicodedata.normalize('NFC', ...)``;
    any resulting character whose code point exceeds U+FFFF (for example the
    emoji in this file's sample input) is then discarded.
    """
    composed = unicodedata.normalize('NFC', text)
    kept = [ch for ch in composed if ord(ch) <= 0xFFFF]
    return ''.join(kept)

def correct_fat_finger(text):
    """Return ``text`` after running it through TextBlob's ``correct()``.

    The corrected blob is converted back to a plain ``str`` before returning.
    """
    return str(TextBlob(text).correct())

def tokenize_sentences(text):
    """Split ``text`` with NLTK's ``sent_tokenize`` and return the result."""
    return sent_tokenize(text)

def tokenize_words(sentence):
    """Split ``sentence`` with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(sentence)

def remove_stopwords(words):
    """Return the items of ``words`` that are not English stopwords.

    Membership is checked on the lowercased token against NLTK's English
    stopword list; tokens that are kept retain their original casing and
    order.
    """
    english_stopwords = set(stopwords.words("english"))
    kept = []
    for token in words:
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return kept

def stem_words(words):
    """Return a list with ``PorterStemmer.stem`` applied to each item of ``words``."""
    stem = PorterStemmer().stem
    return list(map(stem, words))

def lemmatize_words(words):
    """Return a list with ``WordNetLemmatizer.lemmatize`` applied to each item.

    Each word is passed on its own, so the lemmatizer's default arguments
    apply.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))

def remove_punctuation_digits(text):
    """Return ``text`` keeping only alphabetic and whitespace characters.

    Every character for which neither ``str.isalpha`` nor ``str.isspace`` is
    true (punctuation, digits, symbols) is dropped; the rest keep their order.
    """
    def _is_kept(ch):
        return ch.isalpha() or ch.isspace()

    return ''.join(filter(_is_kept, text))

def convert_to_lowercase(text):
    """Return ``text`` lowercased via ``str.lower``."""
    return text.lower()

def detect_language(text):
    """Return the result of langdetect's ``detect`` for ``text``.

    For the English sample in this file the recorded result is ``'en'``.
    """
    return detect(text)

def pos_tagging(sentence):
    """Tokenize ``sentence`` with ``word_tokenize`` and return ``pos_tag`` of the tokens."""
    return pos_tag(word_tokenize(sentence))

# Example: push one small HTML snippet through each helper and show the results.
input_text = "<html><body>Hello, world! 😊</body></html>"

# Text-level clean-up; each step consumes the previous step's output.
cleaned_text = clean_html_tags(input_text)
normalized_text = normalize_unicode(cleaned_text)
corrected_text = correct_fat_finger(normalized_text)

# Token-level steps; only the first detected sentence is tokenized.
sentences = tokenize_sentences(corrected_text)
words = tokenize_words(sentences[0])
filtered_words = remove_stopwords(words)
# Stemming and lemmatization both start from filtered_words, independently.
stemmed_words = stem_words(filtered_words)
lemmatized_words = lemmatize_words(filtered_words)

cleaned_text_no_punct_digits = remove_punctuation_digits(corrected_text)
# NOTE(review): the next two steps read cleaned_text (tags stripped only),
# not corrected_text -- confirm that is intended.
lowercased_text = convert_to_lowercase(cleaned_text)
language = detect_language(cleaned_text)

# Emit one "label value" line per result, in pipeline order.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("Original Text:", input_text),
            ("Cleaned Text:", cleaned_text),
            ("Normalized Text:", normalized_text),
            ("Corrected Text:", corrected_text),
            ("Tokenized Sentences:", sentences),
            ("Tokenized Words:", words),
            ("Words without Stopwords:", filtered_words),
            ("Stemmed Words:", stemmed_words),
            ("Lemmatized Words:", lemmatized_words),
            ("Text without Punctuation and Digits:", cleaned_text_no_punct_digits),
            ("Lowercased Text:", lowercased_text),
            ("Detected Language:", language),
        )
    ),
    sep="\n",
)


Original Text: <html><body>Hello, world! 😊</body></html>
Cleaned Text: Hello, world! 😊
Normalized Text: Hello, world! 
Corrected Text: Hello, world! 
Tokenized Sentences: ['Hello, world!']
Tokenized Words: ['Hello', ',', 'world', '!']
Words without Stopwords: ['Hello', ',', 'world', '!']
Stemmed Words: ['hello', ',', 'world', '!']
Lemmatized Words: ['Hello', ',', 'world', '!']
Text without Punctuation and Digits: Hello world 
Lowercased Text: hello, world! 😊
Detected Language: en
