In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Sample inputs for the cleaning / TF-IDF demo:
#   text1 - plain English prose containing punctuation, digits and a "%" sign
#   text2 - an HTML fragment (div/h2/h3/p/h4 tags) that also contains a bare "&"
text1 = "We need to finalize the demo corpus which will be used for this question and it should be done soon !!. It should be done by the ending of this month. But will it? This question has been run 4 times !! Not sure if it can be done on time with 100% accuracy."
text2 = "<div><h2> This is an NLP course </h2><h3>  The course requires reading two text books </h3><p>The first is named Practical NLP & the second is named My NLP  </p><h4>  The number of the course is 11223344</div>"

# Function to clean Text#1
def clean_text1(text):
    """Normalize plain text: lower-case it, drop punctuation, trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The lower-cased string with every punctuation character removed
        (underscore included) and leading/trailing whitespace stripped.
        Whitespace inside the text is left untouched, so deleting a
        free-standing punctuation token (e.g. " !! ") leaves two adjacent
        spaces behind.
    """
    text = text.lower()  # Convert to lower case
    # `\w` treats "_" as a word character, so `[^\w\s]` on its own would let
    # underscores slip through; match "_" explicitly as well.
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuations
    return text.strip()  # Remove leading and trailing whitespace

# Function to clean Text#2
def clean_text2(text):
    """Lower-case an HTML fragment and strip its tags.

    Args:
        text: The raw string, possibly containing HTML tags.

    Returns:
        The lower-cased text with every ``<...>`` tag removed, runs of
        whitespace collapsed to a single space, and no leading or trailing
        whitespace. Characters outside tags (including a bare "&" or HTML
        entities) are kept as they are.
    """
    text = text.lower()  # Convert to lower case
    # Replace each tag with a space rather than nothing: otherwise the words
    # of adjacent elements ("<p>one</p><p>two</p>") would be fused together.
    text = re.sub(r'<[^>]+>', ' ', text)  # Remove HTML tags
    # split()/join collapses the padding introduced above and trims the ends.
    return ' '.join(text.split())

# Function to perform TF/IDF representation
def tf_idf_representation(*texts):
    """Fit a fresh ``TfidfVectorizer`` on ``texts`` and return its result.

    Args:
        *texts: The documents to vectorize, passed as separate positional
            string arguments.

    Returns:
        A ``(feature_names, weights)`` pair: the vectorizer's output feature
        names and the fitted TF-IDF matrix converted to a dense array.
    """
    model = TfidfVectorizer()
    # The tuple of positional arguments is the corpus the model is fitted on.
    dense_weights = model.fit_transform(texts).toarray()
    feature_names = model.get_feature_names_out()
    return feature_names, dense_weights

# Clean the texts
cleaned_text1 = clean_text1(text1)
cleaned_text2 = clean_text2(text2)

# Perform TF/IDF representation (both cleaned texts are vectorized together)
vocab, tfidf_values = tf_idf_representation(cleaned_text1, cleaned_text2)

# Print before and after for both texts
print(f"Original Text#1:\n{text1}\nCleaned Text#1:\n{cleaned_text1}\n")
print(f"Original Text#2:\n{text2}\nCleaned Text#2:\n{cleaned_text2}\n")

# Print the TF-IDF weight of every vocabulary word for the first text.
# NOTE: these are TF-IDF weights (row 0 of the returned matrix), not raw IDF
# values; the header says so explicitly so the listing is not misread.
print("Vocabulary and TF-IDF Values (Text#1):")
for word, weight in zip(vocab, tfidf_values[0]):  # Only the first text, for brevity
    print(f"{word}: {weight}")

Original Text#1:
We need to finalize the demo corpus which will be used for this question and it should be done soon !!. It should be done by the ending of this month. But will it? This question has been run 4 times !! Not sure if it can be done on time with 100% accuracy.
Cleaned Text#1:
we need to finalize the demo corpus which will be used for this question and it should be done soon  it should be done by the ending of this month but will it this question has been run 4 times  not sure if it can be done on time with 100 accuracy

Original Text#2:
<div><h2> This is an NLP course </h2><h3>  The course requires reading two text books </h3><p>The first is named Practical NLP & the second is named My NLP  </p><h4>  The number of the course is 11223344</div>
Cleaned Text#2:
 this is an nlp course   the course requires reading two text books the first is named practical nlp & the second is named my nlp    the number of the course is 11223344

Vocabulary and IDF Values:
100: 0.1065474513852