In [15]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

In [10]:
# Sample sentence that the rest of the notebook cleans, tokenizes and normalizes.
text = "The curious cat jumped onto the windowsill to watch the birds outside."

In [11]:
def clean_text(text):
    """Normalize raw text into lowercase, space-separated alphabetic words.

    Steps, in order: lowercase, drop URLs, drop apostrophes, turn every
    other non-letter character into a space, then collapse whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, containing only the letters a-z separated by
        single spaces, with no leading or trailing whitespace. Digits and
        non-ASCII letters are treated like punctuation.
    """
    text = text.lower()  # lowercase
    text = re.sub(r"http\S+", "", text)  # remove URLs
    # Drop apostrophes outright so contractions stay one word ("don't" -> "dont").
    text = re.sub(r"['’]", "", text)
    # Replace any other non-letter with a space instead of deleting it, so
    # words joined only by punctuation ("hello,world") are not fused together.
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

In [12]:
# Run the sample sentence through clean_text; later cells tokenize this result.
cleaned_text = clean_text(text)

In [13]:
# Bare expression: echo the cleaned string as the cell output for inspection.
cleaned_text

'the curious cat jumped onto the windowsill to watch the birds outside'

In [16]:
# word_tokenize relies on the Punkt tokenizer data, which is distributed
# separately from the nltk package; fetch it so this cell also runs on a fresh
# environment instead of raising LookupError. Newer NLTK releases look for
# "punkt_tab" and older ones for "punkt", so request both.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Split the cleaned sentence into a list of word tokens.
tokens = word_tokenize(cleaned_text)

In [17]:
# Echo the token list as the cell output.
tokens

['the',
 'curious',
 'cat',
 'jumped',
 'onto',
 'the',
 'windowsill',
 'to',
 'watch',
 'the',
 'birds',
 'outside']

In [18]:
# The stop-word lists are a separate NLTK data package; download it so the
# lookup below does not raise LookupError on a fresh environment.
nltk.download("stopwords", quiet=True)

# A set makes the membership test in the filter below a constant-time lookup.
stop_words = set(stopwords.words("english"))
# Keep only the tokens that are not English stop words, preserving their order.
filtered_tokens = [word for word in tokens if word not in stop_words]

In [19]:
# Echo the tokens that remain after stop-word removal.
filtered_tokens

['curious', 'cat', 'jumped', 'onto', 'windowsill', 'watch', 'birds', 'outside']

In [20]:
# WordNetLemmatizer reads the WordNet corpus, a separate NLTK data package;
# download it so this cell does not raise LookupError on a fresh environment.
nltk.download("wordnet", quiet=True)

lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument, so each token is
# treated as a noun: plural nouns are reduced ("birds" -> "bird") while verb
# forms such as "jumped" come back unchanged.
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

In [21]:
# Echo the lemmatized tokens as the cell output.
lemmatized_tokens

['curious', 'cat', 'jumped', 'onto', 'windowsill', 'watch', 'bird', 'outside']

In [22]:
# Stem the stop-word-filtered tokens (not the lemmatized ones) so the two
# normalization approaches can be compared on the same input.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

In [23]:
# Echo the stemmed tokens as the cell output.
stemmed_tokens

['curiou', 'cat', 'jump', 'onto', 'windowsil', 'watch', 'bird', 'outsid']

In [24]:
# Recap of every pipeline stage: one "label value" line per stage, in order.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("Original Text:", text),
            ("Cleaned Text:", cleaned_text),
            ("Tokens:", tokens),
            ("Filtered Tokens (No Stop Words):", filtered_tokens),
            ("Lemmatized Tokens:", lemmatized_tokens),
            ("Stemmed Tokens:", stemmed_tokens),
        )
    ),
    sep="\n",
)

Original Text: The curious cat jumped onto the windowsill to watch the birds outside.
Cleaned Text: the curious cat jumped onto the windowsill to watch the birds outside
Tokens: ['the', 'curious', 'cat', 'jumped', 'onto', 'the', 'windowsill', 'to', 'watch', 'the', 'birds', 'outside']
Filtered Tokens (No Stop Words): ['curious', 'cat', 'jumped', 'onto', 'windowsill', 'watch', 'birds', 'outside']
Lemmatized Tokens: ['curious', 'cat', 'jumped', 'onto', 'windowsill', 'watch', 'bird', 'outside']
Stemmed Tokens: ['curiou', 'cat', 'jump', 'onto', 'windowsil', 'watch', 'bird', 'outsid']
