# Import Necessary Libraries

In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Sample Text

In [9]:
# Raw multi-paragraph passage that the tokenization cell splits into sentences.
# The string starts and ends with a newline and contains one blank line
# separating its two paragraphs.
# NOTE(review): the text contains typographic quotes (’ “ ”) and what looks like
# a garbled phrase ("other holy Everyone called Coweryone") — confirm whether
# this is the intended sample before relying on the processed output.
sample_text = """
The Cow is a very useful animal and gives us milk. Milk considered a complete and nutritious food. The Cow is a domestic and religious animal. In India, it is a ritual and custom to worship Cow. Cow’s milk used in the pooja, Abhishek, and other holy Everyone called Coweryone “Gau Mata” to give her mother-like status in the Hindu religion. It has a large body, four legs, one long tail, two horns, two ears, two eyes, one big nose, one big mouth, and also one head. Moreover cow found in almost every region of the country.

It is found in different shapes and sizes. Cow found in our country become small however big cows found in other countries. We should take good care of the Cow and give her quality food and clean water. She eats green grasses, food, grains, hay, and other things. First, she chews the food well and slowly swallows to her stomach. Her back is long and wide.
"""

# Tokenization

In [10]:
import nltk

# Fetch the NLTK resources the later cells rely on:
#   - 'punkt' / 'punkt_tab': sentence and word tokenizer models. Recent NLTK
#     releases load the tokenizer from 'punkt_tab', older ones from 'punkt';
#     downloading both keeps sent_tokenize/word_tokenize working on either.
#   - 'stopwords': the English stopword list.
#   - 'wordnet': the dictionary used by WordNetLemmatizer.
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap. The 'wordnet' call stays last so the cell still displays
# its return value.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Split the raw passage into sentences, then split every sentence into its
# word/punctuation tokens. `words` is a list of token lists, one per sentence,
# in the same order as `sentences`.
sentences = sent_tokenize(sample_text)
words = list(map(word_tokenize, sentences))

# Lowercasing and Removing Special Characters

In [12]:
# Normalise every token: lowercase it, then strip every character that is not
# an ASCII letter or digit. Tokens that were pure punctuation (".", ",", "’",
# quotes, ...) become empty strings after stripping; they are dropped here so
# they do not survive the stopword filter and show up as stray/trailing spaces
# when the sentences are joined back together.
# The outer list keeps one entry per sentence (possibly an empty list), so it
# stays aligned with `sentences`.
cleaned_words = [
    [
        token
        for token in (re.sub(r'[^a-zA-Z0-9]', '', word.lower()) for word in sentence)
        if token
    ]
    for sentence in words
]

# Removing Stopwords

In [13]:
# Build the English stopword lookup as a set for fast membership tests, then
# keep only the tokens of each sentence that are not stopwords.
stop_words = set(stopwords.words('english'))
filtered_words = [
    [token for token in tokens if token not in stop_words]
    for tokens in cleaned_words
]

# Stemming and Lemmatization

In [14]:
# Two alternative normalisations of the stopword-filtered tokens:
#   stemmed_words    - Porter stem of every token
#   lemmatized_words - WordNet lemma of every token (lemmatize() is called
#                      with its default arguments)
# Both keep the per-sentence list-of-lists layout of `filtered_words`.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_words = [list(map(stemmer.stem, tokens)) for tokens in filtered_words]
lemmatized_words = [list(map(lemmatizer.lemmatize, tokens)) for tokens in filtered_words]

# Printing Processed Sentences

In [15]:
# Show the untouched sentences first, one per line.
print("Original Sentences:")
for original in sentences:
    print(original)

# Then show each sentence's lemmatized tokens, separated by single spaces.
print("\nProcessed Sentences (Lemmatized):")
for tokens in lemmatized_words:
    print(*tokens, sep=' ')

Original Sentences:

The Cow is a very useful animal and gives us milk.
Milk considered a complete and nutritious food.
The Cow is a domestic and religious animal.
In India, it is a ritual and custom to worship Cow.
Cow’s milk used in the pooja, Abhishek, and other holy Everyone called Coweryone “Gau Mata” to give her mother-like status in the Hindu religion.
It has a large body, four legs, one long tail, two horns, two ears, two eyes, one big nose, one big mouth, and also one head.
Moreover cow found in almost every region of the country.
It is found in different shapes and sizes.
Cow found in our country become small however big cows found in other countries.
We should take good care of the Cow and give her quality food and clean water.
She eats green grasses, food, grains, hay, and other things.
First, she chews the food well and slowly swallows to her stomach.
Her back is long and wide.

Processed Sentences (Lemmatized):
cow useful animal give u milk 
milk considered complete nutri