# Stemming and Lemmatization for a Corpus

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [7]:
# Download every NLTK resource this cell relies on (a no-op when the package
# is already up to date):
#   - "reuters": corpus the sample document is read from
#   - "wordnet": dictionary backing WordNetLemmatizer
#   - "punkt" / "punkt_tab": sentence-tokenizer models used by word_tokenize
#     (newer NLTK releases look up "punkt_tab", older ones "punkt";
#     nltk.download() reports an unknown id instead of raising, so requesting
#     both is safe on either version).
for resource in ("reuters", "wordnet", "punkt", "punkt_tab"):
    nltk.download(resource)

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Sample text from the Reuters corpus
sample_text = reuters.raw('test/14826')

# Tokenize the text
words = word_tokenize(sample_text)

# Perform stemming
stemmed_words = [stemmer.stem(word) for word in words]

# Perform lemmatization.
# NOTE: lemmatize() is called without a POS tag, so every token is looked up
# as a noun (NLTK's default); verbs and adjectives are mostly left unchanged.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("Original Words:", words)



[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [8]:
# Display the Porter-stemmed form of every token, in document order
stemmed_label = "Stemmed words:"
print(stemmed_label, stemmed_words)

Stemmed words: ['asian', 'export', 'fear', 'damag', 'from', 'u.s.-japan', 'rift', 'mount', 'trade', 'friction', 'between', 'the', 'u.s.', 'and', 'japan', 'ha', 'rais', 'fear', 'among', 'mani', 'of', 'asia', "'s", 'export', 'nation', 'that', 'the', 'row', 'could', 'inflict', 'far-reach', 'econom', 'damag', ',', 'businessmen', 'and', 'offici', 'said', '.', 'they', 'told', 'reuter', 'correspond', 'in', 'asian', 'capit', 'a', 'u.s.', 'move', 'against', 'japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'u.s.', 'and', 'lead', 'to', 'curb', 'on', 'american', 'import', 'of', 'their', 'product', '.', 'but', 'some', 'export', 'said', 'that', 'while', 'the', 'conflict', 'would', 'hurt', 'them', 'in', 'the', 'long-run', ',', 'in', 'the', 'short-term', 'tokyo', "'s", 'loss', 'might', 'be', 'their', 'gain', '.', 'the', 'u.s.', 'ha', 'said', 'it', 'will', 'impos', '300', 'mln', 'dlr', 'of', 'tariff', 'on', 'import', 'of', 'japanes', 'electron', 'good', 'on', 'april', '17', ',', 'i

In [9]:
# Check whether stemming left the token list untouched (element-wise list
# equality between the original tokens and their stems)
stems_match_tokens = words == stemmed_words
print(stems_match_tokens)

False


In [10]:
# Display the WordNet-lemmatized form of every token, in document order
lemmatized_label = "Lemmatized words:"
print(lemmatized_label, lemmatized_words)

