In [10]:
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import re

In [11]:
# Fetch the NLTK data this notebook relies on: the Punkt tokenizer models used
# by word_tokenize and the WordNet corpus used by WordNetLemmatizer.
# Recent NLTK releases (3.9+) load the tokenizer from the 'punkt_tab' package
# instead of the pickled 'punkt' one, so both are requested to keep a fresh
# "Restart & Run All" working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
# Sample passage that the rest of the notebook tokenizes; the adjacent string
# literals are concatenated by Python into a single string.
text = (
    "The quick brown foxes are jumping over the lazy dogs. "
    "Writing and believing are two important skills."
)

In [13]:
# Split the sample text into word and punctuation tokens with NLTK's English
# tokenizer (the language is spelled out here; it is also the default).
tokens = word_tokenize(text, language='english')

In [22]:
# One instance of each NLTK stemmer, created in a single unpacking assignment
# (construction order is unchanged: Porter, Lancaster, Snowball).
porter, lancaster, snowball = (
    PorterStemmer(),
    LancasterStemmer(),
    SnowballStemmer(language='english'),
)

In [23]:
# Compiled once instead of on every call. The regex engine picks the leftmost
# position at which a listed suffix runs to the end of the word, so a longer
# suffix such as 'ies' wins over the 'es' / 's' it contains.
_SUFFIX_PATTERN = re.compile(r'(?:ing|ly|ed|ious|ies|ive|es|s|ment)$')


def regex_stem(word):
    """Strip one common English suffix from the end of ``word``.

    This is a deliberately naive, case-sensitive stemmer: it removes a single
    trailing suffix from a fixed list and does nothing else (no lowercasing,
    no repair of the remaining stem).

    Args:
        word: Token to stem.

    Returns:
        ``word`` without its trailing suffix. The input is returned unchanged
        when no listed suffix is present, or when removing the suffix would
        leave nothing (for example the tokens "s" or "ing"), so a non-empty
        token never collapses to an empty string.
    """
    stem = _SUFFIX_PATTERN.sub('', word)
    # Guard: a token made up solely of a suffix must not become ''.
    return stem if stem else word

In [24]:
# Shared lemmatizer instance reused by every call to wordnet_lemmatize.
wordnet_lemmatizer = WordNetLemmatizer()


def wordnet_lemmatize(word, pos='v'):
    """Return the WordNet lemma of ``word`` for a given part of speech.

    Args:
        word: Token to lemmatize. It is passed to the lemmatizer as-is (no
            case folding is done here).
        pos: WordNet part-of-speech tag forwarded to
            ``WordNetLemmatizer.lemmatize`` (e.g. 'n' noun, 'v' verb,
            'a' adjective, 'r' adverb). Defaults to 'v' so that
            single-argument calls keep treating every token as a verb.

    Returns:
        The lemma produced by the shared ``wordnet_lemmatizer`` instance.
    """
    return wordnet_lemmatizer.lemmatize(word, pos=pos)

In [25]:
# Run every stemmer / lemmatizer over the same token list, producing one
# result list per technique (each the same length and order as `tokens`).
porter_stemmed = list(map(porter.stem, tokens))
lancaster_stemmed = list(map(lancaster.stem, tokens))
regex_stemmed = list(map(regex_stem, tokens))
snowball_stemmed = list(map(snowball.stem, tokens))
wordnet_lemmatized = list(map(wordnet_lemmatize, tokens))

In [27]:
# Print the untouched tokens first, then each technique's output under its own
# label; the leading "\n" in every label leaves a blank line between results.
print("Original Tokens:", tokens)
labelled_results = [
    ("\nPorter Stemmer:", porter_stemmed),
    ("\nLancaster Stemmer:", lancaster_stemmed),
    ("\nRegex Stemmer:", regex_stemmed),
    ("\nSnowball Stemmer:", snowball_stemmed),
    ("\nWordNet Lemmatizer:", wordnet_lemmatized),
]
for label, result in labelled_results:
    print(label, result)

Original Tokens: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', '.', 'Writing', 'and', 'believing', 'are', 'two', 'important', 'skills', '.']

Porter Stemmer: ['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'write', 'and', 'believ', 'are', 'two', 'import', 'skill', '.']

Lancaster Stemmer: ['the', 'quick', 'brown', 'fox', 'ar', 'jump', 'ov', 'the', 'lazy', 'dog', '.', 'writ', 'and', 'believ', 'ar', 'two', 'import', 'skil', '.']

Regex Stemmer: ['The', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazy', 'dog', '.', 'Writ', 'and', 'believ', 'are', 'two', 'important', 'skill', '.']

Snowball Stemmer: ['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'write', 'and', 'believ', 'are', 'two', 'import', 'skill', '.']

WordNet Lemmatizer: ['The', 'quick', 'brown', 'fox', 'be', 'jump', 'over', 'the', 'lazy', 'dog', '.', 'Writing', 'and', 'believe', 'be', 'two', 'important', 'skills