In [1]:
import nltk

In [2]:
from nltk.stem.porter import PorterStemmer

In [3]:
# Porter stemmer instance; the loop further down calls ps.stem() on each word.
ps = PorterStemmer()

In [5]:
# Sample vocabulary: inflections of "run" plus two "-ly" adverbs.
words = [
    'run',
    'runner',
    'ran',
    'runs',
    'fairly',
    'easily',
]

In [6]:
# Print every word next to the stem that the current `ps` stemmer produces.
for word in words:
    stem = ps.stem(word)
    print(f'{word}------->{stem}')

run------->run
runner------->runner
ran------->ran
runs------->run
fairly------->fairli
easily------->easili


In [7]:
from nltk.stem.snowball import SnowballStemmer

In [8]:
# Rebinds `ps` (previously the PorterStemmer instance) to an English Snowball
# stemmer, so the following cells stem with Snowball instead.
ps = SnowballStemmer(language='english')

In [9]:
# Same word list as before, now stemmed with whatever `ps` currently is.
for word in words:
    stem = ps.stem(word)
    print(f'{word}------->{stem}')

run------->run
runner------->runner
ran------->ran
runs------->run
fairly------->fair
easily------->easili


In [10]:
# Second sample: words sharing the "gener-" prefix, to compare their stems.
words = [
    'generous',
    'generation',
    'generously',
    'generate',
]

In [11]:
# Stem the "gener-" words with the current `ps` stemmer and show the mapping.
for word in words:
    stem = ps.stem(word)
    print(f'{word}------->{stem}')

generous------->generous
generation------->generat
generously------->generous
generate------->generat


# Lemmatization

In contrast to stemming, lemmatization looks beyond word reduction and considers a language's full vocabulary to apply a morphological analysis to words.

Lemmatization is typically seen as much more informative than simple stemming, which is why spaCy has opted to provide only lemmatization rather than stemming.

In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` spaCy pipeline; the model package must be
# installed in the kernel's environment for this call to succeed.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Run the pipeline over a sentence containing several inflections of "run".
# NOTE(review): "because T love" looks like a typo for "because I love"; the
# recorded output tags "T" as PROPN. Text left untouched to match that output.
doc = nlp('I am a runner running in a race because T love to run since I ran today.')

In [4]:
# Bare last expression: the cell output echoes the Doc's text.
doc

I am a runner running in a race because T love to run since I ran today.

In [6]:
# One row per token: surface text, POS tag, integer lemma id, lemma string.
# The separator reproduces the "space, tab, space" spacing between columns.
for token in doc:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
T 	 PROPN 	 5582244037879929967 	 T
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [7]:
# f-string formatting

def show_lemma(text):
    """Print an aligned lemma table, one row per token.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` attributes (the notebook passes a spaCy
            ``Doc``).

    Returns:
        None. Rows are written to stdout.

    Each row holds four space-separated columns: the token text padded to 12
    characters, the POS tag padded to 6, the ``lemma`` value left-aligned in
    22, and the lemma string unpadded.
    """
    for tok in text:
        columns = (
            format(tok.text, '12'),
            format(tok.pos_, '6'),
            format(tok.lemma, '<22'),  # explicit '<' because numbers right-align by default
            format(tok.lemma_, ''),
        )
        print(' '.join(columns))

In [8]:
# Short second sentence used to try out show_lemma.
doc1 = nlp('I saw ten nice today!')

In [10]:
# Print the aligned text / POS / lemma-id / lemma table for doc1.
show_lemma(doc1)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
nice         ADJ    14121509715367036122   nice
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !
