## Stemming in NLTK

In [1]:
import spacy
import nltk

In [2]:
# Build a single Porter stemmer instance; the stemming cells below reuse it.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

In [6]:
# Inflected and derived forms of a few English words, used to see what the
# Porter stemmer reduces each one to.
words = [
    "eating", "eats", "eat", "ate",
    "adjustable", "rafting", "ability", "meeting",
]

# Print every word next to its stem, formatted as "word  |  stem".
for word in words:
    print(f"{word}  |  {stemmer.stem(word)}")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


In [22]:
# A second batch of words for the stemmer; the matching spaCy cell further
# down lemmatizes a near-identical list for comparison.
lst_words = [
    'running', 'painting', 'walking', 'dressing', 'likely',
    'children', 'whom', 'good', 'ate', 'fishing',
]

# Print every word next to its stem, formatted as "word  |  stem".
for word in lst_words:
    print(f"{word}  |  {stemmer.stem(word)}")

running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  like
children  |  children
whom  |  whom
good  |  good
ate  |  ate
fishing  |  fish


## Lemmatization in spaCy

In [8]:
# Load the small English pipeline; later cells reuse `nlp`.
nlp = spacy.load('en_core_web_sm')

# Same words as the first stemming example, plus "better".
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")

# For each token show its text, its lemma as a string (`lemma_`), and the
# integer ID spaCy stores for that lemma (`lemma`).
for token in doc:
    print(f"{token.text} | {token.lemma_}  |  {token.lemma}")

eating | eat  |  9837207709914848172
eats | eat  |  9837207709914848172
eat | eat  |  9837207709914848172
ate | eat  |  9837207709914848172
adjustable | adjustable  |  6033511944150694480
rafting | raft  |  7154368781129989833
ability | ability  |  11565809527369121409
meeting | meeting  |  14798207169164081740
better | well  |  4525988469032889948


In [9]:
# Lemmatize a full sentence, so the tokens appear in context rather than as
# an isolated word list.
doc = nlp("Mando talked for 3 hours although talking isn't his thing")

# Show text, lemma string and integer lemma ID for every token.
for token in doc:
    print(f"{token.text} | {token.lemma_}  |  {token.lemma}")

Mando | Mando  |  7837215228004622142
talked | talk  |  13939146775466599234
for | for  |  16037325823156266367
3 | 3  |  602994839685422785
hours | hour  |  9748623380567160636
although | although  |  343236316598008647
talking | talk  |  13939146775466599234
is | be  |  10382539506755952630
n't | not  |  447765159362469301
his | his  |  2661093235354845946
thing | thing  |  2473243759842082748


In [24]:
# Lemmatize a word list that parallels the second stemming example
# (note: "who" here versus 'whom' in the stemming list).
doc = nlp("running painting walking dressing likely children who good ate fishing")

# Print every token next to its lemma, formatted as "token  |  lemma".
for token in doc:
    print(f"{token.text}  |  {token.lemma_}")


running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  likely
children  |  child
who  |  who
good  |  good
ate  |  eat
fishing  |  fishing


## Customizing the lemmatizer

In [13]:
# Display the names of the components in the loaded pipeline, in order.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [20]:
# Fetch the pipeline's attribute_ruler component so that token attributes
# can be overridden with custom rules.
ar = nlp.get_pipe('attribute_ruler')

# Two single-token patterns, matched on token text, that should both be
# assigned the lemma "Brother".
slang_patterns = [[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]]
ar.add(slang_patterns, {"LEMMA": "Brother"})

# Re-run the pipeline on a sentence containing both slang forms and print
# each token next to its lemma, formatted as "token | lemma".
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(f"{token.text} | {token.lemma_}")

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust
