In [1]:
import numpy as np
import pandas as pd
import warnings
# Notebook-wide setup: ignore every warning category, and lift pandas'
# display limits so no column is hidden and no cell text is truncated.
warnings.simplefilter('ignore')
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [5]:
## Stemming in NLTK

In [6]:
from nltk.stem import PorterStemmer
# Create one Porter stemmer instance; the stemming loop in the next cell reuses it.
stemmer = PorterStemmer()

In [7]:
words = [
    "eating", "eats", "eat", "ate",
    "adjustable", "rafting", "ability", "meeting",
]

# Show each word next to the stem the Porter stemmer produces for it.
for word in words:
    stemmed = stemmer.stem(word)
    print(f"{word} | {stemmed}")

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [8]:
## Lemmatization in spaCy

In [9]:
import spacy

In [11]:
# Load the small English pipeline once; later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

# Lemmatize the same words used in the stemming demo, plus "better", so the
# two approaches can be compared directly. Only one text is parsed here:
# `doc` holds exactly the text whose tokens are printed below.
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc:
    # token.lemma_ is the lemma as text; token.lemma is the integer ID
    # spaCy uses for that same lemma string (the large numbers in the output).
    print(token, " | ", token.lemma_,  token.lemma)

eating  |  eat 9837207709914848172
eats  |  eat 9837207709914848172
eat  |  eat 9837207709914848172
ate  |  eat 9837207709914848172
adjustable  |  adjustable 6033511944150694480
rafting  |  raft 7154368781129989833
ability  |  ability 11565809527369121409
meeting  |  meeting 14798207169164081740
better  |  well 4525988469032889948


In [12]:
## Customizing the lemmatizer

In [13]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [14]:

# Baseline run before any customization: print each token with its lemma
# text and the lemma's integer ID.
sentence = "Bro, you wanna go? Brah, don't say no! I am exhausted"
doc = nlp(sentence)
for token in doc:
    print(f"{token}  |  {token.lemma_} {token.lemma}")

Bro  |  bro 3493238095688267532
,  |  , 2593208677638477497
you  |  you 7624161793554793053
wanna  |  wanna 13000462173222681081
go  |  go 8004577259940138793
?  |  ? 8205403955989537350
Brah  |  Brah 5645766505577852541
,  |  , 2593208677638477497
do  |  do 2158845516055552166
n't  |  not 447765159362469301
say  |  say 8685289367999165211
no  |  no 13055779130471031426
!  |  ! 17494803046312582752
I  |  I 4690420944186131903
am  |  be 10382539506755952630
exhausted  |  exhaust 5738807065439247694


In [16]:
# Fetch the pipeline's attribute ruler so a custom lemma rule can be attached.
ar = nlp.get_pipe('attribute_ruler')

# Custom rule: tokens whose text is exactly "Bro" or "Brah" get the lemma
# "Brother".
brother_attrs = {"LEMMA": "Brother"}
# The ruler lives on the shared `nlp` object, so add the rule only if it is
# not already present. Without this guard, every re-run of the cell would
# append the same rule again.
if not any(rule.get("attrs") == brother_attrs for rule in ar.patterns):
    ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]], brother_attrs)

# Re-parse the same sentence to show the customized lemmas.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust
