In [3]:
import nltk
# Import the one class this notebook uses by name; a wildcard import would
# hide where PorterStemmer comes from and pull unrelated names into the kernel.
from nltk.stem.porter import PorterStemmer

# Porter stemmer instance used by the first stemming demo.
p_stemmer = PorterStemmer()

In [5]:
# Run a mix of related and unrelated words through the Porter stemmer
# and print every word next to the stem it is reduced to.
words = ["run", "runner", "running", "ran", "runs", "easily", "fairly"]

for word in words:
    print(f"{word}--> {p_stemmer.stem(word)}")

run--> run
runner--> runner
running--> run
ran--> ran
runs--> run
easily--> easili
fairly--> fairli


In [6]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for English.
s_stemmer = SnowballStemmer("english")

# Same word list as the Porter demo so the two outputs can be compared.
words = ["run", "runner", "running", "ran", "runs", "easily", "fairly"]

for word in words:
    print("{}-->{}".format(word, s_stemmer.stem(word)))

run-->run
runner-->runner
running-->run
ran-->ran
runs-->run
easily-->easili
fairly-->fair


In [8]:
# Words that look alike but differ in meaning, stemmed with Snowball.
words = ["generous", "generation", "generously", "generate"]

for word in words:
    print(f"{word} --> {s_stemmer.stem(word)}")

generous --> generous
generation --> generat
generously --> generous
generate --> generat


In [9]:
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# One Porter and one Lancaster stemmer, bound to short names for the
# comparison cells.
ps, ls = PorterStemmer(), LancasterStemmer()

# Forms of "to be", stemmed with Porter.
words = [
    "is",
    "was",
    "be",
    "been",
    "are",
    "were",
]

for word in words:
    porter_stem = ps.stem(word)
    print(porter_stem)

is
wa
be
been
are
were


In [10]:
# Stem the current `words` list with the Lancaster stemmer, one stem per line.
for word in words:
    lancaster_stem = ls.stem(word)
    print(lancaster_stem)

is
was
be
been
ar
wer


In [11]:
# Variations on "book", stemmed with Porter.
words = ["book", "booking", "booked", "books", "booker", "bookstore"]

for word in words:
    porter_stem = ps.stem(word)
    print(porter_stem)

book
book
book
book
booker
bookstor


In [12]:
# Stem the current `words` list with the Lancaster stemmer, one stem per line.
for w in words:
    lancaster_stem = ls.stem(w)
    print(lancaster_stem)

book
book
book
book
book
bookst


In [13]:
# Tokenise a full sentence and print the Lancaster stem of every token.
sentence = "had you booked the air booking yet ? if not try to book it ASAP since booking will be out of books"
words = word_tokenize(sentence)

for word in words:
    lancaster_stem = ls.stem(word)
    print(lancaster_stem)

had
you
book
the
air
book
yet
?
if
not
try
to
book
it
asap
sint
book
wil
be
out
of
book


In [14]:
# Side-by-side table: each word with its Porter and Lancaster stems,
# every column padded to 20 characters.
word_list = [
    "friend",
    "friendship",
    "friends",
    "friendships",
    "stabil",
    "destabilize",
    "misunderstanding",
    "railroad",
    "moonlight",
    "football",
]

print(f"{'Word':20} {'PorterStemmer':20} {'LancasterStemmer':20}")

for word in word_list:
    print(f"{word:20} {ps.stem(word):20} {ls.stem(word):20}")


Word                 PorterStemmer        LancasterStemmer    
friend               friend               friend              
friendship           friendship           friend              
friends              friend               friend              
friendships          friendship           friend              
stabil               stabil               stabl               
destabilize          destabil             dest                
misunderstanding     misunderstand        misunderstand       
railroad             railroad             railroad            
moonlight            moonlight            moonlight           
football             footbal              footbal             


In [16]:
import spacy

# Load the small English pipeline and parse one sentence.
nlp = spacy.load("en_core_web_sm")
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

# For every token print: text, numeric POS id, numeric lemma id, lemma string.
# `pos` / `lemma` (no trailing underscore) are the integer ids.
for token in doc1:
    print(token.text, token.pos, token.lemma, token.lemma_, sep=' \t ')

I 	 95 	 4690420944186131903 	 I
am 	 87 	 10382539506755952630 	 be
a 	 90 	 11901859001352538922 	 a
runner 	 92 	 12640964157389618806 	 runner
running 	 100 	 12767647472892411841 	 run
in 	 85 	 3002984154512732771 	 in
a 	 90 	 11901859001352538922 	 a
race 	 92 	 8048469955494714898 	 race
because 	 98 	 16950148841647037698 	 because
I 	 95 	 4690420944186131903 	 I
love 	 100 	 3702023516439754181 	 love
to 	 94 	 3791531372978436496 	 to
run 	 100 	 12767647472892411841 	 run
since 	 98 	 10066841407251338481 	 since
I 	 95 	 4690420944186131903 	 I
ran 	 100 	 12767647472892411841 	 run
today 	 92 	 11042482332948150395 	 today


In [17]:
def show_lemmas(text):
    """Print one aligned row per token of ``text``.

    Each row holds the token's ``text`` (padded to 12), ``pos_`` (padded
    to 6), integer ``lemma`` (left-aligned, padded to 22) and ``lemma_``,
    separated by single spaces.

    Args:
        text: an iterable of token-like objects exposing the four
            attributes above (e.g. a parsed spaCy document).

    Returns:
        None; the rows are written to standard output.
    """
    row_template = "{:12} {:6} {:<22} {}"
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [19]:
# Parse a short sentence and print text, POS tag, lemma id and lemma string
# for each of its tokens via show_lemmas.
doc2 = nlp("I saw eighteen mice today!")
show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [22]:
from nltk.stem import WordNetLemmatizer

# WordNet-based lemmatizer, called here without a part-of-speech argument.
lemmatizer = WordNetLemmatizer()

words = [
    "cats",
    "catchi",
    "radii",
    "feet",
    "speech",
    "runner",
]

for word in words:
    lemma = lemmatizer.lemmatize(word)
    print(lemma)

cat
catchi
radius
foot
speech
runner


In [23]:
# Same surface form, two parts of speech: first lemmatized as a noun ("n"),
# then as a verb ("v").
print(lemmatizer.lemmatize("meeting", pos="n"))
print(lemmatizer.lemmatize("meeting", pos="v"))

meeting
meet


In [24]:
# WordNetLemmatizer instance bound to `word_net`, the name the
# sentence-level lemmatization cells call.
word_net = WordNetLemmatizer()

In [25]:
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"

# Tokenise, then keep only the tokens that are not punctuation.
# A new list is built instead of calling remove() on the list being iterated:
# removing during iteration shifts the remaining items left, so the token right
# after each removed one is never examined and back-to-back punctuation
# tokens would be left in.
# Note: `word not in punctuations` is a substring test against the string above.
sentence_word = [
    word for word in word_tokenize(sentence) if word not in punctuations
]

sentence_word

['He',
 'was',
 'running',
 'and',
 'eating',
 'at',
 'same',
 'time',
 'He',
 'has',
 'bad',
 'habit',
 'of',
 'swimming',
 'after',
 'playing',
 'long',
 'hours',
 'in',
 'the',
 'Sun']

In [28]:
# Two-column table (each column padded to 20): token and its lemma when no
# part of speech is passed to the lemmatizer.
print(f"{'Word':20}{'Lemma':20}")
for word in sentence_word:
    print(f"{word:20}{word_net.lemmatize(word):20}")

Word                Lemma               
He                  He                  
was                 wa                  
running             running             
and                 and                 
eating              eating              
at                  at                  
same                same                
time                time                
He                  He                  
has                 ha                  
bad                 bad                 
habit               habit               
of                  of                  
swimming            swimming            
after               after               
playing             playing             
long                long                
hours               hour                
in                  in                  
the                 the                 
Sun                 Sun                 


In [30]:
# Same two-column layout, but every token is lemmatized as a verb (pos="v").
for word in sentence_word:
    verb_lemma = word_net.lemmatize(word, pos="v")
    print(f"{word:20}{verb_lemma:20}")

He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
