In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) 

In [5]:
## Stemming in NLTK

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()

In [7]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

for word in words:
    print(word, "|", stemmer.stem(word))

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [8]:
## Lemmatization in Spacy

In [9]:
import spacy

In [11]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("Mando talked for 3 hours although talking isn't his thing")
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc:
    print(token, " | ", token.lemma_,  token.lemma)

eating  |  eat 9837207709914848172
eats  |  eat 9837207709914848172
eat  |  eat 9837207709914848172
ate  |  eat 9837207709914848172
adjustable  |  adjustable 6033511944150694480
rafting  |  raft 7154368781129989833
ability  |  ability 11565809527369121409
meeting  |  meeting 14798207169164081740
better  |  well 4525988469032889948


In [12]:
## Customizing lemmatizer

In [13]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [14]:

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token, " | ", token.lemma_,  token.lemma)

Bro  |  bro 3493238095688267532
,  |  , 2593208677638477497
you  |  you 7624161793554793053
wanna  |  wanna 13000462173222681081
go  |  go 8004577259940138793
?  |  ? 8205403955989537350
Brah  |  Brah 5645766505577852541
,  |  , 2593208677638477497
do  |  do 2158845516055552166
n't  |  not 447765159362469301
say  |  say 8685289367999165211
no  |  no 13055779130471031426
!  |  ! 17494803046312582752
I  |  I 4690420944186131903
am  |  be 10382539506755952630
exhausted  |  exhaust 5738807065439247694


In [16]:
ar = nlp.get_pipe('attribute_ruler')

ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]],{"LEMMA":"Brother"}) # adding custom rule

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [17]:
## exercise

In [19]:
# Exercise1:

# Convert these list of words into base form using Stemming and Lemmatization and observe the transformations
# Write a short note on the words that have different base words using stemming and Lemmatization

In [20]:
#using stemming in nltk
lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']

In [21]:
for word in lst_words:
    print(word, "|", stemmer.stem(word))

running | run
painting | paint
walking | walk
dressing | dress
likely | like
children | children
whom | whom
good | good
ate | ate
fishing | fish


In [22]:
#using lemmatization in spacy

doc = nlp("running painting walking dressing likely children who good ate fishing")

In [28]:

for token in doc:
    print(token, " | ", token.lemma_," | ",  token.lemma)

running  |  run  |  12767647472892411841
painting  |  paint  |  16929211676819693673
walking  |  walk  |  1674876016505392235
dressing  |  dress  |  12815368344456308931
likely  |  likely  |  6740298879949941214
children  |  child  |  737253710922290542
who  |  who  |  3876862883474502309
good  |  good  |  5711639017775284443
ate  |  eat  |  9837207709914848172
fishing  |  fishing  |  10959402079719336560


In [31]:
from nltk.stem import WordNetLemmatizer

In [32]:
lemma = WordNetLemmatizer()

In [40]:
# lst_words

In [44]:
import nltk

# nltk.download('wordnet')

In [39]:
for word in lst_words:
    print(word, "|", lemma.lemmatize(word))

running | running
painting | painting
walking | walking
dressing | dressing
likely | likely
children | child
whom | whom
good | good
ate | ate
fishing | fishing


In [41]:
# Exercise2:

# convert the given text into it's base form using both stemming and lemmatization

In [42]:
text = """Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.
"""

In [47]:
#using stemming in nltk


#step1: Word tokenizing (spacy)

nlp = spacy.load("en_core_web_sm")


doc = nlp(text)

In [48]:
doc

Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.

In [59]:

for word in doc:
    print(word, stemmer.stem(word))

TypeError: 'int' object is not callable

In [67]:
using_spacy = []

for word in doc:
    print(word, "|", stemmer.stem(str(word)))
    using_spacy.append(stemmer.stem(str(word)))

Latha | latha
is | is
very | veri
multi | multi
talented | talent
girl | girl
. | .
She | she
is | is
good | good
at | at
many | mani
skills | skill
like | like
dancing | danc
, | ,
running | run
, | ,
singing | sing
, | ,
playing | play
. | .
She | she
also | also
likes | like
eating | eat
Pav | pav
Bhagi | bhagi
. | .
she | she
has | ha
a | a
habit | habit
of | of
fishing | fish
and | and
swimming | swim
too | too
. | .
Besides | besid
all | all
this | thi
, | ,
she | she
is | is
a | a
wonderful | wonder
at | at
cooking | cook
too | too
. | .

 | 



In [68]:
using_spacy

['latha',
 'is',
 'veri',
 'multi',
 'talent',
 'girl',
 '.',
 'she',
 'is',
 'good',
 'at',
 'mani',
 'skill',
 'like',
 'danc',
 ',',
 'run',
 ',',
 'sing',
 ',',
 'play',
 '.',
 'she',
 'also',
 'like',
 'eat',
 'pav',
 'bhagi',
 '.',
 'she',
 'ha',
 'a',
 'habit',
 'of',
 'fish',
 'and',
 'swim',
 'too',
 '.',
 'besid',
 'all',
 'thi',
 ',',
 'she',
 'is',
 'a',
 'wonder',
 'at',
 'cook',
 'too',
 '.',
 '\n']

In [69]:
" ".join(using_spacy)

'latha is veri multi talent girl . she is good at mani skill like danc , run , sing , play . she also like eat pav bhagi . she ha a habit of fish and swim too . besid all thi , she is a wonder at cook too . \n'

In [72]:
#step1: Word tokenizing (nltk)

doc2 = nltk.word_tokenize(text)

In [70]:
using_nltk = []

for word in doc2:
    print(word, "|", stemmer.stem(word))
    using_nltk.append(stemmer.stem(word))

Latha | latha
is | is
very | veri
multi | multi
talented | talent
girl.She | girl.sh
is | is
good | good
at | at
many | mani
skills | skill
like | like
dancing | danc
, | ,
running | run
, | ,
singing | sing
, | ,
playing.She | playing.sh
also | also
likes | like
eating | eat
Pav | pav
Bhagi | bhagi
. | .
she | she
has | ha
a | a
habit | habit
of | of
fishing | fish
and | and
swimming | swim
too.Besides | too.besid
all | all
this | thi
, | ,
she | she
is | is
a | a
wonderful | wonder
at | at
cooking | cook
too | too
. | .


In [71]:
" ".join(using_nltk)

'latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .'

In [75]:
#using lemmatisation in spacy


#step2: getting the base form for each token using spacy 'lemma_'

spacy_lemma = []

for token in doc:
    print(token, " | ", spacy.explain(token.pos_), " | ", token.lemma_)
    spacy_lemma.append(token.lemma_)


Latha  |  proper noun  |  Latha
is  |  auxiliary  |  be
very  |  adverb  |  very
multi  |  adjective  |  multi
talented  |  adjective  |  talented
girl  |  noun  |  girl
.  |  punctuation  |  .
She  |  pronoun  |  she
is  |  auxiliary  |  be
good  |  adjective  |  good
at  |  adposition  |  at
many  |  adjective  |  many
skills  |  noun  |  skill
like  |  adposition  |  like
dancing  |  noun  |  dancing
,  |  punctuation  |  ,
running  |  noun  |  running
,  |  punctuation  |  ,
singing  |  noun  |  singing
,  |  punctuation  |  ,
playing  |  verb  |  play
.  |  punctuation  |  .
She  |  pronoun  |  she
also  |  adverb  |  also
likes  |  verb  |  like
eating  |  verb  |  eat
Pav  |  proper noun  |  Pav
Bhagi  |  proper noun  |  Bhagi
.  |  punctuation  |  .
she  |  pronoun  |  she
has  |  verb  |  have
a  |  determiner  |  a
habit  |  noun  |  habit
of  |  adposition  |  of
fishing  |  noun  |  fishing
and  |  coordinating conjunction  |  and
swimming  |  verb  |  swim
too  |  adverb  

In [77]:

#step3: joining all words in a list into string using 'join()'

" ".join(spacy_lemma)

'Latha be very multi talented girl . she be good at many skill like dancing , running , singing , play . she also like eat Pav Bhagi . she have a habit of fishing and swim too . besides all this , she be a wonderful at cook too . \n'