In [11]:
!pip install --upgrade spacy
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 5.4 MB/s 
Collecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
[K     |████████████████████████████████| 660 kB 75.9 MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 44.3 MB/s 
[?25hCollecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (457 kB)
[K     |██████████████████████████████

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.4 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
import spacy #spacy has only lemmatization
import nltk #Nltk has both stemming and Lemmatization futures 

In [2]:
nlp = spacy.load("en_core_web_sm")
text = "This is amazingly beautiful and hatching eggs i am watching this watched this and went to go island"

doc = nlp(text)
for t in doc:
  print(t,"|",t.lemma_) #spacy does not have much option for you to control over the Lemmatization process

This | this
is | be
amazingly | amazingly
beautiful | beautiful
and | and
hatching | hatch
eggs | egg
i | I
am | be
watching | watch
this | this
watched | watch
this | this
and | and
went | go
to | to
go | go
island | island


In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [3]:
#Stemming is returning the common base word and sometimes the baseword could have no meaning as it does not have any linguistic knowledge

words = ['Connects','Connecting','Connections','Connected','Connection','Connectings','Connect','Connector']

from nltk import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer

ps = PorterStemmer()
ss = SnowballStemmer(language='english')
ls = LancasterStemmer()
rs = RegexpStemmer('ing$|s$|e$|able$', min=4)

print("{0:20}{1:20}{2:20}{3:30}{4:40}".format("Word","Porter Stemmer","Snowball Stemmer","Lancaster Stemmer","Regexp Stemmer"))
for word in words:
    print("{0:20}{1:20}{2:20}{3:30}{4:40}".format(word,ps.stem(word),ss.stem(word),ls.stem(word),rs.stem(word)))

Word                Porter Stemmer      Snowball Stemmer    Lancaster Stemmer             Regexp Stemmer                          
Connects            connect             connect             connect                       Connect                                 
Connecting          connect             connect             connect                       Connect                                 
Connections         connect             connect             connect                       Connection                              
Connected           connect             connect             connect                       Connected                               
Connection          connect             connect             connect                       Connection                              
Connectings         connect             connect             connect                       Connecting                              
Connect             connect             connect             connect                

In [4]:
# Lemmatization is stemming the word with the existing linguistic knowledge

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

wnl = WordNetLemmatizer()
for word in words:
  print(word,"||",wnl.lemmatize(word))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Connects || Connects
Connecting || Connecting
Connections || Connections
Connected || Connected
Connection || Connection
Connectings || Connectings
Connect || Connect
Connector || Connector


In [5]:
#You can modify the behaviour of lemmatization by attribute ruler like this example you can add the slang words such as bro,brah and lemmatize the words to Brother

ar = nlp.get_pipe('attribute_ruler')

ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]],{"LEMMA":"Brother"})

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust
