In [5]:
# Text: The original word text.
# Lemma: The base form of the word.
# POS: The simple part-of-speech tag.
# Tag: The detailed part-of-speech tag.
# Dep: Syntactic dependency, i.e. the relation between tokens.
# Shape: The word shape – capitalization, punctuation, digits.
# is alpha: Is the token an alpha character?
# is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [2]:
x = "Srija is learning NLP"
import spacy
nlp = spacy.load("en_core_web_sm")
doc= nlp(x)
for token in doc:
    print(token.pos_)

PROPN
AUX
VERB
PROPN


In [1]:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

data = []

for token in doc:
    data.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                 token.shape_, token.is_alpha, t_oken.isstop])

columns = ["Text", "Lemma", "POS", "Tag", "Dependency", "Shape", "IsAlpha", "IsStop"]
output = pd.DataFrame(data, columns=columns)
output


Unnamed: 0,Text,Lemma,POS,Tag,Dependency,Shape,IsAlpha,IsStop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,dobj,X.X.,False,False
6,startup,startup,NOUN,NN,dep,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


## Tokenization

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")
mystring=u'"We\'r moving to L.A.!"'
doc= nlp(mystring)
for token in doc:
    print(token.text,token.pos_)

" PUNCT
We'r NOUN
moving VERB
to ADP
L.A. PROPN
! PUNCT
" PUNCT


In [6]:
doc=nlp(u"that person's cat was very cute")
for chunk in doc.noun_chunks:
    print(chunk)

that person's cat


In [8]:
from spacy import displacy
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")
displacy.render(doc,style="dep",jupyter=True, options={'distance':100})

In [14]:
options={'distance':110,'compact':'True','color':'yellow','bg':'green','font':"Times"}
displacy.render(doc,style='dep',jupyter=True,options=options)

In [10]:
displacy.render(doc,style="ent",jupyter=True, options={'distance':110})

In [11]:
displacy.serve(doc,style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### Stemming

In [7]:
import nltk
#Porter Stemmer is specifically designed for English stemming
from nltk.stem.porter import PorterStemmer
p_stemmer=PorterStemmer()

In [8]:
words=['run','runner','ran','runs','easily','fairly']
for word in words:
    print(word + '-------->' + p_stemmer.stem(word))

run-------->run
runner-------->runner
ran-------->ran
runs-------->run
easily-------->easili
fairly-------->fairli


In [9]:
from nltk.stem.snowball import SnowballStemmer
s_stemmer= SnowballStemmer(language="english")
for word in words:
    print(word + '------>' + s_stemmer.stem(word))

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fair


### lemmatization

In [6]:
import spacy
nlp=spacy.load('en_core_web_sm')
doc1=nlp(u"I am a runner running in a race because I love to run since I ran today")
for token in doc1:
    print(token.text,'\t',token.pos_,'\t',token.lemma_)

I 	 PRON 	 I
am 	 AUX 	 be
a 	 DET 	 a
runner 	 NOUN 	 runner
running 	 VERB 	 run
in 	 ADP 	 in
a 	 DET 	 a
race 	 NOUN 	 race
because 	 SCONJ 	 because
I 	 PRON 	 I
love 	 VERB 	 love
to 	 PART 	 to
run 	 VERB 	 run
since 	 SCONJ 	 since
I 	 PRON 	 I
ran 	 VERB 	 run
today 	 NOUN 	 today


In [15]:
def showlemmas(doc):
    for token in doc:
        print(f'{token.text:{12}} {token.pos_:{7}}{token.lemma_}')

In [16]:
doc2=nlp(u"I saw 10 mice today")
showlemmas(doc2)

I            PRON   I
saw          VERB   see
10           NUM    10
mice         NOUN   mouse
today        NOUN   today


### stop Words

In [27]:
import spacy
nlp=spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)
len(nlp.Defaults.stop_words)

{'side', 'made', 'n‘t', 'four', 'ours', 'these', 'used', 'his', 'behind', 'hereby', 'of', 'whatever', 'bottom', 'regarding', 'was', 'whenever', 'with', 'from', 'always', 'over', 'whole', 'him', 'although', 'whereas', 'be', 'latterly', 'does', 'become', 'were', 'when', 'became', 'though', 'did', 'whereupon', 'forty', 'had', 'would', 'in', 'sometimes', 'should', 'our', '‘m', 'are', 'amongst', 'where', 'most', 'around', 'also', 'so', 'move', 'hence', 'no', 'to', 'many', 'becomes', 'other', 'seems', 'yourselves', 'upon', 'this', 'otherwise', 'per', 'beside', '’ve', 'whereafter', 'again', 'somehow', 'them', 'being', 'else', 'me', "'d", 'about', 'thereafter', 'nine', 'out', '‘d', 'along', 'anywhere', 'together', 'eleven', 'toward', "'ve", 'until', 'seem', 'has', 'itself', 'thru', 'her', '‘ve', 'hundred', 'it', 'ever', 'few', 'too', 'whose', 'see', 'myself', 'sixty', 'whence', 'twelve', 'top', 'mostly', 'across', 'take', 'they', 'least', 'one', 'ourselves', 'up', 'doing', 'towards', 'make', '

326

In [24]:
nlp.vocab['btw'].is_stop

True

In [30]:
#add a stop word
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop

True

In [28]:
len(nlp.Defaults.stop_words)

326

In [31]:
# remove a stop word
nlp.Defaults.stop_words.remove('btw')
nlp.vocab['btw'].is_stop

True

## POS Count

In [19]:
import spacy
nlp=spacy.load('en_core_web_sm')
doc=nlp(u"The quick brown fox jumped over the lazy dog's back")

for token in doc:
    print(token.text, token.pos_)

The DET
quick ADJ
brown ADJ
fox NOUN
jumped VERB
over ADP
the DET
lazy ADJ
dog NOUN
's PART
back NOUN


In [11]:
import spacy
nlp=spacy.load('en_core_web_sm')
doc=nlp(u"The quick brown fox jumped over the lazy dog's back")

speech = {}

for token in doc:
    try:
        speech[token.pos_] += 1
    except KeyError:
        speech[token.pos_] = 1

In [12]:
speech

{'DET': 2, 'ADJ': 3, 'NOUN': 3, 'VERB': 1, 'ADP': 1, 'PART': 1}

In [34]:
from collections import defaultdict
speech_default = defaultdict(float)

for token in doc:
    speech_default[token.pos_] += 1
    
speech_default

defaultdict(float,
            {'DET': 2.0,
             'ADJ': 3.0,
             'NOUN': 3.0,
             'VERB': 1.0,
             'ADP': 1.0,
             'PART': 1.0})

In [4]:
for token in doc:
    print(f"{token.text:{11}}{token.tag_:{5}}{spacy.explain(token.tag_):{20}}")

The        DT   determiner          
quick      JJ   adjective (English), other noun-modifier (Chinese)
brown      JJ   adjective (English), other noun-modifier (Chinese)
fox        NN   noun, singular or mass
jumped     VBD  verb, past tense    
over       IN   conjunction, subordinating or preposition
the        DT   determiner          
lazy       JJ   adjective (English), other noun-modifier (Chinese)
dog        NN   noun, singular or mass
's         POS  possessive ending   
back       NN   noun, singular or mass


### POS counting and tagging

In [13]:
import spacy
nlp=spacy.load('en_core_web_sm')
doc=nlp(u"The quick brown fox jumped over the lazy dog's back")

In [14]:
print(doc[4].tag_)

VBD


In [16]:
spacy.explain(doc[4].tag_)

'verb, past tense'

In [17]:
# Print the fifth word and associated tags:
print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))

jumped VERB VBD verb, past tense


In [24]:
for token in doc:
    print(f"{token.text:{15}} {token.pos_:{7}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

The             DET     DT    determiner
quick           ADJ     JJ    adjective (English), other noun-modifier (Chinese)
brown           ADJ     JJ    adjective (English), other noun-modifier (Chinese)
fox             NOUN    NN    noun, singular or mass
jumped          VERB    VBD   verb, past tense
over            ADP     IN    conjunction, subordinating or preposition
the             DET     DT    determiner
lazy            ADJ     JJ    adjective (English), other noun-modifier (Chinese)
dog             NOUN    NN    noun, singular or mass
's              PART    POS   possessive ending
back            NOUN    NN    noun, singular or mass


In [26]:
POS_counts= doc.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1}

In [21]:
print(doc.vocab[84].text)
print(doc[2].pos_)
print(doc[2].pos)

ADJ
ADJ
84


In [22]:
POS_counts.items()

dict_items([(90, 2), (84, 3), (92, 3), (100, 1), (85, 1), (94, 1)])

In [23]:
for k,v in sorted(POS_counts.items()):
    print(f"{k:{5}} {doc.vocab[k].text :{5}} {v}")

   84 ADJ   3
   85 ADP   1
   90 DET   2
   92 NOUN  3
   94 PART  1
  100 VERB  1


In [24]:
# Lets make a dictionary of POS fine grain POS tags
TAG_counts= doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f"{k:{25}} {doc.vocab[k].text :{4}} {spacy.explain(doc.vocab[k].text):{60}} {v}")

                       74 POS  possessive ending                                            1
      1292078113972184607 IN   conjunction, subordinating or preposition                    1
     10554686591937588953 JJ   adjective (English), other noun-modifier (Chinese)           3
     15267657372422890137 DT   determiner                                                   2
     15308085513773655218 NN   noun, singular or mass                                       3
     17109001835818727656 VBD  verb, past tense                                             1


In [25]:
#SYNTACTIC DEPENDENCY
DEP_counts= doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f"{k:{25}} {doc.vocab[k].text :{7}} {spacy.explain(doc.vocab[k].text):{40}} {v}")

                      400 advmod  adverbial modifier                       1
                      402 amod    adjectival modifier                      3
                      415 det     determiner                               2
                      429 nsubj   nominal subject                          1
                      439 pobj    object of preposition                    1
                      443 prep    prepositional modifier                   1
      8110129090154140942 case    case marking                             1
      8206900633647566924 ROOT    root                                     1
