In [1]:
import spacy
import en_core_web_sm

In [2]:
nlp= en_core_web_sm.load()

In [3]:
#create a document object
doc=nlp(u'tesla is looking at buying US startup for $6 millions')

In [4]:
for token in doc:
    print(token.text)

tesla
is
looking
at
buying
US
startup
for
$
6
millions


In [5]:
for token in doc:
    print(token.text,token.pos)   #show part of the speech

tesla 96
is 87
looking 100
at 85
buying 100
US 96
startup 92
for 85
$ 99
6 93
millions 92


In [6]:
for token in doc:
    print(token.text,token.pos_)  #展示词性

tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
US PROPN
startup NOUN
for ADP
$ SYM
6 NUM
millions NOUN


In [7]:
for token in doc:
    print(token.text,token.pos_,token.dep_)  #sytactic dependency

tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
US PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM nummod
millions NOUN pobj


In [8]:
#pipeline including the tagging,parsering and processing the data
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1211d5c50>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x12138b0a8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x12138b108>)]

In [9]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [10]:
doc2=nlp(u"tesla isn't looking into startup anymore.")

In [11]:
for token in doc2:
    print(token.text,token.pos_,token.dep_)

tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startup NOUN pobj
anymore ADV advmod
. PUNCT punct


In [12]:
doc[0]

tesla

In [13]:
doc[0].pos_  # the property of dependencies 

'PROPN'

In [14]:
doc4=nlp(u"This is the first sentence. This is another sentence. This is the last sentence!")

In [15]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence!


In [16]:
doc4[6]

This

In [17]:
doc4[6].is_sent_start

True

In [18]:
doc4[8].is_sent_start  #return none, not return false

### tokenization
tokenization is a process of breaking up the original text into component pieces(tokens)

In [19]:
mystring=' "we\'re moving to L.A. !" '
mystring

' "we\'re moving to L.A. !" '

In [20]:
print(mystring)

 "we're moving to L.A. !" 


In [21]:
doc=nlp(mystring)

In [22]:
for token in doc:
    print(token.text)

 
"
we
're
moving
to
L.A.
!
"


In [23]:
doc3=nlp(u'we\'re here to help ! send snail-email, email support@oursite.com or visit us at http://oursite.com !')

In [24]:
for token in doc3:
    print(token.text)

we
're
here
to
help
!
send
snail
-
email
,
email
support@oursite.com
or
visit
us
at
http://oursite.com
!


In [25]:
len(doc)

9

##### vocab
Vocab is a python package that provides vocabulary objects for natural language processing.

In [26]:
doc3.vocab

<spacy.vocab.Vocab at 0x120077bc8>

In [27]:
len(doc3.vocab)

513

##### name entities
tell what is it from the name of the word 

In [28]:
doc6=nlp(u'Apple to build a Hong Kong factory for $6 million')

In [29]:
for token in doc6:
    print(token.text,end=' - ')

Apple - to - build - a - Hong - Kong - factory - for - $ - 6 - million - 

In [30]:
for token in doc6:
    print(token,end=' - ')  #it seems it is the same token and token.text

Apple - to - build - a - Hong - Kong - factory - for - $ - 6 - million - 

In [31]:
for entity in doc6.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))  #print the explanation for the label
    print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




##### noun_chunks

In [32]:
doc7=nlp(u'Autonomous cars shift insurance liability toward manufacturers ')

In [33]:
for chunk in doc7.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


### visualize the tokens

In [34]:
from spacy import displacy

In [35]:
doc8=nlp(u'Apple is going to build a U.K factory for $6 million ')

In [36]:
displacy.render(doc8,style='dep',jupyter=True,options={'distance':80})

In [37]:
displacy.render(doc8,style='ent',jupyter=True)

### stemming
for example, boat, boating,boater and boats
spacy does not include stemmizationg, but lemmazation

In [38]:
import nltk

In [39]:
from nltk.stem.porter import PorterStemmer

In [40]:
p_stemmer= PorterStemmer()

### lemmatization

In [41]:
doc10=nlp(u'i am a runner running in a race because i love to run since i ran today')

In [43]:
for token in doc10:
    print(token.text,'\t',token.pos_,'\t',token.lemma,'\t',token.lemma_)  #token.lemma will give you the hash code

i 	 PRON 	 5097672513440128799 	 i
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
i 	 PRON 	 5097672513440128799 	 i
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
i 	 PRON 	 5097672513440128799 	 i
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [55]:
#create a function to let it look nicely: formatting 
def show_lemma(text):
    for token in text:
        print(f"{token.text:{12}} {token.pos_:{6}} {token.lemma:{22}} {token.lemma_}")

In [56]:
doc12=nlp(u'i saw ten mice today')

In [57]:
 show_lemma(doc12)

i            PRON      5097672513440128799 i
saw          VERB     11925638236994514241 see
ten          NUM       7970704286052693043 ten
mice         NOUN      1384165645700560590 mouse
today        NOUN     11042482332948150395 today


### stop words

In [58]:
print(nlp.Defaults.stop_words)

{'here', 'am', 'ever', 'last', 'own', 'now', 'show', 'whether', 'almost', 'again', 'why', 'next', 'its', 'keep', 'of', 'i', 'in', 'both', 'seeming', 'toward', 'among', 'might', 'further', 'his', 'our', 'empty', 'beforehand', 'be', 'four', 'while', 'either', 'really', 'whereby', 'seems', 'therefore', 'already', 'whereupon', 'call', 'must', 'former', 'us', 'we', 'another', 'after', 'none', 'behind', 'hereafter', "'ve", 'were', 'forty', '’s', 'fifteen', 'up', 'become', 'who', 'or', 'at', 'someone', 'they', 'meanwhile', 'been', '‘s', 'about', 'amount', 'until', 'himself', 'amongst', 'will', 'make', 'everything', 'front', 'nowhere', 'sometimes', 'thereupon', 'thus', 'five', 'towards', 'before', 'however', 'neither', 'if', 'thence', 'just', 'much', 'this', 'yet', 'their', 'most', 'eleven', '’ve', 'for', 'upon', 'seemed', 'anyone', 'regarding', 'using', 'when', 'less', "n't", 'also', 'too', 'every', 'between', 'onto', "'d", "'re", 'was', 'give', 'three', 'became', 'should', 'name', 'then', 's

In [59]:
len(nlp.Defaults.stop_words)  #there are 326 stop words 

326

In [60]:
nlp.vocab['mystery'].is_stop

False

In [61]:
nlp.vocab['is'].is_stop

True

##### add stop words in your dataset

In [62]:
nlp.Defaults.stop_words.add('btw')

In [63]:
nlp.vocab['btw'].is_stop

True

In [64]:
len(nlp.Defaults.stop_words)

327

##### remove a stop word

In [65]:
nlp.Defaults.stop_words.remove('someone')

In [66]:
nlp.vocab['someone'].is_stop

False

In [67]:
len(nlp.Defaults.stop_words)

326

### vacabulary and matching with spacy
we can think of this as a more powerful version of regular expression

In [68]:
from spacy.matcher import Matcher

In [70]:
matcher=Matcher(nlp.vocab)

In [71]:
#create a pattern that we want to match on
#SolarPower 
#Solar-power
#Solar power
pattern1=[{'LOWER':'solarpower'}]
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3=[{'LOWER':'solar'},{'LOWER':'power'}]

In [72]:
#besides lemma, there is a bunch of attributes we can use to determine matching rules

In [75]:
matcher.add('SolarPower',None,pattern1,pattern2,pattern3)  #it means that is solarpower is in the next three patterns, it can be identifed as solar power 

In [76]:
doc12=nlp(u'The Solar Power industry cintinuously grows as solarpower increases. Solar-power is ')

In [80]:
found_matches=matcher(doc12)

In [81]:
for match_id,start,end in found_matches:
    string_id=nlp.vocab.strings[match_id]
    span=doc12[start:end]
    print(match_id,string_id,start,end,span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 7 8 solarpower
8656102463236116519 SolarPower 10 13 Solar-power


##### remove the set of patterns

In [83]:
matcher.remove('SolarPower')

In [None]:
pattern1=[{'LOWER':'solarpower'}]
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True},{'OP':'*'}]

In [85]:
matcher.add('SolarPower',None,pattern1,pattern2)

In [86]:
doc13=nlp(u"Solar--power is solar power yah")

In [87]:
found_matches=matcher(doc13)

In [88]:
for match_id,start,end in found_matches:
    string_id=nlp.vocab.strings[match_id]
    span=doc13[start:end]
    print(match_id,string_id,start,end,span.text)

8656102463236116519 SolarPower 0 3 Solar--power
