In [1]:
#CheatSheet from BlackBox 

In [2]:
import spacy 

nlp = spacy.load("en_core_web_sm")
doc = nlp('Apple is looking at buying a U.K. startup for $1 billion')

for token in doc:
    print(token.text)

Apple
is
looking
at
buying
a
U.K.
startup
for
$
1
billion


In [3]:
#Find occurences of a term in a document

for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [4]:
#Understating POS Tagging

for token in doc:
    print(token.text, token.pos_, token.tag_,token.dep_,token.shape_,token.is_alpha, token.is_stop)

Apple PROPN NNP nsubj Xxxxx True False
is AUX VBZ aux xx True True
looking VERB VBG ROOT xxxx True False
at ADP IN prep xx True True
buying VERB VBG pcomp xxxx True False
a DET DT det x True True
U.K. PROPN NNP compound X.X. False False
startup NOUN NN dobj xxxx True False
for ADP IN prep xxx True True
$ SYM $ quantmod $ False False
1 NUM CD compound d False False
billion NUM CD pobj xxxx True False


In [5]:
#Extracting Named Entities 

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 29 33 GPE
$1 billion 46 56 MONEY


In [6]:
#Navigating and Searching Semantic Dependencies:

for token in doc: 
    print(token.text, token.dep_, token.head.text, token.head.pos_)


Apple nsubj looking VERB
is aux looking VERB
looking ROOT looking VERB
at prep looking VERB
buying pcomp at ADP
a det startup NOUN
U.K. compound startup NOUN
startup dobj buying VERB
for prep startup NOUN
$ quantmod billion NUM
1 compound billion NUM
billion pobj for ADP


In [7]:
#Word Vector Representations

print(doc[0].vector_norm)
print(doc[0].similarity(doc[1]))
print(doc[0].orth)

8.649502
0.03433087095618248
6418411030699964375


  print(doc[0].similarity(doc[1]))


In [8]:
#Parsing and Extracting Information

#Noun phrases
for np in doc.noun_chunks:
    print(np.text)


#Subject, Verb, Object(SVO) triples
for token in doc:
    if token.dep_ =='nsubj':
        print("Subject :", token.text)
    elif token.dep_ == 'dobj':
        print("Object : ", token.text)

#Token attributes

print([(token.text, token.tag_,token.head.text, token.dep_) for token in doc])                        

    
        

Apple
a U.K. startup
Subject : Apple
Object :  startup
[('Apple', 'NNP', 'looking', 'nsubj'), ('is', 'VBZ', 'looking', 'aux'), ('looking', 'VBG', 'looking', 'ROOT'), ('at', 'IN', 'looking', 'prep'), ('buying', 'VBG', 'at', 'pcomp'), ('a', 'DT', 'startup', 'det'), ('U.K.', 'NNP', 'startup', 'compound'), ('startup', 'NN', 'buying', 'dobj'), ('for', 'IN', 'startup', 'prep'), ('$', '$', 'billion', 'quantmod'), ('1', 'CD', 'billion', 'compound'), ('billion', 'CD', 'for', 'pobj')]


In [9]:
#Syntactic Parsing

for token in doc:
    print(token.text, token.head.text, token.dep_, token.children)

Apple looking nsubj <generator object at 0x000001B74E961AB0>
is looking aux <generator object at 0x000001B74E961AB0>
looking looking ROOT <generator object at 0x000001B74E961AB0>
at looking prep <generator object at 0x000001B74E961AB0>
buying at pcomp <generator object at 0x000001B74E961AB0>
a startup det <generator object at 0x000001B74E961AB0>
U.K. startup compound <generator object at 0x000001B74E961AB0>
startup buying dobj <generator object at 0x000001B74E961AB0>
for startup prep <generator object at 0x000001B74E961AB0>
$ billion quantmod <generator object at 0x000001B74E961AB0>
1 billion compound <generator object at 0x000001B74E961AB0>
billion for pobj <generator object at 0x000001B74E961AB0>
