In [2]:
import spacy
nlp = spacy.load('en_core_web_md')

In [10]:
doc = nlp("Alicia and me went to the school by bus.")
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.pos_),
    spacy.explain(token.tag_))

Alicia PROPN NNP proper noun noun, proper singular
and CCONJ CC coordinating conjunction conjunction, coordinating
me PRON PRP pronoun pronoun, personal
went VERB VBD verb verb, past tense
to ADP IN adposition conjunction, subordinating or preposition
the DET DT determiner determiner
school NOUN NN noun noun, singular or mass
by ADP IN adposition conjunction, subordinating or preposition
bus NOUN NN noun noun, singular or mass
. PUNCT . punctuation punctuation mark, sentence closer


In [12]:
doc = nlp("My friend will fly to the New York fast and she is staying there for 3 days.")
for token in doc:
    print( token.text, token.pos_, token.tag_,
         spacy.explain(token.pos_), spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
friend NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fly VERB VB verb verb, base form
to ADP IN adposition conjunction, subordinating or preposition
the DET DT determiner determiner
New PROPN NNP proper noun noun, proper singular
York PROPN NNP proper noun noun, proper singular
fast ADV RB adverb adverb
and CCONJ CC coordinating conjunction conjunction, coordinating
she PRON PRP pronoun pronoun, personal
is AUX VBZ auxiliary verb, 3rd person singular present
staying VERB VBG verb verb, gerund or present participle
there ADV RB adverb adverb
for ADP IN adposition conjunction, subordinating or preposition
3 NUM CD numeral cardinal number
days NOUN NNS noun noun, plural
. PUNCT . punctuation punctuation mark, sentence closer


In [13]:
doc = nlp("I will ship the package tomorrow.")
for token in doc:
    print( token.text, token.tag_, spacy.explain(token.tag_))

I PRP pronoun, personal
will MD verb, modal auxiliary
ship VB verb, base form
the DT determiner
package NN noun, singular or mass
tomorrow NN noun, singular or mass
. . punctuation mark, sentence closer


In [14]:
doc = nlp("I saw a red ship.")
for token in doc:
    print(token.text, token.tag_, spacy.explain(token.tag_))

I PRP pronoun, personal
saw VBD verb, past tense
a DT determiner
red JJ adjective (English), other noun-modifier (Chinese)
ship NN noun, singular or mass
. . punctuation mark, sentence closer


In [16]:
doc = nlp("My cat will fish for a fish tomorrow in a fishy way.")
for token in doc:
    print(token.text, token.pos_, token.tag_,
         spacy.explain(token.pos_), spacy.explain(token.tag_))

My PRON PRP$ pronoun pronoun, possessive
cat NOUN NN noun noun, singular or mass
will AUX MD auxiliary verb, modal auxiliary
fish VERB VB verb verb, base form
for ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fish NOUN NN noun noun, singular or mass
tomorrow NOUN NN noun noun, singular or mass
in ADP IN adposition conjunction, subordinating or preposition
a DET DT determiner determiner
fishy ADJ JJ adjective adjective (English), other noun-modifier (Chinese)
way NOUN NN noun noun, singular or mass
. PUNCT . punctuation punctuation mark, sentence closer


Word-sense Disambiguation

In [19]:
sent1 = "I flew to Rome"
sent2 = "I'm flying to Rome."
sent3 = "I will fly to Rome."
doc1 = nlp(sent1)
doc2 = nlp(sent2)
doc3 = nlp(sent3)
for doc in [doc1, doc2, doc3]:
    print([(w.text, w.lemma_) for w in doc if
         w.tag_ == 'VBG' or w.tag_ == 'VB'])

[]
[('flying', 'fly')]
[('fly', 'fly')]


In [None]:
Understanding number, symbol and punctuation tags

In [23]:
doc = nlp("He earned $5.5 million in 2020 and paid 35 percent tax.")
for token in doc:
    print(token.text, token.tag_, spacy.explain(token.tag_))

He PRP pronoun, personal
earned VBD verb, past tense
$ $ symbol, currency
5.5 CD cardinal number
million CD cardinal number
in IN conjunction, subordinating or preposition
2020 CD cardinal number
and CC conjunction, coordinating
paid VBD verb, past tense
35 CD cardinal number
percent NN noun, singular or mass
tax NN noun, singular or mass
. . punctuation mark, sentence closer


Introduction to depandency parsing

In [24]:
doc = nlp("blue flower")
for token in doc:
    print(token.text, token.dep_)

blue amod
flower ROOT


In [26]:
doc = nlp("I counted white sheep.")
for token in doc:
    print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))

I PRON nsubj nominal subject
counted VERB ROOT None
white ADJ amod adjectival modifier
sheep NOUN dobj direct object
. PUNCT punct punctuation


In [29]:
doc = nlp("I counted white sheep.")
for token in doc:
    print(token.text, token.tag_, token.dep_, token.head)

I PRP nsubj counted
counted VBD ROOT counted
white JJ amod sheep
sheep NNS dobj counted
. . punct counted


In [31]:
doc = nlp("we are trying to understand the difference.")
for token in doc:
    print(token.text, token.tag_, token.dep_, token.head)

we PRP nsubj trying
are VBP aux trying
trying VBG ROOT trying
to TO aux understand
understand VB xcomp trying
the DT det difference
difference NN dobj understand
. . punct trying


In [32]:
doc = nlp("Queen Katherine, who was the mother of Mary Tudor, died at 1536.")
for token in doc:
    print(token.text, token.tag_, token.dep_, token.head)

Queen NNP compound Katherine
Katherine NNP nsubj died
, , punct Katherine
who WP nsubj was
was VBD relcl Katherine
the DT det mother
mother NN attr was
of IN prep mother
Mary NNP compound Tudor
Tudor NNP pobj of
, , punct Katherine
died VBD ROOT died
at IN prep died
1536 CD pobj at
. . punct died


In [None]:
Introducing NER

In [33]:
doc = nlp("The president Biden visited France.")
doc.ents

(Biden, France)

In [34]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [35]:
doc = nlp("He worked for NASA.")
token = doc[3]
token.ent_type_, spacy.explain(token.ent_type_)

('ORG', 'Companies, agencies, institutions, etc.')

In [37]:
doc = nlp("Albert Einstein was born in Ulm on 1879. He studied electronical engineering at ETH Zurich.")

for token in doc:
    print(token.text, token.ent_type_, spacy.explain(token.ent_type_))

Albert PERSON People, including fictional
Einstein PERSON People, including fictional
was  None
born  None
in  None
Ulm GPE Countries, cities, states
on  None
1879 DATE Absolute or relative dates or periods
.  None
He  None
studied  None
electronical  None
engineering  None
at  None
ETH ORG Companies, agencies, institutions, etc.
Zurich ORG Companies, agencies, institutions, etc.
.  None


In [38]:
doc = nlp("Jean-Michel Basquiat was an American artist of Haitian and Puerto Rican descent who gained fame with his graffiti and street art work.")
doc.ents

(Jean-Michel Basquiat, American, Haitian, Puerto Rican)

In [39]:
for ent in doc.ents:
    print(ent, ent.label_, spacy.explain(ent.label_))

Jean-Michel Basquiat PERSON People, including fictional
American NORP Nationalities or religious or political groups
Haitian NORP Nationalities or religious or political groups
Puerto Rican NORP Nationalities or religious or political groups


A real world Example

In [3]:
from bs4 import BeautifulSoup
import requests
import spacy
import html5lib

In [None]:
!

In [5]:
def url_text(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    text = soup.get_text()
    return " ".join(text.split())

ny_art = url_text("https://www.nytimes.com/2021/01/12/opinion/trump-america-allies.html")

In [6]:
nlp = spacy.load('en_core_web_md')
doc = nlp(ny_art)

In [7]:
len(doc.ents)

122

In [8]:
from collections import Counter

labels = [ent.label_ for ent in doc.ents]
Counter(labels)

Counter({'GPE': 37,
         'ORG': 15,
         'DATE': 13,
         'PERSON': 23,
         'LANGUAGE': 1,
         'NORP': 22,
         'FAC': 3,
         'EVENT': 1,
         'TIME': 1,
         'CARDINAL': 2,
         'LOC': 2,
         'WORK_OF_ART': 2})

In [9]:
items = [ent.text for ent in doc.ents]
Counter(items).most_common(10)

[('America', 14),
 ('American', 7),
 ('China', 6),
 ('Trump', 5),
 ('Biden', 5),
 ('Capitol', 4),
 ('the United States', 3),
 ('Washington', 3),
 ('Europeans', 3),
 ('Americans', 3)]

In [10]:
print(doc.ents)

(America, contentSkip, America, U.S., U.S., Jan. 12, Rome, Donald Tramp, Thursday, Andrew Medichini, Krastev, Trump, America, America, Russian, Chinese, Iranian, Jan. 6, Capitol, Ukraine, Georgia, American, American, the United States, Trump, American, Congress, Civil War, 19th-century, German, Otto von Bismarck, the United States of America, America, Capitol, Trump, his last hours, American, American, Washington, Washington, Capitol, America, just weeks, America, Russia, at least 10, Four years, Trump, Joe Biden, two, American-European, China, Biden, America, Recep Tayyip Erdogan, Turkey, Jair Bolsonaro, Brazil, Washington, Russia, China, Sullivan, Biden, trans-Atlantic, China, Just a week ago, European, Sullivan, Europe, America, China, Biden, Europeans, China, German, Chinese, the European Union’s, America, Christophe Ena, the European Council on Foreign Relations, the weeks, American, the day, Biden, Europeans, America, the next 10 years, China, the United States, Germans, Trump, A

In [11]:
#visualize
from spacy import displacy
displacy.render(doc, style = "ent", jupyter = True)

Marging and splitting tokens

In [12]:
doc = nlp("She live in New Hampshire.")
doc.ents

(New Hampshire,)

In [13]:
[(token.text, token.i) for token in doc]

[('She', 0), ('live', 1), ('in', 2), ('New', 3), ('Hampshire', 4), ('.', 5)]

In [14]:
len(doc)

6

In [22]:
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new hampshire"}) #attrs is a attributes of token like lemma, pos, tag, ent_type


In [23]:
[(token.text, token.i) for token in doc]

[('She', 0), ('live', 1), ('in', 2), ('New Hampshire.', 3)]

In [24]:
len(doc)

4

In [25]:
doc.ents

(New Hampshire.,)

In [26]:
[(token.lemma_) for token in doc]

['she', 'live', 'in', 'new hampshire']

In [27]:
doc = nlp("She lived in NewHampshire.")
len(doc)

5

In [28]:
[(token.text, token.lemma_, token.i) for token in doc]

[('She', 'she', 0),
 ('lived', 'live', 1),
 ('in', 'in', 2),
 ('NewHampshire', 'NewHampshire', 3),
 ('.', '.', 4)]

In [33]:
for token in doc:
    print([token.text, token.pos_, token.tag_, token.dep_])

['She', 'PRON', 'PRP', 'nsubj']
['lived', 'VERB', 'VBD', 'ROOT']
['in', 'ADP', 'IN', 'prep']
['NewHampshire', 'PROPN', 'NNP', 'pobj']
['.', 'PUNCT', '.', 'punct']


In [36]:
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"TAG":["NNP", "NNP"],
            "DEP": ["compound", "pobj"]}
    retokenizer.split(doc[3], ["New", "Hampshire"], heads=heads, attrs=attrs)

In [37]:
[(token.text, token.lemma_, token.i) for token in doc]

[('She', 'she', 0),
 ('lived', 'live', 1),
 ('in', 'in', 2),
 ('New', 'New', 3),
 ('Hampshire', 'Hampshire', 4),
 ('.', '.', 5)]

In [38]:
for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_)

She PRON PRP nsubj
lived VERB VBD ROOT
in ADP IN prep
New PROPN NNP compound
Hampshire PUNCT NNP pobj
. PUNCT . punct
