In [1]:
# Widen the notebook container so wide tables and visualizations fit on screen.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))
# NOTE(review): star import kept unchanged; no name from it is visibly used in this notebook — confirm before removing.
from simple_colors import *

### <a href='https://spacy.io/models' style='text-decoration:none'>Spacy model</a>
<a href='https://spacy.io/usage/models' style='text-decoration:none'>A list of models</a> <br>
A spaCy model consists of three components: 
- The weights, i.e. binary data loaded in from a directory
- A pipeline of functions called in order
- Language data like the tokenization rules and annotation scheme

In [2]:
# Environment setup: make spaCy importable, print its version, and import the
# table / iframe helpers used by later cells.
# The two commands below are one-off reinstall steps kept for reference.
# !python -m pip uninstall -y spacy
# !conda install spacy --yes          #fix issue 'No module named 'spacy.symbols'', https://github.com/explosion/spaCy/issues/3791
import sys
# NOTE(review): list.insert(-1, ...) puts the path just BEFORE the last entry of
# sys.path, not at the very end. The path is also specific to one machine and
# must be adapted (or removed) in any other environment.
sys.path.insert(-1, '/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.7/site-packages') 
import spacy   
# The saved output of this cell shows version 2.3.5.
print(spacy.__version__)
from prettytable import PrettyTable
from IPython.display import IFrame

2.3.5


#### Load model

When loading pretrained models: 
- The model’s meta.json tells spaCy to use the language "en" and the pipeline ["tagger", "parser", "ner"]. 
- spaCy initializes a spacy.lang.en.English instance, creates each pipeline component, and adds it to the processing pipeline. 
- It’ll then load in the model’s data and return the modified Language class as the nlp object.

In [3]:
# !python -m spacy download en 
# from spacy.lang.en import English
# nlp = English()

# !python -m spacy download en_core_web_sm     # download model
# import en_core_web_sm                        # a small English model trained on written web text 
# nlp = en_core_web_sm.load()                  # load pretrained models 

#!python -m spacy download en_core_web_lg 
import en_core_web_lg                         
nlp = en_core_web_lg.load()

In [4]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

#### Process text
- Tokenize 
- Each pipeline component accesses the model to assign annotations to the Doc object

In [5]:
doc = nlp("This is a text")                  

In [6]:
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."

doc = nlp(text)
for sent in doc.sents:
    print(sent.start, sent.end)

0 13
13 33
33 61
61 91


##### Process large volumes of text

In [7]:
# To process large volumes of text
texts = ["This is a text", "These are lots of texts", "..."]  
docs = list(nlp.pipe(texts))                    # much faster than docs = [nlp(text) for text in texts] 

##### Process text with context

In [8]:
# Each item pairs a text with a context dict carrying that text's metadata.
data = [
    ("This is a text", dict(id=1, page_number=15)),
    ("And another text", dict(id=2, page_number=16)),
]

# With as_tuples=True, nlp.pipe yields (doc, context) pairs.
for doc, context in nlp.pipe(data, as_tuples=True):
    doc_id, page = context["id"], context["page_number"]
    # Show the text next to the metadata that travelled with it
    print(doc.text, doc_id, page)

This is a text 1 15
And another text 2 16


In [9]:
from spacy.tokens import Doc

# Register the custom attributes that will hold each document's metadata.
# force=True overwrites an existing registration, so re-running this cell does
# not fail because the extension was already set by a previous run.
Doc.set_extension("id", default=None, force=True)
Doc.set_extension("page_number", default=None, force=True)

# (text, context) pairs; the context dict carries the metadata for that text
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    # Set the attributes from the context
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]

    # Print the text and custom attribute data
    print(doc.text, doc._.id, doc._.page_number)

This is a text 1 15
And another text 2 16


### <a href='https://spacy.io/usage/linguistic-features' style='text-decoration:none'>Linguistic Features</a>

#### Doc, Token,  Span, and Sents

##### Doc

In [10]:
doc = nlp("Lily ate the pizza this afternoon.")  # Process the text: Tokenize and apply pipeline components 

In [11]:
IFrame('https://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg', width=600, height=150) 

In [12]:
# Only tokenize and return a doc object
doc = nlp.make_doc("Lily ate the pizza this afternoon.")

In [13]:
# Build a Doc by hand from pre-split words instead of running the tokenizer.
words = "Lily ate the pizza this afternoon .".split()
# One flag per word: is the word followed by a space?
# True for the first five words, False for "afternoon" and ".".
spaces = [True] * 5 + [False] * 2
doc = Doc(nlp.vocab, words=words, spaces=spaces)

##### Token
Token and Span objects are created lazily, and don’t own any data. 

In [14]:
for token in doc:
    print(token.i, "th token: ", token.text)

0 th token:  Lily
1 th token:  ate
2 th token:  the
3 th token:  pizza
4 th token:  this
5 th token:  afternoon
6 th token:  .


##### Span
A Span is a view of the Doc; it does not contain any data itself. 

In [14]:
# Two ways to create a span
from spacy.tokens import Span
span = Span(doc, 2, 4)
print(span.text)

span = doc[2:4]      
print(span.text)
print(span[1].text)

the pizza
the pizza
pizza


In [15]:
IFrame('https://pasteboard.co/JtQDWNr.png', width=400, height=230)

In [16]:
# Build a labelled span over tokens 2-3 and show its text and label.
span = Span(doc, 2, 4, label="Food")
print(span.text, span.label_)

# Register the span as an entity, keeping the entities that are already set.
doc.ents = [*doc.ents, span]
# Show every entity as a (text, label) pair.
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

the pizza Food
[('the pizza', 'Food')]


In [17]:
# merge span and retokenize
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
    
for token in doc:
    print(token.i, "th token: ", token.text)

0 th token:  Lily
1 th token:  ate
2 th token:  the pizza
3 th token:  this
4 th token:  afternoon
5 th token:  .


##### Sentences

In [18]:
text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
doc = nlp(text)
list(doc.sents)   # A list of sentences

[In ancient Rome, some neighbors live in three adjacent houses.,
 In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus.,
 A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom.,
 One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates).,
 One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero.,
 Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin).]

#### <a href='https://spacy.io/api/token#attributes' style='text-decoration:none'>Token attributes</a>

In [19]:
# doc = nlp("Credit and mortgage account holders must submit their requests.")  
doc = nlp("Alphabet, Facebook, Apple and Amazon reported a combined $28 billion in profits on Thursday.")

In [20]:
# Column legend (table header -> Token attribute):
#   TEXT       text           original word text
#   LEMMA      lemma_         base form of the word
#   SHAPE      shape_         word shape: capitalization, punctuation, digits
#   LOWER      lower_         lowercase form, equivalent to token.text.lower()
#   IDX        idx            character offset of the token
#   SENT START is_sent_start  whether the token starts a sentence
#   TITLECASE  is_title       titlecase? equivalent to token.text.istitle()
#   ALPHA      is_alpha       is the token an alpha character?
#   NUM        like_num       does it represent a number, e.g. "10.9", "10", "ten"
#   PUNCT      is_punct       is the token punctuation?
#   STOP       is_stop        is the token part of a stop list?
# (Sentiment - positivity or negativity of the token - is not tabulated here.)
TOKEN_COLUMNS = [
    ("TEXT", "text"),
    ("LEMMA", "lemma_"),
    ("SHAPE", "shape_"),
    ("LOWER", "lower_"),
    ("IDX", "idx"),
    ("SENT START", "is_sent_start"),
    ("TITLECASE", "is_title"),
    ("ALPHA", "is_alpha"),
    ("NUM", "like_num"),
    ("PUNCT", "is_punct"),
    ("STOP", "is_stop"),
]
table = PrettyTable([header for header, attr in TOKEN_COLUMNS])
for token in doc:
    table.add_row([getattr(token, attr) for header, attr in TOKEN_COLUMNS])
print(table)

+----------+----------+-------+----------+-----+------------+-----------+-------+-------+-------+-------+
|   TEXT   |  LEMMA   | SHAPE |  LOWER   | IDX | SENT START | TITLECASE | ALPHA |  NUM  | PUNCT |  STOP |
+----------+----------+-------+----------+-----+------------+-----------+-------+-------+-------+-------+
| Alphabet | Alphabet | Xxxxx | alphabet |  0  |    True    |    True   |  True | False | False | False |
|    ,     |    ,     |   ,   |    ,     |  8  |    None    |   False   | False | False |  True | False |
| Facebook | Facebook | Xxxxx | facebook |  10 |    None    |    True   |  True | False | False | False |
|    ,     |    ,     |   ,   |    ,     |  18 |    None    |   False   | False | False |  True | False |
|  Apple   |  Apple   | Xxxxx |  apple   |  20 |    None    |    True   |  True | False | False | False |
|   and    |   and    |  xxx  |   and    |  26 |    None    |   False   |  True | False | False |  True |
|  Amazon  |  Amazon  | Xxxxx |  amazon  |  30

##### Part-of-speech tag
POS: <a href='https://spacy.io/api/annotation#pos-universal' style='text-decoration:none'>Universal Part-of-speech Tags</a> (UPOS) <br>
Tag: <a href='https://spacy.io/api/annotation#pos-en' style='text-decoration:none'>OntoNotes 5 version of the Penn Treebank tag set</a>: more detail, including info of MORPHOLOGY

In [21]:
# POS is the simple (coarse) part-of-speech tag, TAG the detailed one.
table = PrettyTable(["TEXT", "POS", "TAG"])
for token in doc:
    row = [token.text, token.pos_, token.tag_]
    table.add_row(row)
print(table)

+----------+-------+-----+
|   TEXT   |  POS  | TAG |
+----------+-------+-----+
| Alphabet | PROPN | NNP |
|    ,     | PUNCT |  ,  |
| Facebook | PROPN | NNP |
|    ,     | PUNCT |  ,  |
|  Apple   | PROPN | NNP |
|   and    | CCONJ |  CC |
|  Amazon  | PROPN | NNP |
| reported |  VERB | VBD |
|    a     |  DET  |  DT |
| combined |  ADJ  |  JJ |
|    $     |  SYM  |  $  |
|    28    |  NUM  |  CD |
| billion  |  NUM  |  CD |
|    in    |  ADP  |  IN |
| profits  |  NOUN | NNS |
|    on    |  ADP  |  IN |
| Thursday | PROPN | NNP |
|    .     | PUNCT |  .  |
+----------+-------+-----+


##### Syntactic dependency
<a href='https://spacy.io/api/annotation#dependency-parsing-universal' style='text-decoration:none'>Universal Dependency Labels</a> <br>
<a href='https://spacy.io/api/annotation#dependency-parsing-english' style='text-decoration:none'>English Dependency Labels</a>, used by the *en_core_web_sm* model

In [22]:
# Column legend:
#   DEP       - dependency relation between the token and its head;
#               ROOT means the token is its own head
#   HEAD      - syntactic head; every word has exactly one
#   ANCESTORS - the token's direct and indirect ancestors
table = PrettyTable(["TEXT", "DEP", "HEAD", "ANCESTORS"])
for token in doc:
    ancestor_texts = ','.join(ancestor.text for ancestor in token.ancestors)
    table.add_row([token.text, token.dep_, token.head.text, ancestor_texts])
print(table)

+----------+----------+----------+----------------------------------+
|   TEXT   |   DEP    |   HEAD   |            ANCESTORS             |
+----------+----------+----------+----------------------------------+
| Alphabet |  nsubj   | reported |             reported             |
|    ,     |  punct   | Alphabet |        Alphabet,reported         |
| Facebook | npadvmod | Alphabet |        Alphabet,reported         |
|    ,     |  punct   | Facebook |    Facebook,Alphabet,reported    |
|  Apple   |   conj   | Facebook |    Facebook,Alphabet,reported    |
|   and    |    cc    |  Apple   | Apple,Facebook,Alphabet,reported |
|  Amazon  |   conj   |  Apple   | Apple,Facebook,Alphabet,reported |
| reported |   ROOT   | reported |                                  |
|    a     |   det    | billion  |         billion,reported         |
| combined |   amod   | billion  |         billion,reported         |
|    $     | quantmod | billion  |         billion,reported         |
|    28    | compoun

In [23]:
# Column legend:
#   CHILDREN       - the token's immediate syntactic dependents
#   LEFT CHILDREN  - immediate dependents that occur before the token
#   RIGHT CHILDREN - immediate dependents that occur after the token
#   LEFT EDGE      - first token of the token's subtree
#   RIGHT EDGE     - last token of the token's subtree
# (SUBTREE - the token plus all its direct and indirect dependents, in the
#  original order - is left out of this table.)
def join_texts(tokens):
    """Comma-join the text of every token in `tokens`."""
    return ','.join(tok.text for tok in tokens)


table = PrettyTable(["TEXT", "CHILDREN", "LEFT CHILDREN", "RIGHT CHILDREN", "LEFT EDGE", "RIGHT EDGE"])
for token in doc:
    table.add_row([
        token.text,
        join_texts(token.children),
        join_texts(token.lefts),
        join_texts(token.rights),
        token.left_edge.text,
        token.right_edge.text,
    ])
print(table)

+------------+-----------------------+---------------+----------------+------------+------------+
|    TEXT    |        CHILDREN       | LEFT CHILDREN | RIGHT CHILDREN | LEFT EDGE  | RIGHT EDGE |
+------------+-----------------------+---------------+----------------+------------+------------+
|   Which    |                       |               |                |   Which    |   Which    |
|   plant    |                       |               |                |   plant    |   plant    |
|   genus    |      Which,plant      |  Which,plant  |                |   Which    |   genus    |
|     is     |      genus,part,?     |     genus     |     part,?     |   Which    |     ?      |
|    part    |           of          |               |       of       |    part    | Rothmannia |
|     of     |         family        |               |     family     |     of     | Rothmannia |
|     a      |                       |               |                |     a      |     a      |
|   family   |      

In [21]:
# One row per token: its text and the comma-joined texts of its subtree.
table = PrettyTable(["TEXT", "SUBTREE"])
for token in doc:
    subtree_texts = ','.join(node.text for node in token.subtree)
    table.add_row([token.text, subtree_texts])
print(table)

+------------+------------------------------------------------------------------------------------------+
|    TEXT    |                                         SUBTREE                                          |
+------------+------------------------------------------------------------------------------------------+
|   Which    |                                          Which                                           |
|   plant    |                                          plant                                           |
|   genus    |                                    Which,plant,genus                                     |
|     is     | Which,plant,genus,is,part,of,a,family,with,a,longer,name,,,Morning,glory,or,Rothmannia,? |
|    part    |            part,of,a,family,with,a,longer,name,,,Morning,glory,or,Rothmannia             |
|     of     |               of,a,family,with,a,longer,name,,,Morning,glory,or,Rothmannia               |
|     a      |                                

In [22]:
# Get the definition of common dependency labels and POS tags.
# The printed prefix is derived from the key that is looked up, so a label can
# never disagree with its description (PROPN used to be printed as "pron: ").
for label in ("amod", "prep", "pobj", "pcomp", "PRON", "PROPN"):
    print(label.lower() + ": ", spacy.explain(label))

amod:  adjectival modifier
prep:  prepositional modifier
pobj:  object of preposition
pcomp:  complement of preposition
pron:  pronoun
pron:  proper noun


In [25]:
# Collect every verb that has a nominal subject attached to it.
from spacy.symbols import nsubj, VERB

# Comparing the integer IDs (token.dep, token.pos) against the imported symbols
# is the same test as token.dep_ == 'nsubj' and token.head.pos_ == 'VERB'.
verbs = {
    token.head
    for token in doc
    if token.dep == nsubj and token.head.pos == VERB
}
print(verbs)

{reported}


In [26]:
# Find proper nouns that are immediately followed by a verb.
for token in doc:
    # Only proper nouns are of interest
    if token.pos_ != "PROPN":
        continue
    # The last token has no successor: doc[len(doc)] would raise IndexError
    if token.i + 1 >= len(doc):
        continue
    # Check if the next token is a verb
    if doc[token.i + 1].pos_ == "VERB":
        print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Amazon


##### Named Entity 

In [27]:
# Per-token entity annotation.
# IOB code: "B" = token begins an entity, "I" = token is inside an entity,
# "O" = token is outside any entity, "" = no entity tag is set.
table = PrettyTable(["TEXT", "Named Entity Type", "IOB Code"])
for token in doc:
    row = [token.text, token.ent_type_, token.ent_iob_]
    table.add_row(row)
print(table)

+----------+-------------------+----------+
|   TEXT   | Named Entity Type | IOB Code |
+----------+-------------------+----------+
| Alphabet |                   |    O     |
|    ,     |                   |    O     |
| Facebook |        ORG        |    B     |
|    ,     |                   |    O     |
|  Apple   |        ORG        |    B     |
|   and    |                   |    O     |
|  Amazon  |        ORG        |    B     |
| reported |                   |    O     |
|    a     |                   |    O     |
| combined |                   |    O     |
|    $     |       MONEY       |    B     |
|    28    |       MONEY       |    I     |
| billion  |       MONEY       |    I     |
|    in    |                   |    O     |
| profits  |                   |    O     |
|    on    |                   |    O     |
| Thursday |        DATE       |    B     |
|    .     |                   |    O     |
+----------+-------------------+----------+


#### <a href='https://spacy.io/usage/visualizers#dep' style='text-decoration:none'>Visualizing POS and Dependencies</a> 

In [27]:
from spacy import displacy
# displacy.serve(doc, style="dep")    # open a server when running outside of notebook
displacy.render(doc, style="dep")

In [29]:
# Get definition of common tags and labels
print("nsubj: ", spacy.explain("nsubj"))
print("dobj: ", spacy.explain("dobj"))
print("pcomp: ", spacy.explain("pcomp"))
print("pobj: ", spacy.explain("pobj"))

nsubj:  nominal subject
dobj:  direct object
pcomp:  complement of preposition
pobj:  object of preposition


In [30]:
# Visualizing long texts:  Visualize per sentence
text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
doc = nlp(text)
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="dep")     

#### <a href='https://spacy.io/usage/visualizers#ent' style='text-decoration:none'>Named Entities</a>
- Named Entities: real world objects <br>
- <a href='https://spacy.io/api/annotation#named-entities' style='text-decoration:none'>Named Entity Types</a> 

In [24]:
# Column legend (every entity in doc.ents is a Span):
#   Named Entity Label - entity label, i.e. its type
#   Start / End        - word offsets of the entity's start / end in the Doc
#   Start Char / End Char - character offsets of the entity's start / end in the Doc
ENTITY_COLUMNS = [
    ("TEXT", "text"),
    ("Named Entity Label", "label_"),
    ("Start", "start"),
    ("End", "end"),
    ("Start Char", "start_char"),
    ("End Char", "end_char"),
]
table = PrettyTable([header for header, attr in ENTITY_COLUMNS])
for entity in doc.ents:
    table.add_row([getattr(entity, attr) for header, attr in ENTITY_COLUMNS])
print(table)

+-----------------------------------------+--------------------+-------+-----+------------+----------+
|                   TEXT                  | Named Entity Label | Start | End | Start Char | End Char |
+-----------------------------------------+--------------------+-------+-----+------------+----------+
|               chris jones               |       PERSON       |   0   |  2  |     0      |    11    |
|                  welsh                  |        NORP        |   3   |  4  |     15     |    20    |
|                  cymru                  |        ORG         |   9   |  10 |     71     |    76    |
|                porthmadog               |        ORG         |   12  |  13 |     91     |   101    |
|            leeds united jones           |        ORG         |   17  |  20 |    128     |   146    |
|            his fourth season            |        DATE        |   23  |  26 |    163     |   180    |
|                  leeds                  |        GPE         |   42  | 

In [28]:
displacy.render(doc, style="ent")

In [None]:
# Add a named entity by hand: mark the first token ("fb") as an ORG.
from spacy.tokens import Span
doc = nlp("fb is hiring a new vice president of global policy")
fb_ent = Span(doc, 0, 1, label="ORG")  # span covering token 0 only
doc.ents = [*doc.ents, fb_ent]

# Summarize every entity as (text, start, end, label)
ents = []
for e in doc.ents:
    ents.append((e.text, e.start, e.end, e.label_))
print('New entity', ents)

#### Noun Phrases / Chunks 

In [26]:
# Noun chunks are flat phrases headed by a noun: the noun plus the words that
# describe it. For each chunk, show its root token, the root's dependency
# label, and the root's head.
table = PrettyTable(["TEXT", "CHUNK's ROOT", "DEP", "CHUNK's HEAD"])
for chunk in doc.noun_chunks:
    root = chunk.root
    table.add_row([chunk.text, root.text, root.dep_, root.head.text])
print(table)

+------------------------------------------------------+--------------+-------+--------------+
|                         TEXT                         | CHUNK's ROOT |  DEP  | CHUNK's HEAD |
+------------------------------------------------------+--------------+-------+--------------+
|                     chris jones                      |    jones     | nsubj |      is      |
|          welsh semiprofessional footballer           |  footballer  |  attr |      is      |
|            cymru alliance side porthmadog            |  porthmadog  |  pobj |     for      |
|                 former professional                  | professional | appos |  porthmadog  |
|                  leeds united jones                  |    jones     |  pobj |     with     |
|                  his fourth season                   |    season    |  pobj |      in      |
|                         city                         |     city     |  pobj |     with     |
|                         what                    

In [31]:
from spacy.tokens import Span
text = "brown state fishing lake is in country that has population of how many inhabitants"
doc = nlp(text)
# For every noun chunk, print the chunk, its root, and a one-token span over
# the root together with that span's start offset.
for chunk in doc.noun_chunks:
    root_index = chunk.root.i
    span = Span(doc, root_index, root_index + 1)
    print('chunk: ', chunk.text)
    print('root: ', chunk.root.text)
    print(span, span.start, sep='\n')

chunk:  brown state fishing lake
root:  lake
lake
3
chunk:  country
root:  country
country
6
chunk:  population
root:  population
population
9
chunk:  how many inhabitants
root:  inhabitants
inhabitants
13


In [36]:
# Import only the dependency labels that are needed instead of `import *`,
# which would put every spaCy symbol into the notebook namespace.
from spacy.symbols import nsubj, nsubjpass, dobj, iobj, pobj

# Dependency labels used here to pick out noun-phrase heads (probably others too)
np_labels = {nsubj, nsubjpass, dobj, iobj, pobj}


def iter_nps(doc):
    """Print every token of `doc` whose dependency label is in `np_labels`.

    For each matching token this prints the token text, the name of its
    dependency label, and the list of tokens in its subtree.
    Nothing is returned.
    """
    for word in doc:
        if word.dep in np_labels:
            print(word.text, word.dep_, list(word.subtree))


iter_nps(doc)

lake nsubj [brown, state, fishing, lake]
country pobj [country, that, has, population, of, how, many, inhabitants]
that nsubj [that]
population dobj [population, of, how, many, inhabitants]
inhabitants pobj [how, many, inhabitants]


In [10]:
# Question to analyse. The bare `text = ` that stood here was a SyntaxError;
# the assignment below is the one that matches this cell's saved output.
text = "Who was the writer of 'These Boots Are Made for Walkin' and who died in 2007?"
# Alternative example:
# text = 'aside from apple remote what other device can control program apple remote was originally designed to interact with'
doc = nlp(text)
# Draw the dependency parse, then list each noun chunk with its root token
displacy.render(doc, style="dep")
for chunk in doc.noun_chunks:
    print('chunk: ', chunk.text)
    print('root: ',  chunk.root.text)

chunk:  Who
root:  Who
chunk:  the writer
root:  writer
chunk:  These Boots
root:  Boots
chunk:  Walkin
root:  Walkin
chunk:  who
root:  who


##### <a href='https://github.com/DerwenAI/pytextrank' style='text-decoration:none'>PyTextRank</a> <br>

In [32]:
#!python -m pip install pytextrank
# Fan: this cell relies on a locally modified pytextrank.py with 3 changes:
# 1. phrase_text = ' '.join(key[0] for key in phrase_key)
#    i.e. p.text is the join of the lemmas of the tokens whose pos_ is in the kept POS list, with their order preserved
# 2. added the argument 'chunk_type' to consider only named entities ('ner') or only noun chunks ('noun'), besides the default ('both')
# 3. replaced token.lemma_ with token.lemma_.lower().strip()

import pytextrank
# NOTE(review): `chunk_type` is the argument added by the local modification listed above;
# an unmodified pytextrank install presumably does not accept it — confirm before running elsewhere.
tr = pytextrank.TextRank(pos_kept=["ADJ", "NOUN", "PROPN", "VERB", "NUM", "ADV"], chunk_type='both')  
# Register TextRank under the name 'textrank' as the last component of the pipeline
nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

In [33]:
text = 'This singer A Rather Blustery Day voiced hedgehog?'
phrase_doc = nlp(text)
for p in phrase_doc._.phrases:
    print((p.text, p.rank))

('rather blustery day', 0.19461197585339468)
('hedgehog', 0.1754565302471306)
('singer', 0.1500201908274722)


#### <a href='https://spacy.io/api/entitylinker' style='text-decoration:none'>EntityLinker</a> 
- An EntityLinker component disambiguates textual mentions (tagged as named entities) to unique identifiers, grounding the named entities into the “real world”. (need spacy 3.0)
- <a href='https://drive.google.com/file/d/1EuGxcQLcXvjjkZ-KRUlwpr_doBVyEBEG/view' style='text-decoration:none'> Slides: Entity linking functionality in spaCy</a> 


In [None]:
# Add an EntityLinker component to the pipeline with an explicit configuration.
# NOTE(review): the heading for this section says EntityLinker needs spaCy 3.0,
# while the version printed at the top of this notebook is 2.3.5, and this cell
# has no execution count (it was not run) — confirm the installed version first.
from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
# Settings passed to the component through nlp.add_pipe below
config = {
   "labels_discard": [],
   "n_sents": 0,
   "incl_prior": True,
   "incl_context": True,
   "model": DEFAULT_NEL_MODEL,
   "entity_vector_length": 64,
   "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
}
nlp.add_pipe("entity_linker", config=config)

#### <a href='https://github.com/huggingface/neuralcoref' style='text-decoration:none'>Coreference Resolution</a>
- <a href='https://medium.com/huggingface/state-of-the-art-neural-coreference-resolution-for-chatbots-3302365dcf30' style='text-decoration:none'>blog</a>
- <a href='https://github.com/huggingface/neuralcoref-viz' style='text-decoration:none'>NeuralCoref-Viz</a>

In [3]:
## had to install neuralcoref from source because of 'segament fault' issue:  https://github.com/huggingface/neuralcoref/issues/164

# !git clone https://github.com/huggingface/neuralcoref.git
# import os
# os.chdir('/home/u32/fanluo/Jupyter/experiments/spaCy/neuralcoref')
# !pip install -r requirements.txt 
# !pip install -e .  

In [15]:
import neuralcoref
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f4dbc467fa0>

NeuralCoref is made of two sub-modules:
- a rule-based mentions-detection module which uses SpaCy's tagger, parser and NER annotations to identify a set of potential coreference mentions 
- a feed-forward neural network which computes a coreference score for each pair of potential mentions.

|Attribute|Type	|Description|
|:---|:---|:---|
|doc._.has_coref	|boolean	|Whether any coreference has been resolved in the Doc|
|doc._.coref_clusters	|list of Cluster	|All the clusters of corefering mentions in the doc|
|**doc._.coref_resolved**	|unicode	|Unicode representation of the doc where each corefering mention is replaced by the main mention in the associated cluster.|
|doc._.coref_scores	|Dict of Dict	|Scores of the coreference resolution between mentions.|
|span._.is_coref	|boolean	|Whether the span has at least one corefering mention|
|span._.coref_cluster	|Cluster	|Cluster of mentions that corefer with the span|
|span._.coref_scores	|Dict	|Scores of the coreference resolution of the span with other mentions (if applicable).|
|token._.in_coref	|boolean	|Whether the token is inside at least one corefering mention|
|token._.coref_clusters	|list of Cluster	|All the clusters of corefering mentions that contains the token|

In [19]:
doc = nlp("My sister has a dog. She is the the person whose house locates in Beijing.")  
doc._.has_coref

True

In [20]:
doc._.coref_resolved

'My sister has a dog. My sister is the the person whose house locates in Beijing.'

In [17]:
# One row per coreference cluster: its index, its main mention (the referent),
# and all mentions that belong to the cluster.
table = PrettyTable(["Idx", "Referent", "Corefering Mentions"])
for coref_cluster in doc._.coref_clusters:
    cluster_index = coref_cluster.i
    referent = coref_cluster.main.text
    mention_texts = ', '.join(m.text for m in coref_cluster.mentions)
    table.add_row([cluster_index, referent, mention_texts])
print(table)

+-----+-----------+---------------------+
| Idx |  Referent | Corefering Mentions |
+-----+-----------+---------------------+
|  0  | My sister |    My sister, She   |
+-----+-----------+---------------------+


In [21]:
for mention, mention_scores in doc._.coref_scores.items(): 
    print(mention, '|', str(mention_scores))

My sister | {My sister: 1.3110305070877075}
a dog | {a dog: 1.804752230644226, My sister: -1.6715972423553467}
She | {She: -0.10834169387817383, My sister: 8.058427810668945, a dog: -1.0625176429748535}
him | {him: -1.8707444667816162, My sister: 3.1147196292877197, a dog: 4.356405258178711, She: -3.1379525661468506}


In [15]:
# One row per token: whether it lies inside a corefering mention, and the
# clusters that contain it (shown via str()).
table = PrettyTable(["Token", "In Corefering Mention", "Coreferrence Clusters"])
for token in doc:
    in_mention = token._.in_coref
    clusters_repr = str(token._.coref_clusters)
    table.add_row([token.text, in_mention, clusters_repr])
print(table)

+--------+-----------------------+-------------------------------+
| Token  | In Corefering Mention |     Coreferrence Clusters     |
+--------+-----------------------+-------------------------------+
|   My   |          True         | [My sister: [My sister, She]] |
| sister |          True         | [My sister: [My sister, She]] |
|  has   |         False         |               []              |
|   a    |          True         |     [a dog: [a dog, him]]     |
|  dog   |          True         |     [a dog: [a dog, him]]     |
|   .    |         False         |               []              |
|  She   |          True         | [My sister: [My sister, She]] |
| loves  |         False         |               []              |
|  him   |          True         |     [a dog: [a dog, him]]     |
|   .    |         False         |               []              |
+--------+-----------------------+-------------------------------+


### Word vectors and similarity 

#### word vectors
- Small models such as *en_core_web_**sm*** do not include word vectors; use a larger model such as *en_core_web_**md*** or *en_core_web_**lg***.
- Doc.vector and Span.vector will default to an average of their token vectors.
- A token that is out-of-vocabulary has no word vector, so its vector representation consists of 300 dimensions of 0.
- <a href='https://spacy.io/usage/vectors-similarity#custom' style='text-decoration:none'>Customizing word vectors</a> 

In [7]:
# Load the medium English model, which ships with word vectors, and parse a sample text.
# !python -m spacy download en_core_web_md
import en_core_web_md
# NOTE: this rebinds `nlp` to a freshly loaded model, so from here on `nlp` is no
# longer the object that earlier cells extended with the 'textrank' component
# and with neuralcoref.
nlp = en_core_web_md.load()       # load pretrained models 
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

In [31]:
# Get the vector for the token "restaurant"
restaurant_vector = doc[4].vector
print(restaurant_vector)

[ 4.7022e-01  2.7832e-02  3.3726e-01 -6.0538e-01  1.1151e+00  6.1317e-01
  4.4317e-01 -4.4356e-01  1.7839e-01  2.5339e+00 -6.6669e-01  1.1980e-01
 -3.7354e-01 -1.2704e-01 -2.5155e-01 -2.0857e-01 -2.7643e-02  9.6150e-01
  1.2078e-01 -4.7681e-01 -4.1337e-01 -2.9158e-01  2.9513e-01 -6.5193e-01
  3.1126e-01  4.4229e-02 -8.5315e-01  5.2994e-02 -1.5573e-01  8.3080e-02
  5.0069e-01 -1.6684e-01  5.6950e-01 -3.2449e-01 -5.8970e-01  1.9531e-01
  6.2275e-02  7.3909e-02 -1.0965e-01  1.9190e-01 -3.3088e-01 -1.0520e-01
  2.5342e-01 -8.1830e-02 -9.4465e-02  4.1847e-01 -9.9957e-02 -2.7197e-01
 -1.4081e-01  2.1018e-02 -1.7947e-01 -4.2881e-01  5.5106e-01  3.5615e-01
 -1.9499e-01 -2.2572e-01  1.6898e-01 -3.1127e-01  3.6404e-01  2.2121e-01
  4.1063e-01 -6.7738e-01  1.8249e-02  3.5553e-01 -6.6892e-02 -8.3799e-01
 -6.6448e-02  2.2761e-02  4.7108e-01  8.5194e-01 -1.8790e-01  2.3339e-01
 -3.2439e-01 -1.7117e-01  1.9239e-01  8.3423e-02  1.5147e-03 -7.8561e-01
 -1.8561e-01  2.3470e-01 -1.4797e-01 -4.0575e-01  5

#### vector similarity

In [32]:
token1, token2 = doc[4], doc[14]
print(token1.text, token2.text)

# Get the similarity of the tokens "restaurant" and "bar"
similarity = token1.similarity(token2)
print(similarity)

restaurant bar
0.6205604


In [33]:
# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
span2 = doc[12:15]

# Get the similarity of the spans
similarity = span1.similarity(span2)
print(similarity)

0.75173926


In [34]:
doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")

# Get the similarity of doc1 and doc2
similarity = doc1.similarity(doc2)
print(similarity)

0.8789265574516525


### More

In [35]:
IFrame('https://course.spacy.io/en/', width=800, height=450)   # spaCy tutorial