In [2]:
# increase the cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))
from IPython.display import IFrame

import sys
sys.path.insert(-1, '/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.7/site-packages') 
from prettytable import PrettyTable

In [2]:
# !python -m pip install spacy==2.3.0
import spacy  
# !python -m spacy download en
# from spacy.lang.en import English
# nlp = English()

# !python -m spacy download en_core_web_sm   # pretrained models: https://spacy.io/usage/models 
import en_core_web_sm                        # a small English model trained on written web text (blogs, news, comments)
nlp = en_core_web_sm.load()                  # load pretrained models, return an instance of Language with a pipeline set and access to the binary data and language data

### <a href="https://spacy.io/usage/linguistic-features#native-tokenizers" style="text-decoration:none">Customize tokenization</a>

### <a href="https://spacy.io/usage/processing-pipelines#custom-components-attributes" style="text-decoration:none">Extensions</a>
Extensions are always added globally to Doc, Span, or Token

#### Attribute extension

In [3]:
from spacy.tokens import Doc, Span, Token 
doc = nlp("Lily live in Spain.")

# register new attributes
Token.set_extension("is_name", default=False)
Span.set_extension("has_name", default=False)
Doc.set_extension("catgory", default=None)

# access and overwrite
doc[0]._.is_name = True
doc[0:2]._.has_name = True
doc._.catgory = "blog"

# can also use the built-in set, get and has methods to modify and retrieve the attributes. 
doc[0]._.set("is_name", True)   

In [5]:
# Register the Token extension attribute "is_country" with the default value False
Token.set_extension("is_country", default=False)

# Set the is_country attribute to True for the token "Spain"
doc[3]._.is_country = True
 
print([(token.text, token._.is_country) for token in doc])

[('Lily', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


#### Property extension
- Define a getter and an optional setter function.
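- The getter function is called when the property is retrieved, and the setter function when it is assigned.
- The term _attribute_ is often used to refer to a _property_ as well, because the two are very similar. <br> 
  The main difference is that an attribute extension stores a value (with a shared default) on each token/span/doc, whereas a property extension computes its value with the getter function.  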

In [6]:
# Define a getter function
def get_is_color(token):
    """Getter for the ``is_color`` Token extension.

    Returns True when the token's exact (case-sensitive) text is one of
    "red", "yellow" or "blue", and False otherwise.
    """
    return token.text in ("red", "yellow", "blue")

# Alternatively: 
# colors = ["red", "yellow", "blue"]
# get_is_color = lambda token: token.text in colors

Token.set_extension("is_color", getter=get_is_color)

doc = nlp("The sky is blue.")
print(doc[3].text , "is color: ", doc[3]._.is_color)

blue is color:  True


In [7]:
# Define a getter function
def get_has_color(obj):
    """Getter for the ``has_color`` Doc/Span extension.

    Iterates over ``obj`` and returns True as soon as one element's ``text``
    is exactly "red", "yellow" or "blue"; returns False if none is.
    """
    palette = ("red", "yellow", "blue")
    for member in obj:
        if member.text in palette:
            return True
    return False

# Alternatively: 
# get_has_color = lambda obj: any([t.text in colors for t in obj])

Span.set_extension("has_color", getter=get_has_color)
Doc.set_extension("has_color", getter=get_has_color)

doc = nlp("The sky is blue.")
print(doc[1:4].text, " has color: ", doc[1:4]._.has_color)
print(doc[0:2].text, " has color: ", doc[0:2]._.has_color)
print(doc.text, " has color: ", doc._.has_color )

sky is blue  has color:  True
The sky  has color:  False
The sky is blue.  has color:  True


In [8]:
# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    """Getter for the ``reversed`` Token extension: the token text read backwards."""
    return "".join(reversed(token.text))

# Register the Token property extension "reversed" with the getter get_reversed
Token.set_extension("reversed", getter=get_reversed)

for token in doc:
    print("reversed:", token._.reversed)

reversed: ehT
reversed: yks
reversed: si
reversed: eulb
reversed: .


In [9]:
# Define the getter function
def get_wikipedia_url(span):
    """Getter for the ``wikipedia_url`` Span extension.

    Builds a Wikipedia search URL for spans labelled as a person,
    organisation or place.

    Args:
        span: object exposing ``label_`` (the entity label string) and ``text``.

    Returns:
        The search URL as a string, or None when the span's label is not one
        of the supported labels.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # "LOC" is the location label in the pretrained English models' label
    # scheme; "LOCATION" is kept so spans labelled that way still get a URL.
    if span.label_ not in ("PERSON", "ORG", "GPE", "LOC", "LOCATION"):
        return None
    entity_text = span.text.replace(" ", "_")
    # Percent-encode the text so characters such as "&" or "#" cannot
    # terminate or split the query string.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text, safe="")

# Set the Span extension wikipedia_url using the getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp( "David Bowie was at the vanguard of contemporary culture.")
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


#### Method extension
Can pass argument(s) to the extension function

In [10]:
# Define a method
def has_token(doc, token_text):
    """Method extension for Doc: report whether any token's text equals ``token_text``.

    The comparison is an exact, case-sensitive string match.
    """
    return any(token.text == token_text for token in doc)

Doc.set_extension("has_token", method=has_token)

doc = nlp("The sky is blue.")
print("The doc has the token 'blue': ", doc._.has_token("blue"))
print("The doc has the token 'cloud': ", doc._.has_token("cloud"))

The doc has the token 'blue':  True
The doc has the token 'cloud':  False


### <a href="https://course.spacy.io/en/chapter3" style="text-decoration:none">Customize pipeline</a>
A component receives a Doc object, modifies it, and then returns the modified Doc object

In [11]:
# Print current processing pipeline components 
print(nlp.pipeline)   

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fdf7fcba390>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fdf7fcafd08>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fdf7fcafd68>)]


#### Add components 

#####  <a href="https://spacy.io/usage/processing-pipelines#built-in" style="text-decoration:none">Built-in pipeline components</a> 
Only apply the pipeline components we need for efficient processing

|COMPONENT STRING NAME | COMPONENT MODULE| DESCRIPTION | 
|:- |:------------|:------------|  
|tagger	|Tagger	|Assign part-of-speech-tags.
|parser	|DependencyParser	|Assign dependency labels.
|ner	|EntityRecognizer	|Assign named entities.
|entity_linker	|EntityLinker	|Assign knowledge base IDs to named entities. Should be added after the entity recognizer.
|textcat	|TextCategorizer	|Assign text categories.
|entity_ruler	|EntityRuler	|Assign named entities based on pattern rules.
|sentencizer	|Sentencizer	|Add rule-based sentence segmentation without the dependency parse.
|merge_noun_chunks	|merge_noun_chunks	|Merge all noun chunks into a single token. Should be added after the tagger and parser.
|merge_entities	|merge_entities	|Merge all entities into a single token. Should be added after the entity recognizer.
|merge_subtokens	|merge_subtokens	|Merge subtokens predicted by the parser into single tokens. Should be added after the parser.

In [12]:
# Option 1: Import and initialize
from spacy.pipeline import EntityRuler
ruler = EntityRuler(nlp)
nlp.add_pipe(ruler)

# Option 2: Using nlp.create_pipe
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

print(nlp.pipeline) 

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fdf7fcba390>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fdf7fcafd08>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fdf7fcafd68>), ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler object at 0x7fdf7fca3d68>), ('sentencizer', <spacy.pipeline.pipes.Sentencizer object at 0x7fdf7fca3080>)]


#####  <a href="https://spacy.io/usage/processing-pipelines#custom-components" style="text-decoration:none">Create components</a> 
- Add a component to the pipeline using the <a href="https://spacy.io/api/language#add_pipe" style="text-decoration:none">nlp.add_pipe</a> method.
- Can add the new component *before* or *after* a specified component, or add it *first* or *last* (default) in the pipeline.
- Can also replace the existing component with <a href="https://spacy.io/api/language#replace_pipe" style="text-decoration:none">nlp.replace_pipe</a> method.

In [13]:
def my_component(doc):
    """Placeholder pipeline component: hands back the doc it receives, unchanged."""
    # do something to the doc here
    return doc

nlp.add_pipe(my_component)                           # add at last (default) in the pipeline
# nlp.add_pipe(my_component, first=True)
# nlp.add_pipe(my_component, before="parser")
print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'entity_ruler', 'sentencizer', 'my_component']


######  <a href="https://spacy.io/usage/processing-pipelines#component-example1" style="text-decoration:none">Customize Sentencizer component</a>

In [14]:
def custom_sentencizer(doc):
    """Mark sentence starts on ``doc`` and return it.

    For every token except the last two, the token that follows it gets
    ``is_sent_start`` set explicitly: True when the current token is a pipe
    ("|") and the follower is title-cased, False otherwise. The first and the
    last token of the doc are never written to.
    """
    prefix = doc[:-2]
    for position in range(len(prefix)):
        successor = doc[position + 1]
        after_pipe = prefix[position].text == "|"
        # Sentence start only for: pipe + titlecase token
        successor.is_sent_start = bool(after_pipe and successor.is_title)
    return doc
 
nlp.add_pipe(custom_sentencizer, before="parser")  # Insert before the parser
doc = nlp("This is. A sentence. | This is. Another sentence.")
for sent in doc.sents:
    print(sent.text)

This is. A sentence. |
This is. Another sentence.


###### <a href="https://spacy.io/usage/processing-pipelines#component-example2" style="text-decoration:none">Customize NER component</a>

- To create entity spans from token-based tags: <a href='https://spacy.io/api/goldparse#spans_from_biluo_tags' style='text-decoration:none'>gold.spans_from_biluo_tags</a>
- Each token can only be part of one entity, so overlapping entity spans are not allowed.
- When adding spans to the *doc.ents*, the *Token.ent_type* and *Token.ent_iob* attributes of their underlying tokens would be set automatically.


Example1: animals

In [15]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    """Pipeline component that replaces ``doc.ents`` with "ANIMAL" spans.

    Runs the module-level ``matcher`` over the doc, wraps every match in a
    Span labelled "ANIMAL", overwrites ``doc.ents`` with those spans and
    returns the doc.
    """
    animal_spans = []
    # Each match is a (match_id, start, end) triple; only the bounds are needed
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label="ANIMAL"))
    # Overwrite (not extend) the existing entities with the matched spans
    doc.ents = animal_spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

['tagger', 'parser', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


Example2: tech companies

In [17]:
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token

class TechCompanyRecognizer(object):
    """
    Example of a spaCy v2 pipeline component that sets entity annotations based on a list of single or multiple-word company names.
    - Companies are labelled with ``label`` (ORG by default).
    - Company spans are merged into one token.
    - ._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token respectively.
    """

    def __init__(self, nlp, companies=tuple(), label="ORG"):
        """Initialise the pipeline component.

        nlp: Language object; its tokenizer and vocab are used to build the patterns.
        companies: iterable of company names to match.
        label: entity label assigned to every matched span.
        """

        self.label = label

        # Set up the PhraseMatcher. The patterns only need to be tokenized,
        # so nlp.make_doc is used instead of running the whole pipeline.
        patterns = [nlp.make_doc(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add("TECH_ORGS", None, *patterns)

        # force=True lets the extensions be registered again, so creating a
        # second instance (e.g. re-running the notebook cell) does not raise
        # "extension already exists".

        # Register attribute
        Token.set_extension("is_tech_org", default=False, force=True)

        # Register properties on Doc and Span
        Doc.set_extension("has_tech_org", getter=self.has_tech_org, force=True)
        Span.set_extension("has_tech_org", getter=self.has_tech_org, force=True)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and return the modified Doc."""

        # Apply the matcher to the doc
        matches = self.matcher(doc)

        # Create a Span for each match and assign the label
        spans = []
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set("is_tech_org", True)

        # Add the new entities to the existing ones in a single assignment
        doc.ents = list(doc.ents) + spans

        # Merge tokens in each span to one token. Done after setting the
        # entities; doc.retokenize() replaces the deprecated Span.merge().
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)

        return doc

    def has_tech_org(self, tokens):
        """
        Getter for Doc and Span properties.
        Returns True if one of the tokens is a tech org.
        """
        return any(t._.get("is_tech_org") for t in tokens)

nlp = spacy.load("en_core_web_sm")
component = TechCompanyRecognizer(nlp, companies = ["Alphabet Inc.", "Google", "Netflix", "Apple"])  # initialize 
nlp.add_pipe(component, last=True)                         # add last to the pipeline

with nlp.disable_pipes("ner"):   # otherwise, set conflicting doc.ents. A token can only be part of one entity 
    doc = nlp("Alphabet Inc. is the company behind Google.") 
    print("Pipeline", nlp.pipe_names)                          # pipeline contains component name
print("Tokens", [t.text for t in doc])                     # company names from the list are merged
print("Doc has_tech_org", doc._.has_tech_org)              # Doc contains tech orgs
print("Token 0 is_tech_org", doc[0]._.is_tech_org)         # "Alphabet Inc." is a tech org
print("Token 1 is_tech_org", doc[1]._.is_tech_org)         # "is" is not
print("Entities", [(e.text, e.label_) for e in doc.ents])  # all orgs are entities

Pipeline ['tagger', 'parser', 'TechCompanyRecognizer']
Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
Doc has_tech_org True
Token 0 is_tech_org True
Token 1 is_tech_org False
Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]




####  <a href="https://spacy.io/usage/processing-pipelines#disabling" style="text-decoration:none">Disable components</a> 
- Disabled components are **not called**, which makes processing more efficient
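- While components are disabled with `nlp.disable_pipes`, they are taken out of `nlp.pipeline` / `nlp.pipe_names` (see the `Pipeline [...]` output of the tech-company example above, where `ner` is missing) and are put back once restored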

In [18]:
# option1 
nlp = spacy.load("en_core_web_sm", disable=["tagger"])
doc = nlp("I won't be tagged")
print("disable: ", doc[0].pos_)

disable:  


In [19]:
# option2
texts = ["This is a text", "These are lots of texts", "..."]  
docs = list(nlp.pipe(texts, disable=["parser"]))
docs[0][1].dep_

''

In [20]:
# option3: temporarily disable and automatically restore
with nlp.disable_pipes("ner"):
    doc = nlp("I won't recognize the named entity Arizona")
    print("disable: ", doc.ents)
    
doc = nlp("I recognize the named entity Arizona")
print("not disable: ", doc.ents)

disable:  ()
not disable:  (Arizona,)


In [21]:
# option4: disable and manually restore
disabled = nlp.disable_pipes("ner")
doc = nlp("I won't recognize the named entity Arizona")
print("disable: ", doc.ents)

disabled.restore()
doc = nlp("I recognize the named entity Arizona")
print("not disable: ", doc.ents)

disable:  ()
not disable:  (Arizona,)


### <a href="https://spacy.io/usage/processing-pipelines#custom-components-user-hooks" style="text-decoration:none">Customize the built-in methods: User hooks</a>
- The built-in method checks the user_hooks dict and delegates to the hook function if one is set. 
- The hooks only live on the Doc object.

|Hook Name	|Built-in methods|
|:---|:---|
|user_hooks|Doc.vector, Doc.has_vector, Doc.vector_norm, Doc.sents|
|user_token_hooks	|Token.similarity, Token.vector, Token.has_vector, Token.vector_norm, Token.conjuncts|
|user_span_hooks|Span.similarity, Span.vector, Span.has_vector, Span.vector_norm, Span.root|

In [None]:
class SimilarityModel(object):
    """Pipeline component that routes ``similarity`` calls to a custom model via user hooks.

    ``model`` is a callable that receives a list with the two objects' vectors
    and returns an indexable result whose first element is the score.
    """

    def __init__(self, model):
        """Store the model used to score pairs of vectors."""
        self._model = model

    def __call__(self, doc):
        """Install the similarity hook for the Doc, its Spans and its Tokens.

        Returns the doc: a pipeline component has to hand the Doc back so the
        next component (and the caller) receives it.
        """
        doc.user_hooks["similarity"] = self.similarity
        doc.user_span_hooks["similarity"] = self.similarity
        doc.user_token_hooks["similarity"] = self.similarity
        return doc

    def similarity(self, obj1, obj2):
        """Score two objects by passing their ``vector`` attributes to the model.

        Returns the first element of the model output converted to float.
        """
        y = self._model([obj1.vector, obj2.vector])
        return float(y[0])

### <a href="https://spacy.io/usage/vectors-similarity#custom" style="text-decoration:none">Customize word vectors</a>

### <a href="https://spacy.io/usage/training" style="text-decoration:none">Train Models</a>

### <a href="https://spacy.io/usage/rule-based-matching#models-rules" style="text-decoration:none">Combine models and rules</a>

### <a href="https://spacy.io/usage/visualizers" style="text-decoration:none">Customize visualization</a>

#### <a href="https://spacy.io/api/top-level#options-dep" style="text-decoration:none">Dependency Visualizer options</a>

|NAME	|TYPE	|DESCRIPTION|	DEFAULT|
|:---|:---|:---|:---|
|fine_grained	|bool	|Use fine-grained part-of-speech tags (Token.tag_) instead of coarse-grained tags (Token.pos_).	|False
|add_lemma|bool	|Print the lemmas in a separate row below the token texts.	|False
|collapse_punct	|bool	|Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation.	|True
|collapse_phrases	|bool	|Merge noun phrases into one token.	|False
|compact	|bool	|“Compact mode” with square arrows that takes up less space.	|False
|color	|unicode	|Text color (HEX, RGB or color names).	|'#000000'
|bg	|unicode	|Background color (HEX, RGB or color names).	|'#ffffff'
|font	|unicode	|Font name or font family for all text.	|'Arial'
|offset_x	|int	|Spacing on left side of the SVG in px.	|50
|arrow_stroke	|int	|Width of arrow path in px.	|2
|arrow_width	|int	|Width of arrow head in px.	|10 / 8 (compact)
|arrow_spacing	|int	|Spacing between arrows in px to avoid overlaps.	|20 / 12 (compact)
|word_spacing	|int	|Vertical spacing between words and arcs in px.	|45
|distance	|int	|Distance between words in px.	|175 / 150 (compact)

In [None]:
import spacy
from spacy import displacy
options = {"compact": True, "bg": "#09a3d5", "color": "white", "font": "Source Sans Pro"}
displacy.render(doc, style="dep", options=options)

#### <a href="https://spacy.io/api/top-level#displacy_options-ent" style="text-decoration:none">Named Entity Visualizer options</a>

|NAME	|TYPE	|DESCRIPTION|	DEFAULT|
|:---|:---|:---|:---|
|ents	|list	|Entity types to highlight	|None
|colors	|dict	|Color overrides	|{}
|template |unicode	|Optional template to overwrite the HTML |see <a href="https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py" style="text-decoration:none">templates.py</a>| 


In [None]:
options = {"ents": ["PERSON", "ORG", "PRODUCT"], "colors": {"ORG": "yellow"}}
displacy.render(doc, style="ent", options=options)

#### <a href="https://spacy.io/usage/visualizers#manual-usage" style="text-decoration:none">Visualize user data</a>  
- When using style="ent", make sure the data is in the right order, i.e. starting with the lowest start position.

In [None]:
ex = [{
       "text": "But Google is starting from behind.",
       "ents": [{"start": 4, "end": 10, "label": "ORG"}],
       "title": None
     }]
displacy.render(ex, style="ent", manual=True)

In [None]:
ex = [{
        "words": [
            {"text": "This", "tag": "DT"},
            {"text": "is", "tag": "VBZ"},
            {"text": "a", "tag": "DT"},
            {"text": "sentence", "tag": "NN"}
        ],
        "arcs": [
            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "det", "dir": "left"},
            {"start": 1, "end": 3, "label": "attr", "dir": "right"}
        ]
     }]
displacy.render(ex, style="dep", manual=True)

#### <a href="https://spacy.io/usage/visualizers#ent-titles" style="text-decoration:none">Add title</a> 
- Add a headline to each visualization
- Use for a brief description 

In [None]:
doc = nlp("This is a sentence about Google.")
doc.user_data["title"] = "This is a title"
displacy.render(doc, style="ent")

#### <a href="https://spacy.io/usage/visualizers#examples-export-svg" style="text-decoration:none">Export SVG image</a> 
- SVG (Scalable Vector Graphics) image format uses XML markup.
- SVG can be embedded online in an \<img\> tag, or inlined in an HTML document. 

In [None]:
from pathlib import Path
# Request the markup explicitly: inside Jupyter, displacy.render displays the
# visualization and returns None unless jupyter=False is passed, which would
# leave nothing to write to the file.
svg = displacy.render(doc, style="dep", jupyter=False)
output_path = Path("/images/sentence.svg")   # or file_name = '-'.join([w.text for w in doc if not w.is_punct]) + ".svg"
# write_text opens, writes and closes the file, so no handle is left open
output_path.write_text(svg, encoding="utf-8")

#### <a href="https://spacy.io/usage/visualizers#webapp" style="text-decoration:none">Embed into a webpage</a>  
- <a href="https://github.com/explosion/displacy" style="text-decoration:none">displaCy.js</a>  
- <a href="https://github.com/kengz/spacy-nlp" style="text-decoration:none">spacy-nlp</a>: Expose Spacy nlp text parsing to Nodejs via socketIO
- example: <a href="https://explosion.ai/demos/displacy" style="text-decoration:none">online demo</a> 