In [1]:
from urllib.parse import quote

import spacy
from spacy.tokens import Doc, Token, Span
from spacy.matcher import PhraseMatcher

## Extensions
____________________

*This notebook is based on / inspired by the excellent materials of the DataCamp course https://campus.datacamp.com/courses/advanced-nlp-with-spacy*

____________________

To set an extension we use the `.set_extension()` method. This method can be used on:

* `Doc`

* `Span`

* `Token`


Let's see a set of examples:

* `Doc.set_extension('title', default = None)`

* `Span.set_extension('is_german_word', default = False)`

* `Token.set_extension('has_color', default = False)`

To access extensions we need to use `._.` to distinguish them from built-in properties:

`doc._.title = 'Document 1'`

#### Types of extensions

* Attribute extensions

* Property extensions

* Method extensions

In [2]:
# Load the 'en_core_web_sm' model into the shared `nlp` pipeline object
nlp = spacy.load(name='en_core_web_sm')

In [3]:
# Example sentence, plus the Doc produced by running it through `nlp`
my_str = 'I used to live in Vienna 5 years ago.'
doc1 = nlp(text=my_str)

### Token level extensions

In [4]:
# Token level attribute extension 'is_city' (defaults to False for every token).
# force=True lets this cell be re-run without failing because the extension
# is already registered, matching the other set_extension calls in this notebook.
Token.set_extension('is_city', default=False, force=True)

In [5]:
# Flag the token at index 5 ('Vienna' in this sentence) as a city
city_token = doc1[5]
city_token._.is_city = True

# Show every token's text together with its is_city flag
flags_by_token = [(tok.text, tok._.is_city) for tok in doc1]
print(flags_by_token)

[('I', False), ('used', False), ('to', False), ('live', False), ('in', False), ('Vienna', True), ('5', False), ('years', False), ('ago', False), ('.', False)]


### Doc level extensions

In [4]:
# Define a getter function

def get_has_number(doc):
    """Return True if at least one token in `doc` has a truthy `like_num`, else False."""
    for token in doc:
        # Stop at the first number-like token; no need to scan the rest
        if token.like_num:
            return True
    return False


In [5]:
# Register the Doc property extension 'has_number' with the getter get_has_number.
# force=True lets this cell be re-run without failing because the extension
# is already registered, matching the other set_extension calls in this notebook.
Doc.set_extension('has_number', getter=get_has_number, force=True)

In [6]:
# Read the computed property on doc1 and print it
print(f'has_number: {doc1._.has_number}')

has_number: True


### Span level extensions

In [7]:
# Define a method

def to_html(span, tag):
    """Return the span's text enclosed in an opening and closing `tag` element."""
    opening = f'<{tag}>'
    closing = f'</{tag}>'
    return opening + span.text + closing

In [8]:
# Register the Span method extension 'to_html' with the method to_html.
# force=True lets this cell be re-run without failing because the extension
# is already registered, matching the other set_extension calls in this notebook.
Span.set_extension('to_html', method=to_html, force=True)

In [9]:
# Run the sentence through the pipeline, take its first two tokens as a span,
# and render that span inside an `h1` tag via the method extension
doc = nlp("Hello world, this is my sentence.")
span = doc[:2]
print(span._.to_html('h1'))

<h1>Hello world</h1>


## Extensions and entities

### Example 1

In [11]:
def get_wikipedia_url(span):
    """Build a Wikipedia search URL for an entity span.

    Args:
        span: object exposing ``label_`` (entity label string) and ``text``.

    Returns:
        The search URL as a string when the label is one of PERSON, ORG, GPE,
        LOC or LOCATION; otherwise None.
    """
    # 'LOC' is the location label used by spaCy's English models; 'LOCATION' is
    # kept so spans labelled that way by other components still get a URL.
    if span.label_ not in ('PERSON', 'ORG', 'GPE', 'LOC', 'LOCATION'):
        return None
    entity_text = span.text.replace(' ', '_')
    # Percent-encode so characters such as '&' or '#' cannot break the query string
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text)
    

In [14]:
# Set the Span extension wikipedia_url using the getter get_wikipedia_url
# (force=True overwrites any earlier registration of the same name)
Span.set_extension(
    'wikipedia_url',
    force=True,
    getter=get_wikipedia_url,
)

In [15]:
doc = nlp(
    "In over fifty years from his very first recordings right through to "
    "his last album, David Bowie was at the vanguard of contemporary culture. "
    "Annie Lennox"
)
# For each recognised entity, show its text next to its Wikipedia URL (or None)
for entity in doc.ents:
    print(entity.text, entity._.wikipedia_url)

fifty years None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie
Annie Lennox https://en.wikipedia.org/w/index.php?search=Annie_Lennox


### Example 2

In [2]:
# Second pipeline object, kept separate from `nlp`; the custom component is added to this one
nlp2 = spacy.load(name='en_core_web_sm')

In [3]:
# District names in Tel Aviv that we want to recognise
distrs = ['Old Yafo', 'Shapira', 'Ezra', 'Florentin']

# Turn every district name into a Doc to be used as a matcher pattern
patterns = [district_doc for district_doc in nlp2.pipe(distrs)]

In [4]:
# Initialize a PhraseMatcher that shares the vocabulary of nlp2
matcher = PhraseMatcher(nlp2.vocab)

# Register all district pattern Docs under the single key 'DISTRICT'.
# NOTE(review): the second argument (None) looks like the on_match callback slot
# of the spaCy v2 call form add(key, on_match, *docs) — confirm against the installed version.
matcher.add('DISTRICT', None, *patterns)

In [5]:
def tlv_component(doc_):
    """Pipeline component that turns phrase-matcher hits into 'TLV_DISTRICT' entities.

    Runs the module-level `matcher` on `doc_`, wraps each match in a Span
    labelled 'TLV_DISTRICT', replaces `doc_.ents` with those spans and
    returns the same doc.
    """
    district_spans = []
    for _match_id, first, last in matcher(doc_):
        district_spans.append(Span(doc_, first, last, label='TLV_DISTRICT'))

    # Existing entities are replaced, not extended
    doc_.ents = tuple(district_spans)
    return doc_

In [6]:
# District name -> location string, looked up by the 'district_location' getter
district_loc = dict([
    ('Old Yafo', 'Southwest'),
    ('Shapira', 'South'),
    ('Ezra', 'Southeast'),
    ('Florentin', 'South'),
])

In [7]:
# Add the component to the pipeline after the 'ner' component.
# The membership check keeps the cell re-runnable: adding a component whose
# name is already in the pipeline would raise an error.
if 'tlv_component' not in nlp2.pipe_names:
    nlp2.add_pipe(tlv_component, after='ner')

In [8]:
# Inspect the pipeline: the (name, component) pairs of nlp2, in order
nlp2.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x204733c9dd8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x204735280a8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x20473528108>),
 ('tlv_component', <function __main__.tlv_component(doc_)>)]

In [9]:
# Create a document.
# Adjacent string literals are concatenated; the trailing space after "me" keeps
# "told me that" from collapsing into "told methat" across the line break.
doc2 = nlp2(
    'I stayed in Old Yafo for a couple of days and then moved to Shapira to visit my friends. They told me '
    'that in their opinion Ezra is nicer than Florentin. I disagreed.'
)

In [10]:
# Register a getter that looks up the span text in district_loc.
# .get() returns None for spans that are not known districts instead of
# raising KeyError when the extension is read on an arbitrary span.
Span.set_extension('district_location', getter=lambda span: district_loc.get(span.text), force=True)

In [14]:
# One formatted line per entity: its text, label and looked-up location
for district in doc2.ents:
    name, label, location = district.text, district.label_, district._.district_location
    print(f'DISTRICT: {name:10} | LABEL: {label:10} | LOCATION: {location}')

DISTRICT: Old Yafo   | LABEL: TLV_DISTRICT | LOCATION: Southwest
DISTRICT: Shapira    | LABEL: TLV_DISTRICT | LOCATION: South
DISTRICT: Ezra       | LABEL: TLV_DISTRICT | LOCATION: Southeast
DISTRICT: Florentin  | LABEL: TLV_DISTRICT | LOCATION: South
