In [1]:
import spacy

nlp=spacy.load("en_core_web_sm")

data=nlp("When you look into your mother's eyes, you know that is the purest love you can find on this earth.")

# Matching patterns

In [2]:
from spacy.matcher import Matcher
matcher=Matcher(nlp.vocab)

pattern=[{"TEXT":"purest"},{"TEXT":"love"}]
matcher.add("parent",None,pattern)

matches=matcher(data)
matches

[(125140298697823262, 14, 16)]

In [3]:
data[14:16]


purest love

# Data Structures:
## Vocab,Lexemes and StringStore

### Shared vocab and string store

In [4]:
love_hash=data.vocab.strings["love"]  #nlp.vocab.string["love"]
print("hash value:",love_hash)


love_string=data.vocab.strings[love_hash]
print("string value:",love_string)

hash value: 3702023516439754181
string value: love


### Lexemes

In [5]:
lexem=data.vocab["love"]  #nlp.vocab["love"]
print(lexem.text,lexem.orth,lexem.is_alpha)  #lexem.orth=hash value

love 3702023516439754181 True


## Doc,Span and Token

In [6]:
from spacy.tokens import Doc
words=["eye","love","earth"]
spaces=["True","False","False"]

doc=Doc(data.vocab,words=words,spaces=spaces)
doc

eye love earth 

In [7]:
from spacy.tokens import Span
span=Span(doc,0,2)
#add label with span data
span_label=Span(doc,0,2, label="parents_love")
span_label

eye love

# Combining models and rules

## PhraseMatcher

In [8]:
from spacy.matcher import PhraseMatcher
pmatcher=PhraseMatcher(nlp.vocab)
pattern2=nlp("purest love")
pmatcher.add("love",None,pattern2)
phrase_matcher=pmatcher(data)
phrase_matcher


[(3702023516439754181, 14, 16)]

In [9]:
data[14:16]

purest love

# Processing pipelines

In [10]:
print(nlp.pipe_names)
print(nlp.pipeline)

['tagger', 'parser', 'ner']
[('tagger', <spacy.pipeline.pipes.Tagger object at 0x000001F8A87F8208>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x000001F8A86D8CA8>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x000001F8A86D8D68>)]


# Custom pipeline components

In [11]:
def custom_components(doc):
    print("Document length:",len(doc))
    return doc

nlp.add_pipe(custom_components,first=True)
nlp("i love my parents")
#print("pipeline:",nlp.pipe_names)

Document length: 4


i love my parents

In [12]:
print("pipeline:",nlp.pipe_names)

pipeline: ['custom_components', 'tagger', 'parser', 'ner']


# Extension attributes

## Attribute extension

In [13]:
from spacy.tokens import Token
Token.set_extension("is_color",default=False)
print(data[5]._.is_color,data[5].text)

False mother


## Property extension

In [15]:
def get_is_color(token):
    color=["red","Yellow","blue"]
    return token.text in color

Token.set_extension("is_color", getter=get_is_color,force=True)
print(data[5]._.is_color,"-",data[5].text)

False - mother


## Method extension

In [16]:
def has_token(data2,token_text):
    in_doc=token_text in[token.text for token in doc]
    return in_doc

Doc.set_extension("has_token",method=has_token,force=True)
data2=nlp("the sky is blue")
print(data2._.has_token("blue"),"- blue")

Document length: 4
False - blue


# Scaling and performance

In [17]:
data=[("i love my parents",{"id":1,"page_num":20}),("and i always stay with them",{"id":2,"page_num":30})]
for doc,context in nlp.pipe(data,as_tuples=True):
    print(doc.text,context["page_num"])

Document length: 4
Document length: 6
i love my parents 20
and i always stay with them 30


In [18]:
doc=nlp.make_doc("hello world")
doc

hello world

In [19]:
with nlp.disable_pipes("tagger","parser"):  #disable unwanted pipelines
    doc=nlp("when you look")
    #print(doc.ents)

Document length: 3


# Best Practice

## Overfitting leads to forgetting data
* avoid bias in training data

## Local context may not be enough to make decision
* make consistent and not too specific label scheme
* plan schemes carefully