In [58]:
import spacy

nlp=spacy.load("en_core_web_sm")

data=nlp("When you look into your mother's eyes, you know that is the purest love you can find on this earth.")

# Matching patterns

In [59]:
from spacy.matcher import Matcher
matcher=Matcher(nlp.vocab)

pattern=[{"TEXT":"purest"},{"TEXT":"love"}]
matcher.add("parent",None,pattern)

matches=matcher(data)
matches

[(125140298697823262, 14, 16)]

In [60]:
data[14:16]


purest love

In [96]:


matcher=Matcher(nlp.vocab)


pattern1=[{"IS_DIGIT":"True"},{"LOWER":"fifa"},{"LOWER":"world"},{"LOWER":"cup"},{"IS_PUNCT":"True"}]
matcher.add("fifa",None,pattern1)

doc1=nlp("2020 fifa world cup:who will win?")

matches1=matcher(doc1)

print(matches1)

[]


In [97]:


matcher=Matcher(nlp.vocab)

pattern2=[{"LEMMA":"love"},{"POS":"VERB"},{"POS":"NOUN"}]

matcher.add("love",None,pattern2)

doc2=nlp("i loved dogs but now i love cats more")

matches2=matcher(doc2)

print(matches2)

[]


# Data Structures:
## Vocab,Lexemes and StringStore

### Shared vocab and string store

In [61]:
love_hash=data.vocab.strings["love"]  #nlp.vocab.string["love"]
print("hash value:",love_hash)


love_string=data.vocab.strings[love_hash]
print("string value:",love_string)

hash value: 3702023516439754181
string value: love


### Lexemes

In [62]:
lexem=data.vocab["love"]  #nlp.vocab["love"]
print(lexem.text,lexem.orth,lexem.is_alpha)  #lexem.orth=hash value

love 3702023516439754181 True


## Doc,Span and Token

In [63]:
from spacy.tokens import Doc
words=["eye","love","earth"]
spaces=["True","False","False"]

doc=Doc(data.vocab,words=words,spaces=spaces)
doc

eye love earth 

In [64]:
from spacy.tokens import Span
span=Span(doc,0,2)
#add label with span data
span_label=Span(doc,0,2, label="parents_love")
span_label

eye love

# Word vectors and semantic similarity

In [87]:
nlp1=spacy.load("en_core_web_md")
data1=nlp1("We never know the love of a parent till we become parents ourselves.")
print(data.similarity(data1))

OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
token=nlp("parent")
print(data.similarity(token))

# Combining models and rules

## PhraseMatcher

In [65]:
from spacy.matcher import PhraseMatcher
pmatcher=PhraseMatcher(nlp.vocab)
pattern2=nlp("purest love")
pmatcher.add("love",None,pattern2)
phrase_matcher=pmatcher(data)
phrase_matcher


[(3702023516439754181, 14, 16)]

In [66]:
data[14:16]

purest love

# Processing pipelines

In [67]:
print(nlp.pipe_names)
print(nlp.pipeline)

['tagger', 'parser', 'ner']
[('tagger', <spacy.pipeline.pipes.Tagger object at 0x000001BDF75934C8>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x000001BDF75A4B88>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x000001BDF75A4BE8>)]


# Custom pipeline components

In [68]:
def custom_components(doc):
    print("Document length:",len(doc))
    return doc

nlp.add_pipe(custom_components,first=True)
nlp("i love my parents")
#print("pipeline:",nlp.pipe_names)

Document length: 4


i love my parents

In [69]:
print("pipeline:",nlp.pipe_names)

pipeline: ['custom_components', 'tagger', 'parser', 'ner']


# Extension attributes

## Attribute extension

In [71]:
from spacy.tokens import Token
Token.set_extension("is_color",defult=False)
data[5]._.is_color=True

ValueError: [E083] Error setting extension: only one of `default`, `method`, or `getter` (plus optional `setter`) is allowed. Got: 0

## Property extension

In [170]:
def get_is_color1(token):
    color=["red","Yellow","blue"]
    return token.text in color

Token.set_extension("is_color", getter=get_is_color,force=True)
print(data[5]._.is_color,"-",data[5].text)

False - mother


In [169]:
def get_has_color(token):
    color=["red","Yellow","blue"]
    return any(token.text in color for token in span)

Span.set_extension("has_color",getter=get_has_color,force=True)

data1=nlp("the sky is blue")
print(data1[1:4]._.has_color,"-",data1[1:4].text)


False - sky is blue


## Method extension

In [168]:
def has_token(data2,token_text):
    in_doc=token_text in[token.text for token in doc]
    return in_doc

Doc.set_extension("has_token",method=has_token,force=True)
data2=nlp("the sky is blue")
print(data2._.has_token("blue"),"- blue")

False - blue


# Scaling and performance

In [72]:
data=[("i love my parents",{"id":1,"page_num":20}),("and i always stay with them",{"id":2,"page_num":30})]
for doc,context in nlp.pipe(data,as_tuples=True):
    print(doc.text,context["page_num"])

Document length: 4
Document length: 6
i love my parents 20
and i always stay with them 30


In [181]:
doc=nlp.make_doc("hello world")
doc

hello world

In [182]:
with nlp.disable_pipes("tagger","parser"):  #disable unwanted pipelines
    doc=nlp("when you look")
    #print(doc.ents)

ValueError: [E001] No component 'tagger' found in pipeline. Available names: []

# Training and updating models

# The training loop

In [184]:
training_data=[("how to preorder the iphone x",{"entities":[(29,28,"GADGET")]})]
for i in range(10):
    random.shuffle(training_data)
    for batch in spacy.util.minibatch(training_data):
        texts=[text for text,annotation in batch]
        annotations=[annotation for text,annotation in batch]
        nlp.update(texts,annotations)
nlp.to_disk(path_to_model)

NameError: name 'random' is not defined

# Best Practice

## Overfitting leads to forgetting data
* avoid bias in training data

## Local context may not be enough to make decision
* make consistent and not too specific label scheme
* plan schemes carefully