## Predicting named entities in context

In [71]:
import spacy

In [72]:
# Load the small English pipeline; `nlp` is the callable used to process text below
nlp = spacy.load("en_core_web_sm")

In [73]:
# Example headline whose entities the model should predict
text = (
    "Upcoming iPhone X release date leaked "
    "as Apple reveals pre-orders"
)

In [15]:
# Process the text with the loaded pipeline; the result is kept in `doc`
doc = nlp(text)


In [16]:
# Show each entity the model predicted, followed by its label
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG


In [17]:
# Get the span for "iPhone X": tokens 1 and 2 (the slice end, 3, is exclusive)
iphone_x = doc[1:3]

In [18]:
# The entity loop above only reported "Apple", so show the span the model missed
print("Missing entity:", iphone_x.text)

Missing entity: iPhone X


## Rule-based matching

In [32]:
#Import the Matcher
from spacy.matcher import Matcher


In [33]:
# Reuse the pipeline (`nlp`) and the example sentence (`text`) defined at the
# top of the notebook instead of reloading the model and retyping the string.
doc = nlp(text)


In [34]:
# Initialize the Matcher with the pipeline's shared vocabulary
matcher = Matcher(nlp.vocab)

In [40]:
# Two-token pattern: a token whose exact text is "iPhone", then one whose text is "X"
pattern = [
    {"TEXT": "iPhone"},
    {"TEXT": "X"},
]

In [41]:
# Add the pattern to the Matcher under the name "IPHONE_X_PATTERN"
# (the pattern is passed wrapped in a list)
matcher.add("IPHONE_X_PATTERN",[pattern])

In [42]:
# Run the matcher over the doc; every match unpacks into (match_id, start, end),
# and doc[start:end] is the matched span
matches = matcher(doc)
print(
    "Matches:",
    [doc[begin:stop].text for _id, begin, stop in matches],
)

Matches: ['iPhone X']


### Writing match patterns


In [74]:
import spacy
from spacy.matcher import Matcher

In [75]:
# Load the English pipeline and build a fresh Matcher on its vocabulary,
# so this section starts without the patterns registered earlier
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [76]:
# Sample text that mentions several iOS versions for the pattern below to find
doc =nlp(
"""iOS 11 is the eleventh major release of the iOS mobile operating system developed by Apple Inc., 
being the successor to iOS 10. It was announced at the company's Worldwide Developers Conference on June 5, 2017, 
and released on September 19, 2017. It was succeeded by iOS 12 on September 17, 2018.
"""
)


In [77]:
# Pattern for full iOS versions: the exact token "iOS" followed by a token made of digits
pattern = [
    {"TEXT": "iOS"},
    {"IS_DIGIT": True},
]

In [78]:
# Add the pattern to the matcher under the name "IOS_VERSION_PATTERN",
# apply the matcher to the doc, and report how many matches were found
matcher.add("IOS_VERSION_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

Total matches found: 3


In [79]:
# Walk through the matches and print the text of each matched span
for _match_id, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print("Match found:", matched_span.text)

Match found: iOS 11
Match found: iOS 10
Match found: iOS 12


In [80]:
### Write one pattern that matches forms of "download" followed by a token with the part-of-speech tag "PROPN" (proper noun)

In [81]:
import spacy
from spacy.matcher import Matcher

# Load the English pipeline and build a fresh Matcher on its vocabulary,
# so this exercise starts without the patterns registered earlier
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [82]:
# Sample text containing several forms of "download" for the pattern below to find
doc =nlp(
"""These links download GIMP installer for Windows (~200 MB). The installer contains both 32-bit and 64-bit versions of GIMP, and will automatically use the appropriate one.
BitTorrent is a peer-to-peer file sharing system. It works by downloading GIMP from a distributed network of BitTorrent users, and may improve download speed dramatically. 
Choosing this option will download file for the GIMP downloader. 
You may need to install a torrent client to make use of this file.
"""
)

In [83]:
# Pattern: any token whose lemma is "download", followed by a proper noun
pattern = [
    {"LEMMA": "download"},
    {"POS": "PROPN"},
]

In [84]:
# Register the pattern under the name "DOWNLOAD_THINGS_PATTERN", run the
# matcher over the doc, and report how many matches came back
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

Total matches found: 2


In [85]:
# Walk through the matches and print the text of each matched span
for _match_id, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print("Match found:", matched_span.text)

Match found: download GIMP
Match found: downloading GIMP


### Shared vocab and string store

In [86]:
#vocab stores data shared across multiple documents
#to save memory spaCy encodes all strings to hash values
#strings are stored only once in the StringStore via nlp.vocab.strings
#string store: lookup table in both directions

In [87]:
# The string store works in both directions:
# look up the hash for a string, then the string for that hash
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]

In [88]:
#hashes can't be reversed, that's why we need to provide the shared vocab

In [102]:
# Looking up a hash the string store has never seen raises a KeyError.
# Catch it and print it so the cell still demonstrates the error
# without stopping a top-to-bottom run of the notebook.
try:
    string = nlp.vocab.strings[324134665]
except KeyError as err:
    # str() of a KeyError is the repr of its message, so it prints quoted
    print("KeyError:", err)

KeyError: "[E018] Can't retrieve string for hash '324134665'. This usually refers to an issue with the `Vocab` or `StringStore`."

### Shared vocab and string store

In [103]:
#look up the string and its hash in nlp.vocab.strings

In [105]:
# Process a text containing "coffee", then look up the hash of that string
doc = nlp("I love coffee")
print("hash value:", nlp.vocab.strings["coffee"])


hash value: 3197928453018144401


In [107]:
# Reverse lookup: the integer is the hash value printed by the previous cell
print("string value:", nlp.vocab.strings[3197928453018144401])

string value: coffee


### Lexemes: entries in the vocabulary

In [112]:
# A Lexeme object is an entry in the vocabulary; get one by looking up a string in nlp.vocab

doc = nlp("I love coffee")
lexeme = nlp.vocab["coffee"]

In [113]:
# Print lexical attributes: the word text, its hash (orth), and whether it is alphabetic
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


In [None]:
#contains context-independent information about a word
#word text: lexeme.text and lexeme.orth (the hash)
#lexeme attributes like lexeme.is_alpha