In [7]:
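# One-time setup: uncomment and run to download the medium English pipeline,
# which the spacy.load("en_core_web_md") cell further down requires.
#!python -m spacy download en_core_web_md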

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     --- ------------------------------------ 2.6/33.5 MB 15.1 MB/s eta 0:00:03
     -------- ------------------------------- 7.3/33.5 MB 18.9 MB/s eta 0:00:02
     ------------- ------------------------- 11.5/33.5 MB 19.0 MB/s eta 0:00:02
     --------------------- ----------------- 18.1/33.5 MB 22.0 MB/s eta 0:00:01
     -------------------------- ------------ 22.8/33.5 MB 21.9 MB/s eta 0:00:01
     ---------------------------------- ---- 29.6/33.5 MB 23.5 MB/s eta 0:00:01
     --------------------------------------  33.3/33.5 MB 24.0 MB/s eta 0:00:01
     --------------------------------------- 33.5/33.5 MB 22.1 MB/s eta 0:00:00
Installing collected packages: en-

In [1]:
import spacy

KeyboardInterrupt: 

In [None]:
# A Lexeme object is an entry in the vocabulary.
# It contains the context-independent information about a word:
#   - Word text: lexeme.text and lexeme.orth (the hash)
#   - Lexical attributes like lexeme.is_alpha
# It does NOT contain context-dependent part-of-speech tags, dependencies or entity labels.

# The Doc contains words in context – in this case, the tokens "I", "love" and "coffee" with their part-of-speech tags and dependencies.
# Each token refers to a lexeme, which knows the word's hash ID. To get the string representation of the word, spaCy looks up the hash in the 
# string store.

In [3]:
# Blank English pipeline; processing a text adds its words to the shared string store
nlp = spacy.blank("en")
doc = nlp("I have a cat")

# The string store can be queried in both directions
string_store = nlp.vocab.strings

# String -> hash
cat_hash = string_store["cat"]
print(cat_hash)

# Hash -> string
cat_string = string_store[cat_hash]
print(cat_string)

5439657043933447811
cat


In [4]:
# Data structures: the Doc and its views, Token and Span
# The Doc class takes three arguments: the shared vocab, the words and the spaces
# A Span is a slice of a doc consisting of one or more tokens.
# A Span takes three arguments: the doc it refers to, and the start and end index of the span.
# Remember that the end index is exclusive!

import spacy
from spacy.tokens import Doc, Span

# Create an nlp object
nlp = spacy.blank("en")

# The words and spaces to create the doc from
# (spaces[i] tells whether words[i] is followed by a space)
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

# Create a span manually (tokens 0 and 1; the end index 2 is exclusive)
span = Span(doc, 0, 2)

# Create a span with a label
span_with_label = Span(doc, 0, 2, label="GREETING")

# Add span to the doc.ents
doc.ents = [span_with_label]

# Best Practices
* Doc and Span are very powerful and hold references and relationships of words and sentences
* Convert result to strings as late as possible
* Use token attributes if available – for example, token.i for the token index
* Don't forget to pass in the shared vocab

In [5]:
# Find proper nouns that are directly followed by a verb.
#
# Part-of-speech tags are only predicted by a trained pipeline; with the blank
# pipeline created earlier in the notebook token.pos_ is always empty, so load one here.
nlp = spacy.load("en_core_web_md")
doc = nlp("Berlin looks like a nice city")

# Pair every token with its successor; zip stops at the shorter sequence,
# so the last token (which has no successor) is skipped without index checks.
for token, next_token in zip(doc, doc[1:]):
    # token.pos_ is already a string, so compare it to the label directly
    # instead of looking up a hash in the string store.
    if token.pos_ == "PROPN" and next_token.pos_ == "VERB":
        # token.text gives the string; results are converted to text as late as possible
        print("Found proper noun before a verb:", token.text)

In [10]:
# Word vectors and semantic similarity
# Doc, Span and Token objects all expose .similarity(other), which returns a floating
# point score (typically between 0 and 1) indicating how similar the two objects are.
# Any of the three kinds of object can be compared with any other.
# Similarity needs a larger spaCy pipeline that includes word vectors: the medium or
# large English pipelines, whose names end in "md" or "lg".

# Pipeline with word vectors
nlp = spacy.load("en_core_web_md")

# Document vs. document
doc1, doc2 = nlp("I like fast food"), nlp("I like pizza")
print(doc1.similarity(doc2))

# Token vs. token: "pizza" (index 2) against "pasta" (index 4)
doc = nlp("I like pizza and pasta")
token1, token2 = doc[2], doc[4]
print(token1.similarity(token2))

# Document vs. token
doc = nlp("I like pizza")
token = nlp("soap")[0]
print(doc.similarity(token))

# Span vs. document: "pizza and pasta" against a whole sentence
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")
print(span.similarity(doc))

0.8382381200790405
1.0
0.2274085134267807
0.5528544783592224


## How does spaCy predict similarity?
* Similarity is determined using word vectors
* Multi-dimensional meaning representations of words
* Generated using an algorithm like Word2Vec and lots of text
* Can be added to spaCy's pipelines
* Default: cosine similarity, but can be adjusted
* Doc and Span vectors default to average of token vectors
* Short phrases are better than long documents with many irrelevant words

In [11]:
# Every token exposes its word vector through token.vector;
# show only the first 10 dimensions of the vector for "banana" (token index 3)
doc = nlp("I have a banana")
banana = doc[3]
print(banana.vector[:10])

[-0.6334    0.18981  -0.53544  -0.52658  -0.30001   0.30559  -0.49303
  0.14636   0.012273  0.96802 ]


In [12]:
# Caveat: these two sentences express opposite sentiments, yet the similarity score
# is very high - vector similarity is not the same thing as agreement in meaning.
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

score = doc1.similarity(doc2)
print(score)

1.0


# Combining predictions from statistical models with rule-based systems 
* Statistical models:
  * Use cases: application needs to generalize based on examples
  * Real-world examples: product names, person names, subject/object relationships
  * spaCy features: entity recognizer, dependency parser, part-of-speech tagger
* Rule-based systems:
  * Use cases: dictionary with finite number of examples
  * Real-world examples: countries of the world, cities, drug names, dog breeds
  * spaCy features: tokenizer, Matcher, PhraseMatcher

In [None]:
# Rule-Based Systems
# Matcher is not imported anywhere else in the notebook, so import it here
# to keep this cell runnable on a fresh kernel.
from spacy.matcher import Matcher

# Initialize with the shared vocab
matcher = Matcher(nlp.vocab)

# Patterns are lists of dictionaries describing the tokens; LOWER is case insensitive
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}]
matcher.add("LOVE_CATS", [pattern])

# Operators can specify how often a token should be matched ("+" = one or more times)
pattern = [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}]
matcher.add("VERY_HAPPY", [pattern])

# Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

# Combining the matcher with statistical model predictions:
# the match gives the span, the model gives its syntax and POS tags
matcher = Matcher(nlp.vocab)
matcher.add("DOG", [[{"LOWER": "golden"}, {"LOWER": "retriever"}]])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)
    # Get the span's root token and root head token
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    # Get the previous token and its POS tag.
    # Guard against a match at the very start of the doc: doc[-1] would
    # silently return the LAST token instead of a preceding one.
    if start > 0:
        previous_token = doc[start - 1]
        print("Previous token:", previous_token.text, previous_token.pos_)

# PhraseMatcher 
* Like regular expressions or keyword search – with access to the tokens!
* Takes Doc object as pattern
* More efficient and faster than the Matcher
* Great for matching large word lists

In [None]:
from spacy.matcher import PhraseMatcher

# A PhraseMatcher takes Doc objects as patterns instead of lists of token dictionaries
matcher = PhraseMatcher(nlp.vocab)
pattern = nlp("Golden Retriever")
matcher.add("DOG", [pattern])

doc = nlp("I have a Golden Retriever")

# Each match is a (match_id, start, end) tuple; slicing the doc yields the matched span
for _, start, end in matcher(doc):
    print("Matched span:", doc[start:end].text)

In [None]:
# Exact matching against a list of country names stored in a JSON file
import json

import spacy
from spacy.matcher import PhraseMatcher

# Read the country names
with open("exercises/en/countries.json", encoding="utf8") as f:
    COUNTRIES = json.load(f)

nlp = spacy.blank("en")
doc = nlp("Czech Republic may help Slovakia protect its airspace")

# Build one pattern Doc per country name and register them all under one rule.
# nlp.pipe is the faster equivalent of [nlp(country) for country in COUNTRIES]
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", patterns)

# Run the matcher on the test document and show the matched spans
matches = matcher(doc)
matched_spans = [doc[start:end] for _, start, end in matches]
print(matched_spans)

In [None]:
# Walk over the entries of the vocabulary.
# A Lexeme is a single vocabulary entry and cannot be sliced, so print its
# context-independent attributes (text and hash) instead of lexeme[1:2].
for lexeme in nlp.vocab:
    print(lexeme.text, lexeme.orth)

In [None]:
# For each country name matched, print the head of the span's root token and the
# country name. This helps understand the syntactic relationship in the sentence.
import json

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans

# Load data
with open("exercises/en/countries.json", encoding="utf8") as f:
    COUNTRIES = json.load(f)
with open("exercises/en/country_text.txt", encoding="utf8") as f:
    TEXT = f.read()

# Load language model
nlp = spacy.load("en_core_web_sm")
# Initialize spaCy tool for matching phrases
matcher = PhraseMatcher(nlp.vocab)
# nlp.pipe processes the COUNTRIES list into pattern Docs
patterns = list(nlp.pipe(COUNTRIES))

matcher.add("COUNTRY", patterns)

# Create a doc; the entities predicted by the model are replaced below
doc = nlp(TEXT)

# Iterate over the matches, matcher applied to doc
# start and end: indices of the matched tokens in doc
country_spans = []
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")
    country_spans.append(span)

    # Get the span's root head token
    # This token is the syntactic "head" of the span (e.g., the main verb or noun it's linked to).
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    # This provides context for how the country name is used in the sentence.
    print(span_root_head.text, "-->", span.text)

# Overwrite doc.ents once, after the loop, instead of rebuilding it on every match.
# doc.ents rejects overlapping spans, so filter_spans first drops duplicates and
# overlaps (preferring the longest span) to avoid a ValueError on overlapping matches.
doc.ents = filter_spans(country_spans)

# Print the entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])