In [None]:
# All pipeline packages you can load into spaCy include several files and a config.cfg
# Built-in components need binary data to make predictions
# Pipeline attributes:
    # nlp.pipe_names - list pipeline component names
    # nlp.pipeline - list of (name, component) tuples
# The tokenizer turns a string of text into a Doc object. spaCy then applies every component in the pipeline on document, in order.

In [1]:
import spacy

# Load the en_core_web_sm pipeline
nlp = spacy.load('en_core_web_sm')

# Print the names of the pipeline components
print(nlp.pipe_names)

# Print the full pipeline of (name, component) tuples
print(nlp.pipeline)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x0000029B89CCF830>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x0000029B89B9ABD0>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x0000029B89BCA570>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x0000029B89E96A10>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x0000029B89E79C10>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x0000029B89BCA1F0>)]


# Spacy Custom Components:

In [2]:
# Make a function execute automatically when you call nlp
# Add your own metadata to documents and tokens
# Updating built-in attributes like doc.ents

# Custom component anatomy:
# Function that takes a doc, modifies it and returns it
# Registered using the Language.component decorator
# Can be added using the nlp.add_pipe method, takes two parameters, 
    # "component name"
    # position in the pipeline last | first boolean OR before | after "name of existing component"

In [3]:
from spacy.language import Language

# Create the nlp object
nlp = spacy.load("en_core_web_sm")

# Define a custom component
@Language.component("custom_component")
def custom_component_function(doc):
    # Print the doc's length
    print("Doc length:", len(doc))
    # Return the doc object
    return doc

# Add the component first in the pipeline
nlp.add_pipe("custom_component", first=True)

# Print the pipeline component names
print("Pipeline:", nlp.pipe_names)

# Process a text
doc = nlp("Hello world!")

Pipeline: ['custom_component', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [7]:
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print(f'animal patterns: {animal_patterns}')
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", animal_patterns)

# Define the custom component
@Language.component("animal_component")
def animal_component_function(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe("animal_component", after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


# Custom Attributes

In [8]:
# Add to the Doc, Token and Span objects to store custom data
# doc._.title = "My document"
# token._.is_color = True
# span._.has_color = False

# Registered on the global Doc, Token or Span using the set_extension method
# Import global classes
from spacy.tokens import Doc, Token, Span
# Set extensions on the Doc, Token and Span
# Doc.set_extension("title", default=None)
# Token.set_extension("is_color", default=False)
# Span.set_extension("has_color", default=False)

# There are three types of extensions: attribute extensions, property extensions and method extensions.
# 1. attribute extensions
# Set extension on the Token with default value
Token.set_extension("is_color", default=False)
doc = nlp("The sky is blue.")
Overwrite extension attribute value
doc[3]._.is_color = True

# 2. property extensions
# Define a getter and an optional setter function
# Getter only called when you retrieve the attribute value
# Define getter function
def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors

# Set extension on the Token with getter
Token.set_extension("is_color", getter=get_is_color)

doc = nlp("The sky is blue.")
print(doc[3]._.is_color, "-", doc[3].text)

# Span extensions should almost always use a getter
# Define getter function
def get_has_color(span):
    colors = ["red", "yellow", "blue"]
    return any(token.text in colors for token in span)

# Set extension on the Span with getter
Span.set_extension("has_color", getter=get_has_color)
doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)

# 3. Method extensions
# Assign a function that becomes available as an object method
# Lets you pass arguments to the extension function
# Define method with arguments, first arg is always the object itself
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc

# Set extension on the Doc with method
Doc.set_extension("has_token", method=has_token)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")

In [None]:
nlp = spacy.blank("en")

# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]


# Register the Token property extension "reversed" with the getter get_reversed
Token.set_extension("reversed", getter=get_reversed)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)

In [None]:
nlp = spacy.blank("en")

# Define the getter function
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)


# Register the Doc property extension "has_number" with the getter get_has_number
Doc.set_extension("has_number", getter=get_has_number)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number:", doc._.has_number)

In [None]:
nlp = spacy.blank("en")

# Define the method
def to_html(span, tag):
    # Wrap the span text in a HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register the Span method extension "to_html" with the method to_html
Span.set_extension("to_html", method=to_html)

# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))

In [3]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using the getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

ValueError: [E090] Extension 'wikipedia_url' already exists on Span. To overwrite the existing extension, set `force=True` on `Span.set_extension`.

In [None]:
import json
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/en/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

# Reads a JSON file mapping country names to their capitals (a dictionary).
with open("exercises/en/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

# Creates a blank spaCy NLP pipeline for English, which starts with no components.
nlp = spacy.blank("en")
# PhraseMatcher is created to detect country names in text.
matcher = PhraseMatcher(nlp.vocab)
# Converts each country name into a Doc object for efficient matching.
# associates the name "COUNTRY" with these patterns
matcher.add("COUNTRY", list(nlp.pipe(COUNTRIES)))

# custom pipeline component 
@Language.component("countries_component")
def countries_component_function(doc):
  	# Use the matcher to find country names in a document.
    matches = matcher(doc)
    # Create an entity Span with the label "GPE" for all matches
    # Add these spans to doc.ents (the list of recognized entities in the document)
    doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
nlp.add_pipe("countries_component")
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# getter is defined for the capital attribute using a lambda function
# Register the Span extension attribute "capital" with the getter get_capital
# Whenever the capital attribute is accessed on a Span, it looks up the span's text in 
# the CAPITALS dictionary and returns the corresponding value
Span.set_extension("capital", getter=get_capital)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

# Scaling & Performance

In [None]:
# 1. Process large volumes of text
# Processes texts as a stream, yields Doc objects
docs = list(nlp.pipe(LOTS_OF_TEXTS))
# Setting as_tuples=True on nlp.pipe lets you pass in (text, context) tuples
# Yields (doc, context) tuples
# Useful for associating metadata with the doc
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["page_number"])

# add metadata as custom attribute 
from spacy.tokens import Doc

Doc.set_extension("id", default=None)
Doc.set_extension("page_number", default=None)

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]

# 2. Use only the tokenizer
# Use nlp.make_doc to turn a text into a Doc object
doc = nlp.make_doc("Hello world!")
# Disable tagger and parser
with nlp.select_pipes(disable=["tagger", "parser"]):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)
# Restores them after the with block
# Only runs the remaining components

In [None]:
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the adjectives
for doc in nlp.pipe(TEXTS):
    print([token.text for token in doc if token.pos_ == "ADJ"])

In [None]:
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the entities
docs = list(nlp.pipe(TEXTS))
entities = [doc.ents for doc in docs]
print(*entities)

In [None]:
# Using custom attributes to add author and book meta information to quotes.
# List of [text, context] examples is available as the variable DATA. 
# The texts are quotes from famous books, and the contexts dictionaries with the keys "author" and "book".
# Use the set_extension method to register the custom attributes "author" and "book" on the Doc, which default to None.
# Process the [text, context] pairs in DATA using nlp.pipe with as_tuples=True.
# Overwrite the doc._.book and doc._.author with the respective info passed in as the context.

import json
import spacy
from spacy.tokens import Doc

with open("exercises/en/bookquotes.json", encoding="utf8") as f:
    DATA = json.loads(f.read())

nlp = spacy.blank("en")

# Register the Doc extension "author" (default None)
Doc.set_extension("author", default=None)

# Register the Doc extension "book" (default None)
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuple=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and custom attribute data
    print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")

In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Only tokenize the text
doc = nlp.make_doc(text)
print([token.text for token in doc])

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Disable the tagger and lemmatizer
with nlp.select_pipes(disable=["tagger", "lemmatizer"]):
    # Process the text
    doc = nlp(text)
    # Print the entities in the doc
    print(doc.ents)

(American, College Park, Georgia)
