# In-Depth Tutorial to spaCy For Beginners
![](images/pexels.jpg)
<figcaption style="text-align: center;">
    <strong>
        Photo by 
        <a href='https://www.pexels.com/photo/red-blue-and-white-fluid-abstract-painting-2317710/'>Anni Roenkae</a>
    </strong>
</figcaption>


### Introduction

### What is spaCy?

### Basics of spaCy

```bash
pip install -U pip setuptools wheel
pip install -U spacy
```

After installing the core library with the above functions...

```bash
python -m spacy download en_core_web_sm  # 12 MB
python -m spacy download en_core_web_md  # 31 MB
python -m spacy download en_core_web_lg  # 382 MB
```

In [1]:
import spacy

txt = "The tallest living man is 37-year-old Sultan Kosen, from Turkey, who is 8 feet, 2.8 inches, who set the record in 2009."

# Create the Language object
nlp = spacy.load("en_core_web_sm")

# Create the Doc object
doc = nlp(txt)


for token in doc[:5]:
    print(token)

The
tallest
living
man
is


In [2]:
type(token)

spacy.tokens.token.Token

In [3]:
len(doc)

31

In [4]:
print("Index: ", [token.i for token in doc[3:10]])
print("Text: ", [token.text for token in doc[3:10]])
print("is_alpha:", [token.is_alpha for token in doc[3:10]])
print("is_punct:", [token.is_punct for token in doc[3:10]])
print("like_num:", [token.like_num for token in doc[3:10]])
print("Base word:", [token.lemma_ for token in doc[3:10]])

Index:  [3, 4, 5, 6, 7, 8, 9]
Text:  ['man', 'is', '37', '-', 'year', '-', 'old']
is_alpha: [True, True, False, False, True, False, True]
is_punct: [False, False, False, True, False, True, False]
like_num: [False, False, True, False, False, False, False]
Base word: ['man', 'be', '37', '-', 'year', '-', 'old']


In [5]:
for ent in doc.ents:
    print(ent.text, "-->", ent.label_)

37-year-old --> DATE
Sultan Kosen --> PERSON
Turkey --> GPE
8 feet --> QUANTITY
2.8 inches --> QUANTITY
2009 --> DATE


### Architecture and core data structures

In [6]:
import spacy

nlp = spacy.load("en_core_web_md")

type(nlp)

spacy.lang.en.English

```python
nlp = spacy.load("es_core_news_sm")  # Spanish
nlp = spacy.load("ru_core_news_sm")  # Russian
nlp = spacy.load("zh_core_web_sm")  # Chinese
nlp = spacy.load("de_core_news_sm")  # German
```

In [7]:
from spacy.lang.en import English
from spacy.lang.es import Spanish

nlp = English()

type(nlp)

spacy.lang.en.English

In [8]:
type(nlp.vocab)

spacy.vocab.Vocab

In [9]:
txt = """The original name for the search engine Google was Backrub. 
         It was renamed Google after the googol, 
         which is the number one followed by 100 zeros."""

doc = nlp(txt)

In [10]:
type(doc.vocab)

spacy.vocab.Vocab

In [11]:
nlp.vocab.strings["google"]

1988622737398120358

In [12]:
nlp.vocab.strings[1988622737398120358]

'google'

In [13]:
lexeme = nlp.vocab["google"]

type(lexeme)

spacy.lexeme.Lexeme

In [14]:
print(lexeme.text, lexeme.orth, lexeme.is_digit)

google 1988622737398120358 False


In [15]:
txt = """Mosquitoes are the deadliest animal in the world: 
         They kill more people than any other creature, 
         due to the diseases they carry."""

doc = nlp(txt)

type(doc)

spacy.tokens.doc.Doc

In [16]:
doc.text

'Mosquitoes are the deadliest animal in the world: \n         They kill more people than any other creature, \n         due to the diseases they carry.'

In [17]:
len(doc)

27

In [18]:
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

words = ["I", "love", "Barcelona", "!"]
spaces = [True, True, False, False]

# Create the doc object manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [19]:
len(doc)

4

In [20]:
doc.text

'I love Barcelona!'

In [21]:
txt = """The hardest working muscle in your body is your heart: 
         It pumps more than 2,000 gallons of blood a day
         and beats more than 2.5 billion times in a 70-year life span."""

doc = nlp(txt)

span = doc[:10]

type(span)

spacy.tokens.span.Span

In [22]:
print(span.text)
print(span.start, span.end)

The hardest working muscle in your body is your heart
0 10


In [23]:
from spacy.tokens import Span

span = Span(doc, 0, 10)
span.text

'The hardest working muscle in your body is your heart'

### Predicting named entities (NER)

In [24]:
txt = """Cleopatra wasn’t actually Egyptian! 
         As far as historians can tell, Egypt’s 
         famous femme fatal was actually Greek!. 
         She was a descendant of Alexander the Great’s
         Macedonian general Ptolemy"""

nlp = spacy.load("en_core_web_md")

doc = nlp(txt)

In [25]:
for ent in doc.ents:
    print(f"{ent.text:<20}{ent.label_:<20}")

Cleopatra           PERSON              
Egyptian            NORP                
Egypt               GPE                 
Greek               NORP                
Macedonian          NORP                
Ptolemy             PERSON              


In [26]:
from spacy import displacy

displacy.render(doc, style="ent")

In [27]:
spacy.explain("GPE")

'Countries, cities, states'

In [28]:
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [29]:
from spacy.tokens import Span

alexander = Span(doc, 31, 34, label="PERSON")

doc.ents = list(doc.ents) + [alexander]

In [30]:
displacy.render(doc, style="ent")

### Predicting part-of-speech (POS) tags and syntactic dependencies

In [31]:
txt = "The first footprints on the moon will remain there for a million years"

doc = nlp(txt)

for token in doc:
    print(f"{token.text:<20} {token.pos_:<20} {token.dep_:<20}")

The                  DET                  det                 
first                ADJ                  amod                
footprints           NOUN                 nsubj               
on                   ADP                  prep                
the                  DET                  det                 
moon                 NOUN                 pobj                
will                 AUX                  aux                 
remain               VERB                 ROOT                
there                ADV                  advmod              
for                  ADP                  prep                
a                    DET                  quantmod            
million              NUM                  nummod              
years                NOUN                 pobj                


In [32]:
pos_tags = ["DET", "AUX", "ADP"]
dep_tags = ["amod", "nsubj", "nummod"]

for pos in pos_tags:
    print(pos, "-->", spacy.explain(pos))

for dep in dep_tags:
    print(dep, "-->", spacy.explain(dep))

DET --> determiner
AUX --> auxiliary
ADP --> adposition
amod --> adjectival modifier
nsubj --> nominal subject
nummod --> numeric modifier


In [33]:
displacy.render(doc, style="dep")

In [34]:
txt = """The teddy bear is named after President Theodore Roosevelt. 
         After he refused to shoot a captured black bear on a hunt, 
         a stuffed-animal maker decided to create
         a bear and name it after the president."""

doc = nlp(txt)

for chunk in doc.noun_chunks:
    print(chunk.text)

The teddy bear
President Theodore Roosevelt
he
a captured black bear
a hunt
a stuffed-animal maker
a bear
it
the president


### Custom rule-based tokenization

In [35]:
txt = """Cleopatra wasn’t actually Egyptian! 
         As far as historians can tell, Egypt’s 
         famous femme fatal was actually Greek!. 
         She was a descendant of Alexander the Great’s
         Macedonian general Ptolemy"""

# Create a pattern
pattern = [
    {"IS_ALPHA": True, "IS_TITLE": True},
    {"IS_STOP": True},
    {"IS_ALPHA": True, "IS_TITLE": True},
]

In [36]:
from spacy.matcher import Matcher

# Init the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
matcher.add("TITLED_PERSON", [pattern])

# Process the text
doc = nlp(txt)

# Find all matches
matches = matcher(doc)

# Iterate over matches
for match_id, start, end in matches:
    # Get the span
    span = doc[start:end]
    print(span.text)

Alexander the Great


In [37]:
pattern = [
    {"TEXT": {"REGEX": "[a-zA-Z]+"}},
    {"IS_DIGIT": True, "OP": "?"},  # Match one or more times
    {"DEP": "quantmod"},  # Match based on dependency
    # etc.
]

### Word vectors and semantic similarity

In [38]:
nlp = spacy.load("en_core_web_md")

doc1 = nlp("What a lukeworm sentiment.")
doc2 = nlp("What a short sentence.")

doc1.similarity(doc2)

0.9200780919749721

In [39]:
doc1[-1].similarity(doc2[2])

0.5222234129905701

In [40]:
doc1[0:2].similarity(doc[3])

0.8700238466262817

In [41]:
array = doc1[0].vector

array.shape

(300,)

### All about pipelines

In [42]:
nlp = spacy.load("en_core_web_sm")

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [43]:
from spacy.language import Language


@Language.component("your_component")
def your_component(doc):
    # Do something on the doc
    print(f"There are {len(doc)} tokens in this text.")

    return doc

In [44]:
nlp.add_pipe("your_component")

<function __main__.your_component(doc)>

In [45]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'your_component']

In [46]:
doc = nlp("Bird dies, but you remember the flight.")

There are 9 tokens in this text.


In [47]:
from spacy.language import Language
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")


@Language.component("titled_person")
def titled_person(doc):
    pattern = [
        {"IS_ALPHA": True, "IS_TITLE": True},
        {"IS_STOP": True},
        {"IS_ALPHA": True, "IS_TITLE": True},
    ]
    # Create the matcher
    matcher = Matcher(doc.vocab)
    # Add the pattern
    matcher.add("TITLED_PERSON", [pattern])

    matches = matcher(doc)
    matched_spans = [Span(doc, start, end) for _, start, end in matches]

    # Add the matched spans to doc's entities
    doc.ents = list(doc.ents) + matched_spans

    return doc


nlp.add_pipe("titled_person")

<function __main__.titled_person(doc)>

In [48]:
txt = """Cleopatra wasn’t actually Egyptian! 
         As far as historians can tell, Egypt’s 
         famous femme fatal was actually Greek!. 
         She was a descendant of Alexander the Great’s
         Macedonian general Ptolemy"""

doc = nlp(txt)

nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'titled_person']

### Setting custom extensions (metadata)

### Conclusion