# spaCy

## Basics of spaCy

In [2]:
import spacy

# Sample sentence to run through the pipeline.
txt = (
    "The tallest living man is 37-year-old Sultan Kosen, from Turkey, "
    "who is 8 feet, 2.8 inches, who set the record in 2009."
)

# Load the English pipeline and create the Language object.
# By spaCy convention a loaded language model is bound to the name `nlp`.
nlp = spacy.load("en_core_web_sm")
nlp  # bare last expression, so the cell displays the object's repr

<spacy.lang.en.English at 0x207a4fcde50>

In [4]:
# Calling the Language object on a string processes it and returns a Doc object
doc = nlp(txt)
doc

The tallest living man is 37-year-old Sultan Kosen, from Turkey, who is 8 feet, 2.8 inches, who set the record in 2009.

In [8]:
# A Doc is a container of Token objects: it can be iterated over,
# and indexing / slicing notation extracts individual tokens or runs of tokens.
first_five = doc[:5]
print(*first_five, sep="\n")
# NOTE: len(doc) is the number of tokens in the doc, not the number of characters.
print(f'\nText length: {len(doc)}')
print(type(doc[0]))

The
tallest
living
man
is

Text length: 31
<class 'spacy.tokens.token.Token'>


In [10]:
# slicing a doc (here: the first five tokens) gives a Span object rather than a list of tokens
span = doc[:5]
print(type(span))

# .text gives the text covered by the span as a plain string
span.text

<class 'spacy.tokens.span.Span'>


'The tallest living man is'

In [17]:
# spaCy is memory efficient: Token and Span objects are just views of the Doc,
# so extracting them does not duplicate the underlying data.
# Six built-in token attributes are shown below for tokens 3 through 9.
window = doc[3:10]

attribute_rows = [
    ("Index:  ", "i"),
    ("Text:  ", "text"),
    ("is_alpha:  ", "is_alpha"),
    ("is_punct:  ", "is_punct"),
    # like_num recognizes both literal digits and spelled-out numbers
    ("like_num:  ", "like_num"),
    # lemma_ is the base form of the word, stripped of suffixes, prefixes,
    # tense and other grammatical inflection
    ("Base word: ", "lemma_"),
]

for label, attr in attribute_rows:
    print(label, [getattr(tok, attr) for tok in window])

Index:   [3, 4, 5, 6, 7, 8, 9]
Text:   ['man', 'is', '37', '-', 'year', '-', 'old']
is_alpha:   [True, True, False, False, True, False, True]
is_punct:   [False, False, False, True, False, True, False]
like_num:   [False, False, True, False, False, False, False]
Base word:  ['man', 'be', '37', '-', 'year', '-', 'old']


## Architecture and core data structures

In [18]:
import spacy
# spacy.load() gives us a Language object backed by a model pre-trained on millions of text instances
# (here the "en_core_web_md" English model)
# NOTE(review): the original note says models are available in 22 languages -- confirm against the current spaCy models page
nlp = spacy.load("en_core_web_md")

type(nlp)

spacy.lang.en.English

In [21]:
# language classes can also be imported directly from the `spacy.lang` sub-module
from spacy.lang.en import English
from spacy.lang.es import Spanish

# Instantiate the English Language class directly instead of going through spacy.load()
# (this rebinds `nlp`, replacing the previously loaded model)
nlp = English()

print(type(nlp))

# Triple-quoted string: the line breaks and the leading indentation of the
# continuation lines are part of the text that gets processed.
txt = """The original name for the search engine Google was Backrub. 
         It was renamed Google after the googol, 
         which is the number one followed by 100 zeros."""

# Process the new text; this rebinds `doc` as well
doc = nlp(txt)

<class 'spacy.lang.en.English'>


In [25]:
# Once text has been processed, its words and punctuation are stored in the
# Vocab object of `nlp`. The vocab is shared between docs, so new words from
# any doc end up in the same object; both attributes below are Vocab instances.
nlp_vocab_type = type(nlp.vocab)
doc_vocab_type = type(doc.vocab)
print(f'nlp vocab: {nlp_vocab_type}')
print(f'doc vocab: {doc_vocab_type}')

nlp vocab: <class 'spacy.vocab.Vocab'>
doc vocab: <class 'spacy.vocab.Vocab'>
