# A Quick Start by Following the Steps in the [DataCamp Website](https://www.datacamp.com/community/blog/spacy-cheatsheet)

In [1]:
# Download statistical models
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
# Check that your installed models are up to date
!python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/home/ubuntu/anaconda3/lib/python3.7/site-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m
link      en               en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m



In [3]:
import spacy

# Load Model
If loading fails, restarting the kernel and re-running the cells above usually fixes it.

In [4]:
# load the installed model 
nlp = spacy.load("en_core_web_sm")

# Preprocess

In [5]:
doc = nlp('This is a text')
type(doc)

spacy.tokens.doc.Doc

In [6]:
for token in doc:
    print(type(token))
    print(token.text)

<class 'spacy.tokens.token.Token'>
This
<class 'spacy.tokens.token.Token'>
is
<class 'spacy.tokens.token.Token'>
a
<class 'spacy.tokens.token.Token'>
text


# Span

In [7]:
print(doc[0])
print(type(doc[0]))

This
<class 'spacy.tokens.token.Token'>


### Creating a span manually

In [8]:
doc = nlp("I live in New York")

In [9]:
[token.text for token in doc]

['I', 'live', 'in', 'New', 'York']

In [10]:
span = spacy.tokens.Span(doc, 3, 5, label='GPE')

In [11]:
span.text

'New York'

# POS (Part-Of-Speech) tags (predicted by statistical model)

In [12]:
doc = nlp('This is a text.')

In [13]:
[token.pos_ for token in doc]

['DET', 'VERB', 'DET', 'NOUN', 'PUNCT']

In [14]:
[token.pos for token in doc]

[90, 100, 90, 92, 97]

#### Fine-Grained POS tags

In [15]:
[token.tag_ for token in doc]

['DT', 'VBZ', 'DT', 'NN', '.']

# Syntactic Dependencies (predicted by statistical model)

In [16]:
doc = nlp('This is a text.')

In [17]:
[token.dep_ for token in doc]

['nsubj', 'ROOT', 'det', 'attr', 'punct']

#### Syntactic head token (governor)

In [18]:
[token.head.text for token in doc]

['is', 'is', 'text', 'is', 'is']

# Named Entity (predicted by statistical model)

In [19]:
doc = nlp('Larry Page founded Google')

In [20]:
[(ent.text, ent.label_) for ent in doc.ents]

[('Larry Page', 'PERSON'), ('Google', 'ORG')]

# Sentences (usually needs the dependency parser)

In [21]:
doc = nlp('This is a sentence. This is another one.')

In [22]:
[sent.text for sent in doc.sents]

['This is a sentence.', 'This is another one.']

# Base noun phrases (needs the tagger and parser)

In [23]:
doc = nlp('I have a red car')

In [24]:
# doc.noun_chunks is a generator that yields spans;
# collect the text of each one into a list

[chunk.text for chunk in doc.noun_chunks]

['I', 'a red car']

# Label explanations

In [25]:
spacy.explain('RB')

'adverb'

In [26]:
spacy.explain('GPE')

'Countries, cities, states'

In [27]:
spacy.explain('DT')

'determiner'

# Visualizing
If you're in a Jupyter notebook, use ```displacy.render```. Otherwise, use ```displacy.serve``` to start a web server and show the visualization in your browser.

#### Visualize dependencies

In [28]:
doc = nlp('This is a sentence.')

spacy.displacy.render(doc, style='dep')

In [29]:
spacy.explain('nsubj')

'nominal subject'

In [30]:
spacy.explain('attr')

'attribute'

#### Visualizing named entities

In [31]:
doc = nlp('Larry Page founded Google')

spacy.displacy.render(doc, style='ent')

# Word Vectors and Similarity
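To use word vectors, you need to install the larger models ending in __md__ or __lg__ , for example ```en_core_web_lg```. The cells below still use ```en_core_web_sm```, which ships without real word vectors: spaCy emits a warning and falls back to context-sensitive tensors (hence the 96-dimensional vectors), so the similarity scores shown here are only illustrative.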

In [32]:
doc1 = nlp('I like cats')
doc2 = nlp('I like dogs')

doc1.similarity(doc2)

  "__main__", mod_spec)


0.9494251457336707

In [33]:
print(doc1[2])
print(doc2[2])

cats
dogs


In [34]:
doc1[2].similarity(doc2[2])

  "__main__", mod_spec)


0.85979897

#### Accessing word vectors

In [35]:
doc = nlp('I like cats')

print(len(doc[2].vector))

96


In [36]:
doc[2].vector

array([ 3.6124864 , -0.00959456,  0.90754664, -3.7867062 ,  2.4655213 ,
        0.13715684,  2.415557  , -2.5730793 , -2.8679817 ,  3.252561  ,
       -1.2344291 ,  1.8056612 , -1.9664276 , -0.7356212 , -2.7180357 ,
       -0.6440706 , -2.8764105 ,  3.059177  , -1.2157369 ,  1.2292784 ,
       -3.8869352 ,  1.5334756 , -0.42812008, -1.6634891 , -0.71992075,
       -0.9405131 , -0.92217183,  1.4494176 ,  0.8063312 , -1.9848628 ,
        1.4634243 , -4.440749  , -2.036384  ,  1.9695312 ,  2.1328359 ,
       -0.06996727,  1.0586741 ,  0.7214236 , -2.455808  ,  0.6979034 ,
       -3.9795585 ,  5.9907084 ,  0.46196818, -1.6138005 ,  6.1429152 ,
       -0.6601535 ,  0.05084187, -1.2918818 , -0.60123396,  0.526967  ,
       -3.8541064 ,  1.5263546 ,  0.6027452 , -0.57671547,  0.9018628 ,
        1.1323965 , -0.22235966,  0.380499  , -0.29168043,  2.230485  ,
       -2.0556371 ,  4.193283  ,  4.0678716 , -1.9232147 ,  1.2723017 ,
        8.359108  , -1.0821432 ,  3.8314054 , -1.5646372 ,  2.12

In [37]:
# The L2 norm of the token's vector
doc[2].vector_norm

23.784721

# Pipeline Components

#### Pipeline information

In [38]:
nlp = spacy.load("en_core_web_sm")

nlp.pipe_names

['tagger', 'parser', 'ner']

In [39]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f655aca98d0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f6526a32948>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f6526a329a8>)]

#### Custom Components
Components can be added ```first```, ```last``` (default), or ```before``` or ```after``` an existing component.

In [40]:
# Minimal pipeline component: a callable that receives the doc and hands it back
def custom_component(doc):
    """Placeholder pipeline step: print a message and return ``doc`` as received."""
    print("Do something to the doc here!")
    return doc

nlp.add_pipe(custom_component, first=True)

In [41]:
nlp.pipeline

[('custom_component', <function __main__.custom_component(doc)>),
 ('tagger', <spacy.pipeline.pipes.Tagger at 0x7f655aca98d0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f6526a32948>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f6526a329a8>)]

#### Add component after 'parser' (Dependency)

In [42]:
nlp = spacy.load("en_core_web_sm")

nlp.add_pipe(custom_component, after='parser')

In [43]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f6526c69cc0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f6526e39528>),
 ('custom_component', <function __main__.custom_component(doc)>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f6526e39588>)]

# Extension Attributes
Custom attributes that are registered on the global ```Doc```, ```Token``` and ```Span``` classes and become available as ```._```



In [44]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("The sky over New York is blue")

In [45]:
[token.text for token in doc]

['The', 'sky', 'over', 'New', 'York', 'is', 'blue']

In [46]:
spacy.tokens.Token.set_extension('is_color', default=False, force=True)

In [47]:
[token.text for token in doc]

['The', 'sky', 'over', 'New', 'York', 'is', 'blue']

In [48]:
doc[6]._.is_color=True

In [49]:
# Show each token next to its custom attribute so the effect of the
# assignment above is visible (only 'blue' was flagged)
[(token.text, token._.is_color) for token in doc]

[('The', False),
 ('sky', False),
 ('over', False),
 ('New', False),
 ('York', False),
 ('is', False),
 ('blue', True)]

#### Property extensions (with getter & setter)

In [50]:
# Register custom attribute on Doc class
def get_reversed(doc):
    """Return the doc's text reversed character by character."""
    return doc.text[::-1]

# force=True overwrites an earlier registration, so this cell can be re-run
# without raising (consistent with the other set_extension calls in this notebook)
spacy.tokens.Doc.set_extension("reversed", getter=get_reversed, force=True)
# Compute value of extension attribute with getter
doc._.reversed
# 'eulb si kroY weN revo yks ehT'

'eulb si kroY weN revo yks ehT'

#### Method extensions (callable method)

In [51]:
# Register custom method extension on the Span class
def has_label(span, label):
    """Return True when the span's label string equals ``label``."""
    return span.label_ == label

spacy.tokens.Span.set_extension("has_label", method=has_label, force=True)
# Custom extensions are reached through the `._` namespace, not directly on the
# Span (calling `span.has_label(...)` raises AttributeError).
# Build the span with an explicit label so there is a label to compare against.
new_york = spacy.tokens.Span(doc, 3, 5, label="GPE")
new_york._.has_label("GPE")

True

# Rule-based matching

In [52]:
# The matcher only needs the shared vocab; the Doc to search is created in the
# next cell, so no Doc is built here.
matcher = spacy.matcher.Matcher(nlp.vocab)

# Token pattern: a token whose lowercase form is "new" followed by one whose
# lowercase form is "york"
pattern = [{"LOWER": "new"}, {"LOWER": "york"}]
# Add with ID, optional callback (None = no callback) and pattern(s)
matcher.add('CITIES', None, pattern)


In [53]:
doc = nlp("I live in New York")

# Calling the matcher on a Doc object produces the matches,
# each one a (match_id, start, end) tuple
matches = matcher(doc)
for match_id, start, end in matches:
    # Slice the Doc with the token offsets to recover the matched span
    span = doc[start:end]
    print(span.text)
# 'New York'

New York
