In [14]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 6.6 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [26]:
import spacy

from spacy import displacy
from spacy.lang.en import English
from spacy.tokens.doc import Doc

## Blank Pipeline

In [16]:
# A blank pipeline has no components, so pipe_names prints as an empty list.
nlp: English = spacy.blank("en")
print(nlp.pipe_names)
print()

# The Doc it produces can still be iterated token by token; print one per line.
doc: Doc = nlp("Captain America ate 100$ of Samosa. Then he said I can do this all day.")
print(*(token.text for token in doc), sep="\n")

[]

Captain
America
ate
100
$
of
Samosa
.
Then
he
said
I
can
do
this
all
day
.


## Pretrained Pipeline

In [19]:
# Load the small pretrained English pipeline by package name.
nlp = spacy.load("en_core_web_sm")
# Show the (name, component) pairs that make up the loaded pipeline.
display(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fadc0412990>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fae448dcbf0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fae44504200>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fae38b69d90>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fae38b6ac10>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fae200ff3e0>)]

In [21]:
# Run the pretrained pipeline and print text, part-of-speech tag and lemma per token.
doc = nlp("Captain America ate 100$ of Samosa. Then he said I can do this all day.")

for token in doc:
    print(token.text, token.pos_, token.lemma_, sep=" | ")

Captain | PROPN | Captain
America | PROPN | America
ate | VERB | eat
100 | NUM | 100
$ | NUM | $
of | ADP | of
Samosa | PROPN | Samosa
. | PUNCT | .
Then | ADV | then
he | PRON | he
said | VERB | say
I | PRON | I
can | AUX | can
do | VERB | do
this | PRON | this
all | DET | all
day | NOUN | day
. | PUNCT | .


In [25]:
# List each named entity with its label and spaCy's description of that label.
doc = nlp("Tesla Inc. is going to acquire Twitter for $45 billion.")

for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Tesla Inc. | ORG | Companies, agencies, institutions, etc.
Twitter | PRODUCT | Objects, vehicles, foods, etc. (not services)
$45 billion | MONEY | Monetary values, including unit


In [28]:
# "Bloomberg" occurs twice in this sentence — presumably to show how the entity
# recognizer labels each mention in context; confirm against the rendered output.
doc = nlp("Bloomberg founded a data company called Bloomberg.")

# style="ent" selects displaCy's entity visualizer for this Doc.
displacy.render(doc, style="ent")

## Custom Pipeline

In [29]:
# Reference pipeline from which a trained component is copied.
source_nlp: English = spacy.load("en_core_web_sm")
# Fresh blank English pipeline that receives only the component added below.
nlp = spacy.blank("en")

# Add the "ner" component taken from source_nlp instead of creating a new one.
nlp.add_pipe("ner", source=source_nlp)
# Show which components the custom pipeline now contains.
print(nlp.pipe_names)

['ner']


In [30]:
# Run the NER-only pipeline and print entity text, label and label description.
doc = nlp("Tesla Inc. is going to acquire Twitter for $45 billion.")

for ent in doc.ents:
    print(" | ".join(
        (ent.text, ent.label_, str(spacy.explain(ent.label_)))
    ))

Tesla Inc. | ORG | Companies, agencies, institutions, etc.
Twitter | PRODUCT | Objects, vehicles, foods, etc. (not services)
$45 billion | MONEY | Monetary values, including unit
