In [19]:
# Blank English pipeline: no trained components are loaded, but calling it on
# text still splits the text into tokens.
import spacy

nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa.Then he said I can do this all day")

# Print each token's text on its own line.
for token in doc:
    print(token.text)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day


In [20]:
# List the names of the components registered in the pipeline.
# For the blank pipeline the list is empty: the tokenizer always runs first
# (the text above was tokenized), but it is not reported as a pipeline component.
nlp.pipe_names

[]

In [21]:
# The blank pipeline reported an empty pipe_names list, i.e. nothing runs
# after the tokenizer. To get real components, a trained pipeline has to be
# downloaded first:
#   python -m spacy download en_core_web_sm
# "sm" is the small pipeline for the English language.
nlp = spacy.load("en_core_web_sm")

# The trained pipeline lists the components it runs after tokenization.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [22]:
# Show the pipeline as (name, component object) pairs rather than names only.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7d3f62324be0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7d3f6073c100>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7d3f622bcac0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7d3f60d78480>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7d3f60ed3040>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7d3f622bcba0>)]

In [23]:
# Run the trained pipeline, then show for every token a human-readable
# description of its part-of-speech tag and its lemma.
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    pos_description = spacy.explain(token.pos_)
    lemma = token.lemma_
    print(token, " | ", pos_description, " | ", lemma)

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


In [24]:
# Named Entity Recognition: print each detected entity span with its label.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
    text, label = ent.text, ent.label_
    print(text, label)

Tesla Inc ORG
$45 billion MONEY


In [25]:
# Visualize the entities of `doc` with displaCy, using its entity style.
from spacy import displacy

displacy.render(doc, style="ent")

In [26]:
#Let's try a different language.
#Let's go with French.
#You need to install the processing pipeline for the French language using this command:
#python -m spacy download fr_core_news_sm

In [27]:
# Download the small French pipeline.
# spacy.cli.download installs the package with the interpreter that runs this
# kernel; `!python -m spacy download ...` uses whichever `python` the shell
# finds first, which may belong to a different environment.
spacy.cli.download("fr_core_news_sm")

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [28]:
# Rebind `nlp` to the small French pipeline; the cells below use it.
nlp = spacy.load("fr_core_news_sm")

In [29]:
# Entities found in a French sentence, each with its label and the
# explanation spaCy gives for that label.
doc = nlp("Tesla Inc va racheter Twitter pour $45 milliards de dollars")

for ent in doc.ents:
    label = ent.label_
    print(ent.text, " | ", label, " | ", spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [30]:
# Part-of-speech tag and lemma for every token of the French sentence.
for token in doc:
    pos_tag, lemma = token.pos_, token.lemma_
    print(token, " | ", pos_tag, " | ", lemma)

Tesla  |  PROPN  |  Tesla
Inc  |  PROPN  |  Inc
va  |  VERB  |  aller
racheter  |  VERB  |  racheter
Twitter  |  VERB  |  twitter
pour  |  ADP  |  pour
$  |  NOUN  |  dollar
45  |  NUM  |  45
milliards  |  NOUN  |  milliard
de  |  ADP  |  de
dollars  |  NOUN  |  dollar


In [32]:
# Add a component to a blank pipeline: start from blank English and copy the
# trained "ner" component over from en_core_web_sm.
source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)

# Confirm which components the hand-built pipeline now contains.
nlp.pipe_names

['ner']

In [33]:
# Run the NER-only pipeline on the earlier English sentence and print each
# entity span with its label.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
    text, label = ent.text, ent.label_
    print(text, label)

Tesla Inc ORG
$45 billion MONEY
