In [None]:
import spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Build the pretrained English pipeline (tokenizer, tagger, parser, NER).
nlp = spacy.load("en_core_web_sm")

# Sample passage; its line breaks and leading indentation are part of the string.
text = """Since you're methodical and focused on scalable NLP pipelines,
        spaCy might be your best ally for production-ready tasks.
        But if you're exploring linguistic theory
        or building custom tokenizers, NLTK still has its charm."""

# Run the whole passage through the pipeline.
doc = nlp(text)

In [None]:
# Syntax: collect the noun chunks and the lemma of every token tagged VERB.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

# Named entities: print each entity's text followed by its label.
for span in doc.ents:
    print(span.text, span.label_)

Noun phrases: ['you', 'scalable NLP pipelines', 'your best ally', 'production-ready tasks', 'you', 'linguistic theory', 'custom tokenizers', 'NLTK', 'its charm']
Verbs: ['focus', 'explore', 'build', 'have']
NLP ORG
NLTK ORG


# practice

In [None]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Check the pipeline's work: show every token with its coarse POS tag and lemma.
doc = nlp(text)
for tok in doc:
    print(tok, " | ", tok.pos_, " | ", tok.lemma_)

Since  |  SCONJ  |  since
you  |  PRON  |  you
're  |  AUX  |  be
methodical  |  ADJ  |  methodical
and  |  CCONJ  |  and
focused  |  VERB  |  focus
on  |  ADP  |  on
scalable  |  ADJ  |  scalable
NLP  |  PROPN  |  NLP
pipelines  |  NOUN  |  pipeline
,  |  PUNCT  |  ,

          |  SPACE  |  
        
spaCy  |  NUM  |  spacy
might  |  AUX  |  might
be  |  AUX  |  be
your  |  PRON  |  your
best  |  ADJ  |  good
ally  |  NOUN  |  ally
for  |  ADP  |  for
production  |  NOUN  |  production
-  |  PUNCT  |  -
ready  |  ADJ  |  ready
tasks  |  NOUN  |  task
.  |  PUNCT  |  .

          |  SPACE  |  
        
But  |  CCONJ  |  but
if  |  SCONJ  |  if
you  |  PRON  |  you
're  |  AUX  |  be
exploring  |  VERB  |  explore
linguistic  |  ADJ  |  linguistic
theory  |  NOUN  |  theory

          |  SPACE  |  
        
or  |  CCONJ  |  or
building  |  VERB  |  build
custom  |  NOUN  |  custom
tokenizers  |  NOUN  |  tokenizer
,  |  PUNCT  |  ,
NLTK  |  PROPN  |  NLTK
still  |  ADV  |  still
has  | 

In [None]:
# Look up the human-readable meaning of several POS tags in one go.
tuple(spacy.explain(tag) for tag in ("PUNCT", "SCONJ", "ADP", "DET", "NUM"))

('punctuation',
 'subordinating conjunction',
 'adposition',
 'determiner',
 'numeral')

In [None]:
# Named entity recognition: entity text, its label, and what that label means.
for span in doc.ents:
    label = span.label_
    print(span.text, " | ", label, " | ", spacy.explain(label))

NLP  |  ORG  |  Companies, agencies, institutions, etc.
NLTK  |  ORG  |  Companies, agencies, institutions, etc.


In [None]:
# Show the document's entities visually with displaCy instead of plain prints.
from spacy import displacy

displacy.render(doc, style="ent")

In [None]:
# Create a blank English pipeline: no trained components have been added.
nlp = spacy.blank("en")

# Define the sentence before tokenizing it, so `doc` is built from this text
# rather than from a stale `text` left behind by an earlier cell.
text = "Hey! my name is asad. I am going to Lahore to buy 34 laptop."
doc = nlp.make_doc(text)

# With no tagger or lemmatizer in the pipeline, pos_/lemma_ are not populated,
# so the inspection loop is kept here only for reference.
# for token in doc:
#   print(token, " | ", token.pos_, " | ", token.lemma_)

# Nothing has been added to the pipeline yet, so this displays an empty list.
nlp.pipe_names

[]

In [None]:
# Assemble a custom pipeline: begin with a blank English pipeline and copy
# only the "ner" component out of the pretrained model.
source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)

# Show which components the new pipeline contains.
nlp.pipe_names

['ner']

In [None]:
# Run the current `nlp` pipeline on a sample sentence and list the entities found.
doc = nlp("Hey! my name is asad. I am going to Lahore to buy 34 laptop.")
for span in doc.ents:
    label = span.label_
    print(span.text, " | ", label, " | ", spacy.explain(label))

Lahore  |  GPE  |  Countries, cities, states
34  |  CARDINAL  |  Numerals that do not fall under another type


LEMMATIZATION VS. STEMMING, AND ADDING CUSTOM ATTRIBUTES

In [2]:
import spacy

# Load the pretrained pipeline and print each token next to its lemma.
nlp = spacy.load("en_core_web_sm")
doc = nlp("ability,popularity,bro,brah,similiarity,booking,laghing,eats, ate,having.")
for tok in doc:
    print(tok, " | ", tok.lemma_)

ability  |  ability
,  |  ,
popularity  |  popularity
,  |  ,
bro  |  bro
,  |  ,
brah  |  brah
,  |  ,
similiarity  |  similiarity
,  |  ,
booking  |  booking
,  |  ,
laghing  |  laghing
,  |  ,
eats  |  eat
,  |  ,
ate  |  eat
,  |  ,
having  |  have
.  |  .


In [3]:
import nltk
from nltk.stem import PorterStemmer

# Stem the same word list with NLTK's Porter stemmer for comparison;
# spaCy supplies the tokens, NLTK produces the stems.
stemmer = PorterStemmer()
doc = nlp("ability,popularity,bro,brah,similiarity,booking,laghing,eats, ate,having.")
for tok in doc:
    print(tok, " | ", stemmer.stem(tok.text))


ability  |  abil
,  |  ,
popularity  |  popular
,  |  ,
bro  |  bro
,  |  ,
brah  |  brah
,  |  ,
similiarity  |  similiar
,  |  ,
booking  |  book
,  |  ,
laghing  |  lagh
,  |  ,
eats  |  eat
,  |  ,
ate  |  ate
,  |  ,
having  |  have
.  |  .


In [9]:
# Map the slang tokens "bro" and "brah" to the lemma "brother" via the attribute ruler.
# NOTE(review): every run of this cell calls add() again on the same ruler.
at = nlp.get_pipe("attribute_ruler")
slang_patterns = [[{"TEXT": "bro"}], [{"TEXT": "brah"}]]
at.add(slang_patterns, {"LEMMA": "brother"})

# Re-process the word list to see the overridden lemmas.
doc = nlp("ability,popularity,bro,brah,similiarity,booking,laghing,eats, ate,having.")
for tok in doc:
    print(tok, " | ", tok.lemma_)

ability  |  ability
,  |  ,
popularity  |  popularity
,  |  ,
bro  |  brother
,  |  ,
brah  |  brother
,  |  ,
similiarity  |  similiarity
,  |  ,
booking  |  booking
,  |  ,
laghing  |  laghing
,  |  ,
eats  |  eat
,  |  ,
ate  |  eat
,  |  ,
having  |  have
.  |  .
