In [1]:
pip install spacy



In [2]:
import spacy

In [3]:
# Load spaCy's small English pipeline; `nlp` is the callable used by the cells
# below to turn raw text into a processed document.
nlp = spacy.load('en_core_web_sm')

In [35]:
sentence = "Will he crack the interview? Let's hope for the best."

In [36]:
doc = nlp(sentence)

In [37]:
doc

Will he crack the interview? Let's hope for the best.

In [38]:
doc.text

"Will he crack the interview? Let's hope for the best."

In [39]:
doc[-1]

.

In [40]:
len(doc)

13

In [41]:
doc[-1].pos_

'PUNCT'

In [42]:
spacy.explain('PUNCT')

'punctuation'

In [43]:
# For every token show: text, coarse POS (pos_), fine-grained tag (tag_) and
# spaCy's human-readable description of that fine-grained tag.
for token in doc:
    tag_meaning = spacy.explain(token.tag_)
    print(f"{token.text} ------ {token.pos_} ------ {token.tag_} ------ {tag_meaning}")

Will ------ AUX ------ MD ------ verb, modal auxiliary
he ------ PRON ------ PRP ------ pronoun, personal
crack ------ VERB ------ VB ------ verb, base form
the ------ DET ------ DT ------ determiner
interview ------ NOUN ------ NN ------ noun, singular or mass
? ------ PUNCT ------ . ------ punctuation mark, sentence closer
Let ------ VERB ------ VB ------ verb, base form
's ------ PRON ------ PRP ------ pronoun, personal
hope ------ VERB ------ VB ------ verb, base form
for ------ ADP ------ IN ------ conjunction, subordinating or preposition
the ------ DET ------ DT ------ determiner
best ------ ADJ ------ JJS ------ adjective, superlative
. ------ PUNCT ------ . ------ punctuation mark, sentence closer


You can check https://v2.spacy.io/api/annotation for the complete list of POS (part-of-speech) categories in spaCy.

https://en.wikipedia.org/wiki/Preposition_and_postposition

https://en.wikipedia.org/wiki/Part_of_speech

In [44]:
# Print each sentence detected by the pipeline. The loop variable is called
# `sent` so that it does not rebind `sentence`, which holds the input string.
for sent in doc.sents:
  print(sent)

Will he crack the interview?
Let's hope for the best.


# Spacy vs NLTK

In [45]:
#nltk

from nltk.tokenize import sent_tokenize
import nltk

# Fetch the Punkt sentence-tokenizer data. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so download both
# to keep sent_tokenize / word_tokenize working regardless of NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [56]:
nltk_sents = sent_tokenize("She is Dr. Radha. She is a Neurosurgeon. She has her own reputed hospital.")

In [61]:
print(nltk_sents)

['She is Dr. Radha.', 'She is a Neurosurgeon.', 'She has her own reputed hospital.']


In [58]:
from nltk.tokenize import word_tokenize
nltk_tokens = word_tokenize("She is Dr. Radha. She is a Neurosurgeon. She has her own reputed hospital.")

In [59]:
nltk_tokens

['She',
 'is',
 'Dr.',
 'Radha',
 '.',
 'She',
 'is',
 'a',
 'Neurosurgeon',
 '.',
 'She',
 'has',
 'her',
 'own',
 'reputed',
 'hospital',
 '.']

In [100]:
#spacy
# NOTE(review): `nlp` was already loaded from this same model near the top of
# the notebook, so this second load looks redundant — confirm before removing.
nlp = spacy.load("en_core_web_sm")

# Run the same text that was given to NLTK's tokenizers through spaCy so the
# two libraries can be compared on identical input.
doc = nlp("She is Dr. Radha. She is a Neurosurgeon. She has her own reputed hospital.")

In [62]:
for sentence in doc.sents:
    print(sentence)

She is Dr. Radha.
She is a Neurosurgeon.
She has her own reputed hospital.


In [63]:
spacy_sents = doc.sents

In [64]:
spacy_sents

<generator at 0x7bfa44a3c720>

In [65]:
# Walk the document sentence by sentence and print every token on its own line.
for sentence in doc.sents:
    print(*sentence, sep="\n")

She
is
Dr.
Radha
.
She
is
a
Neurosurgeon
.
She
has
her
own
reputed
hospital
.



The code above shows that spaCy is object-oriented (it returns `Doc`, `Span` and `Token` objects), whereas NLTK is mainly a string-processing library (its tokenizers return plain Python strings).

In [32]:
doc2 = nlp("The quick brown fox jumped over the lazy dog")

In [33]:
from spacy import displacy
displacy.render(doc2,style='dep')

In [66]:
type(nlp)

In [68]:
doc2

The quick brown fox jumped over the lazy dog

In [69]:
type(doc)

spacy.tokens.doc.Doc

In [73]:
word = doc2[1]
word

quick

In [74]:
type(word)

spacy.tokens.token.Token

In [77]:
substring = doc2[:3]
substring

The quick brown

In [78]:
type(substring)

spacy.tokens.span.Span

In [79]:
doc3 = nlp("She ate two chocaltes of $1.")

In [80]:
doc3[0]

She

In [83]:
# Show the boolean lexical flags spaCy exposes on each token of doc3.
for token in doc3:
    print(f"Token:  {token} ")
    print(
        f"is_alpha: {token.is_alpha} "
        f"like_num: {token.like_num} "
        f"is_currency: {token.is_currency} "
        f"is_punct: {token.is_punct}"
    )

Token:  She 
is_alpha: True like_num: False is_currency: False is_punct: False
Token:  ate 
is_alpha: True like_num: False is_currency: False is_punct: False
Token:  two 
is_alpha: True like_num: True is_currency: False is_punct: False
Token:  chocaltes 
is_alpha: True like_num: False is_currency: False is_punct: False
Token:  of 
is_alpha: True like_num: False is_currency: False is_punct: False
Token:  $ 
is_alpha: False like_num: False is_currency: True is_punct: False
Token:  1 
is_alpha: False like_num: True is_currency: False is_punct: False
Token:  . 
is_alpha: False like_num: False is_currency: False is_punct: True


In [89]:
doc4 = nlp("Her name is Chandu. Her mobile number is 1234567890. Her email address is chandu@williams.com. Her website is www.creativity.com. She has 2 social media profiles.")
doc4

Her name is Chandu. Her mobile number is 1234567890. Her email address is chandu@williams.com. Her website is www.creativity.com. She has 2 social media profiles.

In [96]:
# Pull out the tokens of doc4 that look like e-mail addresses, numbers and URLs.
# The three checks are independent, so each list is built by its own filter.
email = [token.text for token in doc4 if token.like_email]
number = [token.text for token in doc4 if token.like_num]
url = [token.text for token in doc4 if token.like_url]

In [97]:
email, number, url

(['chandu@williams.com'], ['1234567890', '2'], ['www.creativity.com'])

In [99]:
# spaCy ships tokenizers for other languages too: build a blank Hindi pipeline
# (tokenizer only) and report, for each token, whether it is a currency symbol.
hindi_nlp = spacy.blank("hi")
doc = hindi_nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")
for token in doc:
    print(f"{token} {token.is_currency}")

भैया False
जी False
! False
5000 False
₹ True
उधार False
थे False
वो False
वापस False
देदो False


In [102]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)

# Print every "<amount> <currency symbol>" pair found in the text.
# token.i is the token's index in the document, so doc[token.i + 1] is the
# token that follows it. The last token has no successor, so it is excluded
# from the loop (doc[:-1]); otherwise a text ending in a number would raise
# IndexError on the look-ahead.
for token in doc[:-1]:
    next_token = doc[token.i + 1]
    if token.like_num and next_token.is_currency:
        print(token.text, next_token.text)


two $
500 €


In [108]:
token = doc[5]
token

Peter

In [110]:
#returns index of the token
token.i

5

In [75]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [111]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7bfa2f0ec6a0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7bfa2f0ee200>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7bfa2f2bf920>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7bfa2ee32140>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7bfa2ee31d00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7bfa2f2bf7d0>)]


```
spacy.load('en_core_web_sm')
```

has the above pipeline in it, whereas

```
spacy.blank('en')
```

 has an empty pipeline...

In `en_core_web_sm`, `en` stands for English and `sm` stands for small. Other model sizes are available as well, such as medium (`md`) and large (`lg`).

  Check this: https://spacy.io/usage/models#quickstart

In [76]:
blank_nlp = spacy.blank('en')
blank_nlp.pipe_names  #empty

[]

In [113]:
blank_nlp.pipeline

[]

In [114]:
# Run the trained pipeline and list, per token, the POS description and lemma.
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    pos_meaning = spacy.explain(token.pos_)
    print(token, pos_meaning, token.lemma_, sep="  |  ")

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


In [115]:
# Same text through the blank pipeline, which only tokenizes: the POS and lemma
# columns come out empty.
doc = blank_nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    pos_meaning = spacy.explain(token.pos_)
    print(token, pos_meaning, token.lemma_, sep="  |  ")

Captain  |  None  |  
america  |  None  |  
ate  |  None  |  
100  |  None  |  
$  |  None  |  
of  |  None  |  
samosa  |  None  |  
.  |  None  |  
Then  |  None  |  
he  |  None  |  
said  |  None  |  
I  |  None  |  
can  |  None  |  
do  |  None  |  
this  |  None  |  
all  |  None  |  
day  |  None  |  
.  |  None  |  




All the values are None (or empty) because a blank pipeline has no components to perform POS tagging or lemmatization; it can only tokenize the text.

# NER

In [148]:
# Named Entity Recognition: list each detected entity with its label and
# spaCy's description of that label.
ner_doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for entity in ner_doc.ents:
    label_meaning = spacy.explain(entity.label_)
    print(f"{entity.text} {entity.label_} ...... {label_meaning}")

Tesla Inc ORG ...... Companies, agencies, institutions, etc.
$45 billion MONEY ...... Monetary values, including unit


In [118]:
displacy.render(ner_doc, style="ent")

In [149]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

The list of entity labels is also documented on this page: https://spacy.io/models/en



# Setting custom Entities

In [150]:
# Entities predicted by the model before any manual correction.
doc = nlp("Tesla is going to acquire Twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ent.label_, sep="  |  ")

Tesla  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


In [151]:
s = doc[2:5]
s

going to acquire

In [152]:
type(s)

spacy.tokens.span.Span

In [153]:
from spacy.tokens import Span

# Build entity spans by token index (end is exclusive):
# tokens [0, 1) is "Tesla" and tokens [5, 6) is "Twitter"; both are labelled ORG.
s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

# Overwrite the entities covered by s1/s2; default="unmodified" leaves every
# other existing entity (here "$45 billion") as the model predicted it.
doc.set_ents([s1, s2], default="unmodified")

In [154]:
# Entities after the manual override.
for ent in doc.ents:
    print(f"{ent.text}  |  {ent.label_}")

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


We can add components to the blank pipeline

In [120]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [122]:
# Add the 'ner' component to a blank pipeline.
# A fresh blank English pipeline is created (rebinding `blank_nlp`), and the
# 'ner' component is copied into it from the loaded en_core_web_sm pipeline
# via `source=`.
source_nlp = spacy.load("en_core_web_sm")
blank_nlp = spacy.blank("en")
blank_nlp.add_pipe("ner", source=source_nlp)
# The pipeline now contains only the sourced component.
blank_nlp.pipe_names

['ner']

# Customizing lemmatizer

In [124]:
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | bro
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brah
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [123]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [125]:
# Lemmatize the slang forms "Bro" and "Brah" as "Brother" by registering
# exact-text patterns with the pipeline's attribute ruler.
ar = nlp.get_pipe('attribute_ruler')
ar.add(
    patterns=[[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]],
    attrs={"LEMMA": "Brother"},
)

# Re-run the same sentence to see the overridden lemmas.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(f"{token.text} | {token.lemma_}")

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


# POS

In [126]:
#data cleaning
earnings_text="""Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud revenue to $22.1 billion, up 32% year over year” said Amy Hood, executive vice president and chief financial officer of Microsoft."""

doc = nlp(earnings_text)

# Data cleaning: keep every token except whitespace (SPACE), punctuation
# (PUNCT) and tokens tagged X.
filtered_tokens = [
    token
    for token in doc
    if token.pos_ not in ("SPACE", "PUNCT", "X")
]

In [128]:
filtered_tokens[:5]

[Microsoft, Corp., today, announced, the]

In [134]:
spacy.attrs.POS

74

In [133]:
count = doc.count_by(spacy.attrs.POS)
count

{96: 13,
 92: 46,
 100: 24,
 90: 9,
 85: 16,
 93: 16,
 97: 27,
 98: 1,
 84: 20,
 103: 10,
 87: 6,
 99: 5,
 89: 12,
 86: 3,
 94: 3,
 95: 2}

In [140]:
doc.vocab[92].text

'NOUN'

In [141]:
count.items()

dict_items([(96, 13), (92, 46), (100, 24), (90, 9), (85, 16), (93, 16), (97, 27), (98, 1), (84, 20), (103, 10), (87, 6), (99, 5), (89, 12), (86, 3), (94, 3), (95, 2)])

In [143]:
# `count` maps numeric POS ids to frequencies; look each id up in the vocab to
# print its readable name next to the frequency.
for pos_id, freq in count.items():
    print(f"{doc.vocab[pos_id].text} ...... {freq}")

PROPN ...... 13
NOUN ...... 46
VERB ...... 24
DET ...... 9
ADP ...... 16
NUM ...... 16
PUNCT ...... 27
SCONJ ...... 1
ADJ ...... 20
SPACE ...... 10
AUX ...... 6
SYM ...... 5
CCONJ ...... 12
ADV ...... 3
PART ...... 3
PRON ...... 2


In [144]:
doc.vocab[95]

<spacy.lexeme.Lexeme at 0x7bfa2cc91180>

In [145]:
doc.vocab[95].text

'PRON'

# Stopwords

In [155]:
from spacy.lang.en.stop_words import STOP_WORDS
len(STOP_WORDS)

326

In [156]:
doc = nlp("Hi everyone. This is a bot. Hope you all are doing well!")

# Print only the tokens spaCy flags as stop words.
for token in filter(lambda t: t.is_stop, doc):
    print(token)

everyone
This
is
a
you
all
are
doing
well


In [158]:
# Print what is left once the stop words are skipped.
for token in doc:
    if token.is_stop:
        continue
    print(token)

Hi
.
bot
.
Hope
!


As we can see, removing stop words can strip away part of a sentence's meaning, which is problematic for tasks such as sentiment analysis, machine translation, chatbots, Q&A systems, etc.

Ex: sentences- 'this is a good movie'

'this is not a good movie',

both of these sentences will return

'good movie'

after removing stopwords.

# Customizing stopwords

In [159]:
#use this pre-processing function to pass the text and to remove all the stop words and finally get the cleaned form
def preprocess(text):
    """Return `text` with stop words removed.

    The text is run through the module-level `nlp` pipeline; every token whose
    `is_stop` flag is false is kept, and the kept token texts are joined with
    single spaces.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return " ".join(kept)


# Stop treating 'not' as a stop word. This flips the flag on the shared
# vocabulary entry of `nlp`, so it affects every later call that uses `nlp`.
nlp.vocab['not'].is_stop = False


# Run both example sentences through preprocess() and keep the cleaned results.
positive_text = preprocess('this is a good movie')
negative_text = preprocess('this is not a good movie')


# Print both cleaned texts; the negation now survives in the second one.
print(f"Text1: {positive_text}")
print(f"Text2: {negative_text}")

Text1: good movie
Text2: not good movie
