In [1]:
! pip install spacy



In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 12.9 MB/s eta 0:00:01    |█████████████████████▎          | 9.1 MB 12.9 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
import spacy

# Load the small English pipeline downloaded above; `nlp` is the callable used
# in the following cells to turn raw text into a Doc.
nlp = spacy.load('en_core_web_sm')

In [8]:
# Sample news snippet used as the input text for the examples in this notebook.
text = """The Election Commission on Friday debarred Assam Minister and BJP leader Himanta Biswa Sarma from campaigning for ongoing Assam elections for 48 hours with effect from today. This came after Sarma allegedly made threatening remarks against opposition leader Hagrama Mohilary of the Bodoland People's Front. EC had on Thursday asked Sarma for an explanation by today over his remarks."""
# Run the pipeline over the text to obtain the Doc used by the cells below.
doc = nlp(text)
# Bare expression so the notebook displays the Doc (it renders as the original text).
doc

The Election Commission on Friday debarred Assam Minister and BJP leader Himanta Biswa Sarma from campaigning for ongoing Assam elections for 48 hours with effect from today. This came after Sarma allegedly made threatening remarks against opposition leader Hagrama Mohilary of the Bodoland People's Front. EC had on Thursday asked Sarma for an explanation by today over his remarks.

In [10]:
# Sentence tokenization: gather every sentence span of the parsed Doc into a list.
sent_tokenize = [sentence for sentence in doc.sents]
sent_tokenize

[The Election Commission on Friday debarred Assam Minister and BJP leader Himanta Biswa Sarma from campaigning for ongoing Assam elections for 48 hours with effect from today.,
 This came after Sarma allegedly made threatening remarks against opposition leader Hagrama Mohilary of the Bodoland People's Front.,
 EC had on Thursday asked Sarma for an explanation by today over his remarks.]

In [11]:
# Number of tokens in the Doc; punctuation marks are counted as tokens too.
len(doc)

63

In [12]:
# Word tokenization: show every token of the Doc on its own line.
for token in doc:
    print(token)

The
Election
Commission
on
Friday
debarred
Assam
Minister
and
BJP
leader
Himanta
Biswa
Sarma
from
campaigning
for
ongoing
Assam
elections
for
48
hours
with
effect
from
today
.
This
came
after
Sarma
allegedly
made
threatening
remarks
against
opposition
leader
Hagrama
Mohilary
of
the
Bodoland
People
's
Front
.
EC
had
on
Thursday
asked
Sarma
for
an
explanation
by
today
over
his
remarks
.


In [13]:
# Import the stop-word set explicitly. Reaching it through the attribute chain
# `spacy.lang.en.stop_words` only works if that submodule has already been
# imported as a side effect of something else (e.g. loading an English model).
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

# Display the set of English stop words.
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [21]:
# Stop-word removal: print only the tokens whose lowercase text is not a stop word.
for token in doc:
    if token.lower_ in stopwords:
        continue
    print(token.text)

Election
Commission
Friday
debarred
Assam
Minister
BJP
leader
Himanta
Biswa
Sarma
campaigning
ongoing
Assam
elections
48
hours
effect
today
.
came
Sarma
allegedly
threatening
remarks
opposition
leader
Hagrama
Mohilary
Bodoland
People
.
EC
Thursday
asked
Sarma
explanation
today
remarks
.


In [22]:
# Punctuation removal: skip punctuation tokens and print everything else.
for token in doc:
    if token.is_punct:
        continue
    print(token)

The
Election
Commission
on
Friday
debarred
Assam
Minister
and
BJP
leader
Himanta
Biswa
Sarma
from
campaigning
for
ongoing
Assam
elections
for
48
hours
with
effect
from
today
This
came
after
Sarma
allegedly
made
threatening
remarks
against
opposition
leader
Hagrama
Mohilary
of
the
Bodoland
People
's
Front
EC
had
on
Thursday
asked
Sarma
for
an
explanation
by
today
over
his
remarks


In [28]:
# Load the small English pipeline (the same model name that was loaded earlier
# in this notebook); preprocessing() below reads this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')
def preprocessing(data):
    """Return `data` with stop words, punctuation and whitespace tokens removed.

    The text is parsed with the module-level `nlp` pipeline. A token is dropped
    when its lowercase text is in the module-level `stopwords` set, when it is
    punctuation, or when it consists only of whitespace. The texts of the
    remaining tokens are joined with single spaces.

    Args:
        data: Raw text to clean.

    Returns:
        A single space-separated string of the tokens that were kept.
    """
    doc = nlp(data)
    kept = [
        token.text
        for token in doc
        # is_space filters the tokens emitted for newlines / repeated spaces,
        # which would otherwise leak raw whitespace into the joined result.
        if not (token.is_punct or token.is_space)
        and token.text.lower() not in stopwords
    ]
    return " ".join(kept)
# Same sample snippet as used earlier in the notebook, redefined here as the input.
text = """The Election Commission on Friday debarred Assam Minister and BJP leader Himanta Biswa Sarma from campaigning for ongoing Assam elections for 48 hours with effect from today. This came after Sarma allegedly made threatening remarks against opposition leader Hagrama Mohilary of the Bodoland People's Front. EC had on Thursday asked Sarma for an explanation by today over his remarks."""
# Clean the text and show the resulting space-separated string.
ans = preprocessing(text)
print(ans)

Election Commission Friday debarred Assam Minister BJP leader Himanta Biswa Sarma campaigning ongoing Assam elections 48 hours effect today came Sarma allegedly threatening remarks opposition leader Hagrama Mohilary Bodoland People EC Thursday asked Sarma explanation today remarks


In [29]:
# Lemmatization: print each token next to its lemma (base form),
# e.g. "debarred --> debar" in the output below.

for token in doc:
    print(f'{token} --> {token.lemma_}')

The --> the
Election --> Election
Commission --> Commission
on --> on
Friday --> Friday
debarred --> debar
Assam --> Assam
Minister --> Minister
and --> and
BJP --> BJP
leader --> leader
Himanta --> Himanta
Biswa --> Biswa
Sarma --> Sarma
from --> from
campaigning --> campaign
for --> for
ongoing --> ongoing
Assam --> Assam
elections --> election
for --> for
48 --> 48
hours --> hour
with --> with
effect --> effect
from --> from
today --> today
. --> .
This --> this
came --> come
after --> after
Sarma --> Sarma
allegedly --> allegedly
made --> make
threatening --> threatening
remarks --> remark
against --> against
opposition --> opposition
leader --> leader
Hagrama --> Hagrama
Mohilary --> Mohilary
of --> of
the --> the
Bodoland --> Bodoland
People --> People
's --> 's
Front --> Front
. --> .
EC --> EC
had --> have
on --> on
Thursday --> Thursday
asked --> ask
Sarma --> Sarma
for --> for
an --> an
explanation --> explanation
by --> by
today --> today
over --> over
his --> his
remarks --

In [30]:

# Print token, lemma, coarse part-of-speech (pos_) and a human-readable
# explanation. Note the explanation is looked up for the fine-grained tag
# (tag_), not for pos_, so the last column describes tag_.
for token in doc:
    print(f'{token} --> {token.lemma_} --> {token.pos_} --> {spacy.explain(token.tag_)}')

The --> the --> DET --> determiner
Election --> Election --> PROPN --> noun, proper singular
Commission --> Commission --> PROPN --> noun, proper singular
on --> on --> ADP --> conjunction, subordinating or preposition
Friday --> Friday --> PROPN --> noun, proper singular
debarred --> debar --> VERB --> verb, past tense
Assam --> Assam --> PROPN --> noun, proper singular
Minister --> Minister --> PROPN --> noun, proper singular
and --> and --> CCONJ --> conjunction, coordinating
BJP --> BJP --> PROPN --> noun, proper singular
leader --> leader --> NOUN --> noun, singular or mass
Himanta --> Himanta --> PROPN --> noun, proper singular
Biswa --> Biswa --> PROPN --> noun, proper singular
Sarma --> Sarma --> PROPN --> noun, proper singular
from --> from --> ADP --> conjunction, subordinating or preposition
campaigning --> campaign --> VERB --> verb, gerund or present participle
for --> for --> ADP --> conjunction, subordinating or preposition
ongoing --> ongoing --> ADJ --> adjective
Assam

In [31]:
# Named entities found in the Doc, displayed as a tuple of spans.
doc.ents

(The Election Commission,
 Friday,
 BJP,
 Himanta Biswa Sarma,
 48 hours,
 today,
 Sarma,
 Hagrama Mohilary,
 the Bodoland People's Front,
 EC,
 Thursday,
 Sarma,
 today)

In [32]:
# Each item of doc.ents is an entity span, so print it together with its label.
for ent in doc.ents:
    print(f'{ent} --> {ent.label_}')

The Election Commission --> ORG
Friday --> DATE
BJP --> ORG
Himanta Biswa Sarma --> PERSON
48 hours --> TIME
today --> DATE
Sarma --> PERSON
Hagrama Mohilary --> PERSON
the Bodoland People's Front --> ORG
EC --> ORG
Thursday --> DATE
Sarma --> PERSON
today --> DATE


In [34]:
from spacy import displacy

# Render the Doc with displaCy's entity style ('ent').
displacy.render(doc,style='ent')

In [35]:
# Render the Doc with displaCy's dependency style ('dep').
displacy.render(doc,style='dep')

In [36]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
[K     |████████████████████████████████| 47.1 MB 5.2 MB/s eta 0:00:011    |█████████████▌                  | 19.9 MB 5.6 MB/s eta 0:00:05     |███████████████▍                | 22.7 MB 15.8 MB/s eta 0:00:02     |██████████████████████▋         | 33.2 MB 1.5 MB/s eta 0:00:09
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [37]:
# Switch to the medium English pipeline and re-parse the sample text.
# NOTE: this rebinds the module-level `nlp` and `doc`, so every later cell
# works with the en_core_web_md pipeline and its Doc.
nlp = spacy.load('en_core_web_md')
doc = nlp(text)
# Report, per token, whether a word vector is available for it
# (the output below shows False for some of the proper names).
for token in doc:
    print(f'{token} ---> {token.has_vector}')

The ---> True
Election ---> True
Commission ---> True
on ---> True
Friday ---> True
debarred ---> True
Assam ---> True
Minister ---> True
and ---> True
BJP ---> True
leader ---> True
Himanta ---> False
Biswa ---> False
Sarma ---> True
from ---> True
campaigning ---> True
for ---> True
ongoing ---> True
Assam ---> True
elections ---> True
for ---> True
48 ---> True
hours ---> True
with ---> True
effect ---> True
from ---> True
today ---> True
. ---> True
This ---> True
came ---> True
after ---> True
Sarma ---> True
allegedly ---> True
made ---> True
threatening ---> True
remarks ---> True
against ---> True
opposition ---> True
leader ---> True
Hagrama ---> False
Mohilary ---> False
of ---> True
the ---> True
Bodoland ---> False
People ---> True
's ---> True
Front ---> True
. ---> True
EC ---> True
had ---> True
on ---> True
Thursday ---> True
asked ---> True
Sarma ---> True
for ---> True
an ---> True
explanation ---> True
by ---> True
today ---> True
over ---> True
his ---> True
remarks

In [38]:
# NOTE(review): these two lines repeat the load + parse already done in the
# previous cell with the same model and text; they look redundant when the
# notebook is run top to bottom.
nlp = spacy.load('en_core_web_md')
doc = nlp(text)
# Print the full vector of every token. This produces very long output;
# consider slicing the vector if only a preview is needed.
for token in doc:
    print(f'{token} ---> {token.vector}')

The ---> [ 2.7204e-01 -6.2030e-02 -1.8840e-01  2.3225e-02 -1.8158e-02  6.7192e-03
 -1.3877e-01  1.7708e-01  1.7709e-01  2.5882e+00 -3.5179e-01 -1.7312e-01
  4.3285e-01 -1.0708e-01  1.5006e-01 -1.9982e-01 -1.9093e-01  1.1871e+00
 -1.6207e-01 -2.3538e-01  3.6640e-03 -1.9156e-01 -8.5662e-02  3.9199e-02
 -6.6449e-02 -4.2090e-02 -1.9122e-01  1.1679e-02 -3.7138e-01  2.1886e-01
  1.1423e-03  4.3190e-01 -1.4205e-01  3.8059e-01  3.0654e-01  2.0167e-02
 -1.8316e-01 -6.5186e-03 -8.0549e-03 -1.2063e-01  2.7507e-02  2.9839e-01
 -2.2896e-01 -2.2882e-01  1.4671e-01 -7.6301e-02 -1.2680e-01 -6.6651e-03
 -5.2795e-02  1.4258e-01  1.5610e-01  5.5510e-02 -1.6149e-01  9.6290e-02
 -7.6533e-02 -4.9971e-02 -1.0195e-02 -4.7641e-02 -1.6679e-01 -2.3940e-01
  5.0141e-03 -4.9175e-02  1.3338e-02  4.1923e-01 -1.0104e-01  1.5111e-02
 -7.7706e-02 -1.3471e-01  1.1900e-01  1.0802e-01  2.1061e-01 -5.1904e-02
  1.8527e-01  1.7856e-01  4.1293e-02 -1.4385e-02 -8.2567e-02 -3.5483e-02
 -7.6173e-02 -4.5367e-02  8.9281e-02  3.36

In [46]:
# Similarity between words

# The two single-word texts to compare.
text1='zebra'
text2='lion'

# Parse each word with the current `nlp` pipeline (en_core_web_md was the last
# one assigned in the cells above).
doc1=nlp(text1)
doc2=nlp(text2)

# Similarity score between the two Docs, returned as a float.
doc1.similarity(doc2)

0.7123956359830604