In [1]:
! pip install spacy



In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 12.9 MB/s eta 0:00:01    |█████████████████████▎          | 9.1 MB 12.9 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
import spacy

# Name of the installed pipeline package to load.
MODEL_NAME = "en_core_web_sm"

# Shared pipeline object; the following cells call it to parse text.
nlp = spacy.load(MODEL_NAME)

In [8]:
# Sample news snippet used throughout the notebook. Adjacent string literals
# are concatenated by Python, so this is one single-line string.
text = (
    "The Election Commission on Friday debarred Assam Minister and BJP leader "
    "Himanta Biswa Sarma from campaigning for ongoing Assam elections for 48 hours "
    "with effect from today. "
    "This came after Sarma allegedly made threatening remarks against opposition "
    "leader Hagrama Mohilary of the Bodoland People's Front. "
    "EC had on Thursday asked Sarma for an explanation by today over his remarks."
)
# Run the loaded pipeline over the raw text; later cells read from `doc`.
doc = nlp(text)
# Bare last expression so the notebook echoes the parsed document.
doc

The Election Commission on Friday debarred Assam Minister and BJP leader Himanta Biswa Sarma from campaigning for ongoing Assam elections for 48 hours with effect from today. This came after Sarma allegedly made threatening remarks against opposition leader Hagrama Mohilary of the Bodoland People's Front. EC had on Thursday asked Sarma for an explanation by today over his remarks.

In [10]:
# Sentence tokenization: collect the document's sentences into a list so they
# can be displayed (and reused) below.
sent_tokenize = [sentence for sentence in doc.sents]
sent_tokenize

[The Election Commission on Friday debarred Assam Minister and BJP leader Himanta Biswa Sarma from campaigning for ongoing Assam elections for 48 hours with effect from today.,
 This came after Sarma allegedly made threatening remarks against opposition leader Hagrama Mohilary of the Bodoland People's Front.,
 EC had on Thursday asked Sarma for an explanation by today over his remarks.]

In [11]:
# Number of tokens in the parsed document.
len(doc)

63

In [12]:
# Word tokenization: iterating over the document yields one token at a time;
# print the text of each on its own line.
for token in doc:
    print(token.text)

The
Election
Commission
on
Friday
debarred
Assam
Minister
and
BJP
leader
Himanta
Biswa
Sarma
from
campaigning
for
ongoing
Assam
elections
for
48
hours
with
effect
from
today
.
This
came
after
Sarma
allegedly
made
threatening
remarks
against
opposition
leader
Hagrama
Mohilary
of
the
Bodoland
People
's
Front
.
EC
had
on
Thursday
asked
Sarma
for
an
explanation
by
today
over
his
remarks
.


In [13]:
# English stop-word set shipped with spaCy. Import it explicitly: reaching it
# through attribute access on `spacy.lang.en` only works once that submodule
# has already been imported (e.g. as a side effect of loading an English
# pipeline), which makes the cell depend on execution history.
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = STOP_WORDS
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [21]:
# Print only the tokens whose lower-cased text is not in the stop-word set.
for token in doc:
    if token.lower_ in stopwords:
        continue  # stop word: skip it
    print(token.text)

Election
Commission
Friday
debarred
Assam
Minister
BJP
leader
Himanta
Biswa
Sarma
campaigning
ongoing
Assam
elections
48
hours
effect
today
.
came
Sarma
allegedly
threatening
remarks
opposition
leader
Hagrama
Mohilary
Bodoland
People
.
EC
Thursday
asked
Sarma
explanation
today
remarks
.


In [22]:
# Punctuation removal: print every token that is not flagged as punctuation.
non_punct_tokens = (candidate for candidate in doc if not candidate.is_punct)
for token in non_punct_tokens:
    print(token.text)

The
Election
Commission
on
Friday
debarred
Assam
Minister
and
BJP
leader
Himanta
Biswa
Sarma
from
campaigning
for
ongoing
Assam
elections
for
48
hours
with
effect
from
today
This
came
after
Sarma
allegedly
made
threatening
remarks
against
opposition
leader
Hagrama
Mohilary
of
the
Bodoland
People
's
Front
EC
had
on
Thursday
asked
Sarma
for
an
explanation
by
today
over
his
remarks


In [28]:
# The pipeline (`nlp`), the stop-word set (`stopwords`) and the sample `text`
# are defined in earlier cells; they are reused here instead of reloading the
# model and repeating the text literal.
def preprocessing(data):
    """Return ``data`` with stop words, punctuation and whitespace removed.

    The text is tokenized with the notebook-level ``nlp`` pipeline. A token is
    dropped when its lower-cased text is in ``stopwords``, when it is flagged
    as punctuation, or when it consists only of whitespace (so repeated spaces
    or newlines in the input do not leak into the result).

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The texts of the remaining tokens joined by single spaces; an empty
        string when no token survives.
    """
    kept = [
        token.text
        for token in nlp(data)
        if token.lower_ not in stopwords
        and not token.is_punct
        and not token.is_space
    ]
    return " ".join(kept)


ans = preprocessing(text)
print(ans)

Election Commission Friday debarred Assam Minister BJP leader Himanta Biswa Sarma campaigning ongoing Assam elections 48 hours effect today came Sarma allegedly threatening remarks opposition leader Hagrama Mohilary Bodoland People EC Thursday asked Sarma explanation today remarks


In [29]:
# Lemmatization: show each token's text next to its lemma.
for token in doc:
    surface, lemma = token.text, token.lemma_
    print(f"{surface} --> {lemma}")

The --> the
Election --> Election
Commission --> Commission
on --> on
Friday --> Friday
debarred --> debar
Assam --> Assam
Minister --> Minister
and --> and
BJP --> BJP
leader --> leader
Himanta --> Himanta
Biswa --> Biswa
Sarma --> Sarma
from --> from
campaigning --> campaign
for --> for
ongoing --> ongoing
Assam --> Assam
elections --> election
for --> for
48 --> 48
hours --> hour
with --> with
effect --> effect
from --> from
today --> today
. --> .
This --> this
came --> come
after --> after
Sarma --> Sarma
allegedly --> allegedly
made --> make
threatening --> threatening
remarks --> remark
against --> against
opposition --> opposition
leader --> leader
Hagrama --> Hagrama
Mohilary --> Mohilary
of --> of
the --> the
Bodoland --> Bodoland
People --> People
's --> 's
Front --> Front
. --> .
EC --> EC
had --> have
on --> on
Thursday --> Thursday
asked --> ask
Sarma --> Sarma
for --> for
an --> an
explanation --> explanation
by --> by
today --> today
over --> over
his --> his
remarks --

In [30]:

# For every token show: text, lemma, coarse part-of-speech label, and the
# human-readable description of its fine-grained tag.
for token in doc:
    tag_description = spacy.explain(token.tag_)
    print(f"{token.text} --> {token.lemma_} --> {token.pos_} --> {tag_description}")

The --> the --> DET --> determiner
Election --> Election --> PROPN --> noun, proper singular
Commission --> Commission --> PROPN --> noun, proper singular
on --> on --> ADP --> conjunction, subordinating or preposition
Friday --> Friday --> PROPN --> noun, proper singular
debarred --> debar --> VERB --> verb, past tense
Assam --> Assam --> PROPN --> noun, proper singular
Minister --> Minister --> PROPN --> noun, proper singular
and --> and --> CCONJ --> conjunction, coordinating
BJP --> BJP --> PROPN --> noun, proper singular
leader --> leader --> NOUN --> noun, singular or mass
Himanta --> Himanta --> PROPN --> noun, proper singular
Biswa --> Biswa --> PROPN --> noun, proper singular
Sarma --> Sarma --> PROPN --> noun, proper singular
from --> from --> ADP --> conjunction, subordinating or preposition
campaigning --> campaign --> VERB --> verb, gerund or present participle
for --> for --> ADP --> conjunction, subordinating or preposition
ongoing --> ongoing --> ADJ --> adjective
Assam

In [31]:
# Entity spans detected in the parsed document.
doc.ents

(The Election Commission,
 Friday,
 BJP,
 Himanta Biswa Sarma,
 48 hours,
 today,
 Sarma,
 Hagrama Mohilary,
 the Bodoland People's Front,
 EC,
 Thursday,
 Sarma,
 today)

In [32]:
# Print each detected entity together with its entity label.
for ent in doc.ents:
    print(f"{ent.text} --> {ent.label_}")

The Election Commission --> ORG
Friday --> DATE
BJP --> ORG
Himanta Biswa Sarma --> PERSON
48 hours --> TIME
today --> DATE
Sarma --> PERSON
Hagrama Mohilary --> PERSON
the Bodoland People's Front --> ORG
EC --> ORG
Thursday --> DATE
Sarma --> PERSON
today --> DATE


In [34]:
from spacy import displacy

# Visualize the document with the entity ("ent") style.
displacy.render(doc, style="ent")

In [35]:
# Visualize the document with the dependency ("dep") style.
displacy.render(doc, style="dep")