In [4]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.10-cp311-cp311-macosx_11_0_arm64.whl (26 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.8-cp311-cp311-macosx_11_0_arm64.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.9-cp311-cp311-macosx_11_0_arm64.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hColle

In [7]:
import spacy

In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
introduction_text = ('This tutorial is about Natural Language Processing in Spacy.')

In [10]:
introduction_doc = nlp(introduction_text)

In [11]:
introduction_doc

This tutorial is about Natural Language Processing in Spacy.

In [6]:
# Extract tokens for the given doc

In [12]:
print ([token.text for token in introduction_doc])
['This', 'tutorial', 'is', 'about', 'Natural', 'Language',
'Processing', 'in', 'Spacy', '.']

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'Spacy',
 '.']

In [13]:
about_text = ('Hello all, I am Dr. Chetana. Gus Proto is a Python developer currently'
               ' working for a London-based Fintech'
               ' company. He is interested in learning'
               ' Natural Language Processing.')

In [14]:
about_text

'Hello all, I am Dr. Chetana. Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing.'

In [15]:
about_doc = nlp(about_text)

In [16]:
sentences = list(about_doc.sents)

In [17]:
len(sentences)

3

In [18]:
for sentence in sentences:
    print (sentence)

Hello all, I am Dr. Chetana.
Gus Proto is a Python developer currently working for a London-based Fintech company.
He is interested in learning Natural Language Processing.


In [19]:
for token in about_doc:
    print (token, token.idx)

Hello 0
all 6
, 9
I 11
am 13
Dr. 16
Chetana 20
. 27
Gus 29
Proto 33
is 39
a 42
Python 44
developer 51
currently 61
working 71
for 79
a 83
London 85
- 91
based 92
Fintech 98
company 106
. 113
He 115
is 118
interested 121
in 132
learning 135
Natural 144
Language 152
Processing 161
. 171


In [20]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [21]:
len(spacy_stopwords)

326

In [22]:
for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

however
beforehand
give
n‘t
he
hereby
two
both
quite
regarding


In [23]:
for token in about_doc:
    if not token.is_stop:
        print (token)

Hello
,
Dr.
Chetana
.
Gus
Proto
Python
developer
currently
working
London
-
based
Fintech
company
.
interested
learning
Natural
Language
Processing
.


In [18]:
#Lemmatization

In [24]:
conference_help_text = ('Gus is helping organize a developer'
...     'conference on Applications of Natural Language'
...     ' Processing. He keeps organizing local Python meetups'
...     ' and several internal talks at his workplace.')

In [32]:
conference_help_doc = nlp(conference_help_text)

In [33]:
for token in conference_help_doc:
    print (token, token.lemma_)

Gus Gus
is be
helping helping
organize organize
a a
developerconference developerconference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He he
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his his
workplace workplace
. .


In [22]:
# Part of Speech Tagging
for token in about_doc:
    print (token, token.tag_, token.pos_, spacy.explain(token.tag_))

Hello UH INTJ interjection
all DT DET determiner
, , PUNCT punctuation mark, comma
I PRP PRON pronoun, personal
am VBP AUX verb, non-3rd person singular present
Dr. NNP PROPN noun, proper singular
Chetana NNP PROPN noun, proper singular
. . PUNCT punctuation mark, sentence closer
Gus NNP PROPN noun, proper singular
Proto NNP PROPN noun, proper singular
is VBZ AUX verb, 3rd person singular present
a DT DET determiner
Python NNP PROPN noun, proper singular
developer NN NOUN noun, singular or mass
currently RB ADV adverb
working VBG VERB verb, gerund or present participle
for IN ADP conjunction, subordinating or preposition
a DT DET determiner
London NNP PROPN noun, proper singular
- HYPH PUNCT punctuation mark, hyphen
based VBN VERB verb, past participle
Fintech NNP PROPN noun, proper singular
company NN NOUN noun, singular or mass
. . PUNCT punctuation mark, sentence closer
He PRP PRON pronoun, personal
is VBZ AUX verb, 3rd person singular present
interested JJ ADJ adjective
in IN ADP c

In [23]:
nouns = []
adjectives = []
for token in about_doc:
    if token.pos_ == 'NOUN':
        nouns.append(token)
    if token.pos_ == 'ADJ':
        adjectives.append(token)

In [24]:
nouns

[developer, company]

In [25]:
adjectives

[interested]

In [26]:
from spacy import displacy

In [28]:
about_interest_text = ('He is interested in learning'
...     ' Natural Language Processing.')

In [29]:
about_interest_doc = nlp(about_interest_text)

In [31]:
displacy.render(about_interest_doc, style='dep')