In [1]:
import spacy

In [7]:
# Load the small English pipeline once; every later cell reuses `nlp`.
nlp = spacy.load(name="en_core_web_sm")

In [8]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [10]:
# Parse a two-sentence example and list each token with its coarse
# part-of-speech tag and the human-readable description of that tag.
doc = nlp("Elon flew a space shuttle to Mars yesterday. He carried the Biryani masala with him.")

for tok in doc:
    pos_label = tok.pos_
    print(f"{tok} | {pos_label} | {spacy.explain(pos_label)}")

Elon | PROPN | proper noun
flew | VERB | verb
a | DET | determiner
space | NOUN | noun
shuttle | NOUN | noun
to | ADP | adposition
Mars | PROPN | proper noun
yesterday | NOUN | noun
. | PUNCT | punctuation
He | PRON | pronoun
carried | VERB | verb
the | DET | determiner
Biryani | ADJ | adjective
masala | NOUN | noun
with | ADP | adposition
him | PRON | pronoun
. | PUNCT | punctuation


In [15]:
# Show both the coarse tag (pos_) and the fine-grained tag (tag_) of every
# token, each followed by its spacy.explain() description.
docu = nlp("Wow! Dr. Strange made $265 million on the very first day.")

for tok in docu:
    columns = (
        tok,
        tok.pos_,
        spacy.explain(tok.pos_),
        tok.tag_,
        spacy.explain(tok.tag_),
    )
    # sep reproduces the "  |  " spacing between columns.
    print(*columns, sep="  |  ")

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
Dr.  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Strange  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
made  |  VERB  |  verb  |  VBD  |  verb, past tense
$  |  SYM  |  symbol  |  $  |  symbol, currency
265  |  NUM  |  numeral  |  CD  |  cardinal number
million  |  NUM  |  numeral  |  CD  |  cardinal number
on  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  determiner  |  DT  |  determiner
very  |  ADV  |  adverb  |  RB  |  adverb
first  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
day  |  NOUN  |  noun  |  NN  |  noun, singular or mass
.  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer


In [20]:
# Inspect the fine-grained tag of a single token.
# Index 1 is the second token of the sentence ("quits").
doc1 = nlp("He quits the job.")

# Bind the token once instead of re-indexing the Doc for every attribute;
# the former bare `doc1[1]` statement displayed nothing and was a no-op.
verb = doc1[1]
print(verb.text, "|", verb.tag_, "|", spacy.explain(verb.tag_))

quits | VBZ | verb, 3rd person singular present


In [24]:
# Sample input for the filtering/counting cells: an excerpt of a Microsoft
# earnings announcement. The bullet characters, runs of spaces and blank
# lines are part of the string and are tokenized along with the words.
earnings_text = """Microsoft Corp. today announced the following results for the quarter ended December 31, 2023, as compared to the corresponding period of last fiscal year:

·        Revenue was $62.0 billion and increased 18% (up 16% in constant currency)

·        Operating income was $27.0 billion and increased 33%, and increased 25% non-GAAP (up 23% in constant currency)

·        Net income was $21.9 billion and increased 33%, and increased 26% non-GAAP (up 23% in constant currency)

·        Diluted earnings per share was $2.93 and increased 33%, and increased 26% non-GAAP (up 23% in constant currency) etc.
"""

In [32]:
# Start with an empty collection for the tokens that survive filtering,
# then run the pipeline over the earnings excerpt.
filtered_tokens = list()
docu = nlp(earnings_text)

In [33]:
# Keep only the tokens of the earnings excerpt whose coarse POS tag is not
# whitespace (SPACE), "other" (X) or punctuation (PUNCT), and print each
# kept token with its tag and the tag's description.
#
# Iterate over `docu` (the parsed earnings text). `doc` still refers to the
# earlier "Elon flew ..." example, so looping over it would filter the wrong
# document after Restart & Run All.
#
# The list is reset here so that re-running this cell does not append the
# same tokens a second time.
filtered_tokens = []
for token in docu:
    if token.pos_ not in ("SPACE", "X", "PUNCT"):
        filtered_tokens.append(token)
        print(token, "|", token.pos_, "|", spacy.explain(token.pos_))

Microsoft | PROPN | proper noun
Corp. | PROPN | proper noun
today | NOUN | noun
announced | VERB | verb
the | DET | determiner
following | VERB | verb
results | NOUN | noun
for | ADP | adposition
the | DET | determiner
quarter | NOUN | noun
ended | VERB | verb
December | PROPN | proper noun
31 | NUM | numeral
2023 | NUM | numeral
as | SCONJ | subordinating conjunction
compared | VERB | verb
to | ADP | adposition
the | DET | determiner
corresponding | ADJ | adjective
period | NOUN | noun
of | ADP | adposition
last | ADJ | adjective
fiscal | ADJ | adjective
year | NOUN | noun
Revenue | NOUN | noun
was | AUX | auxiliary
$ | SYM | symbol
62.0 | NUM | numeral
billion | NUM | numeral
and | CCONJ | coordinating conjunction
increased | VERB | verb
18 | NUM | numeral
% | NOUN | noun
up | ADV | adverb
16 | NUM | numeral
% | NOUN | noun
in | ADP | adposition
constant | ADJ | adjective
currency | NOUN | noun
Operating | VERB | verb
income | NOUN | noun
was | AUX | auxiliary
$ | SYM | symbol
27.0 |

In [34]:
# Display the tokens that survived the POS filter.
filtered_tokens

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2023,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 62.0,
 billion,
 and,
 increased,
 18,
 %,
 up,
 16,
 %,
 in,
 constant,
 currency,
 Operating,
 income,
 was,
 $,
 27.0,
 billion,
 and,
 increased,
 33,
 %,
 and,
 increased,
 25,
 %,
 non,
 -,
 GAAP,
 up,
 23,
 %,
 in,
 constant,
 currency,
 Net,
 income,
 was,
 $,
 21.9,
 billion,
 and,
 increased,
 33,
 %,
 and,
 increased,
 26,
 %,
 non,
 -,
 GAAP,
 up,
 23,
 %,
 in,
 constant,
 currency,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 2.93,
 and,
 increased,
 33,
 %,
 and,
 increased,
 26,
 %,
 non,
 -,
 GAAP,
 up,
 23,
 %,
 in,
 constant,
 currency]

In [36]:
# Count how many tokens of the earnings excerpt carry each coarse POS tag.
# Use `docu` (the parsed earnings text); `doc` is the earlier "Elon flew ..."
# example and would yield counts for the wrong document on a fresh run.
# The result maps integer POS attribute IDs to occurrence counts.
count = docu.count_by(spacy.attrs.POS)
count

{96: 3,
 92: 34,
 100: 13,
 90: 3,
 85: 8,
 93: 20,
 97: 18,
 98: 1,
 84: 8,
 103: 9,
 87: 4,
 99: 4,
 89: 7,
 86: 4}

In [39]:
# Translate each integer POS ID back to its string label through the vocab
# and print it next to the number of tokens that carry it.
for pos_id, n_tokens in count.items():
    pos_name = doc.vocab[pos_id].text
    print(f"{pos_name} | {n_tokens}")

PROPN | 3
NOUN | 34
VERB | 13
DET | 3
ADP | 8
NUM | 20
PUNCT | 18
SCONJ | 1
ADJ | 8
SPACE | 9
AUX | 4
SYM | 4
CCONJ | 7
ADV | 4
