In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline once; the cells below reuse this `nlp`
# object to turn raw strings into parsed documents.
nlp = spacy.load("en_core_web_sm")

In [7]:
text = "Elon flew to mars on Wednesday. He carried a banana with him."

doc = nlp(text)

# One row per token: text | coarse POS | POS description | fine-grained tag
# followed by that tag's description.
for token in doc:
    print(
        f"{token!s} | {token.pos_} | {spacy.explain(token.pos_)}"
        f" | {token.tag_} {spacy.explain(token.tag_)}"
    )

Elon | PROPN | proper noun | NNP noun, proper singular
flew | VERB | verb | VBD verb, past tense
to | ADP | adposition | IN conjunction, subordinating or preposition
mars | NOUN | noun | NNS noun, plural
on | ADP | adposition | IN conjunction, subordinating or preposition
Wednesday | PROPN | proper noun | NNP noun, proper singular
. | PUNCT | punctuation | . punctuation mark, sentence closer
He | PRON | pronoun | PRP pronoun, personal
carried | VERB | verb | VBD verb, past tense
a | DET | determiner | DT determiner
banana | NOUN | noun | NN noun, singular or mass
with | ADP | adposition | IN conjunction, subordinating or preposition
him | PRON | pronoun | PRP pronoun, personal
. | PUNCT | punctuation | . punctuation mark, sentence closer


In [5]:
# Show the names of the components in the loaded pipeline, in order.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
doc = nlp("He quits the job")

# Inspect the token at index 1: its text, its fine-grained tag, and the
# human-readable description of that tag.
print(doc[1].text, doc[1].tag_, spacy.explain(doc[1].tag_), sep=" | ")

quits | VBZ | verb, 3rd person singular present


In [10]:
# Longer sample: an earnings announcement containing bullet characters,
# currency amounts, percentages and quoted speech. Kept verbatim because the
# token counts computed further down depend on the exact characters.
text = """Microsoft Corp. today announced the following results for the quarter ended March 31, 2024, as compared to the corresponding period of last fiscal year:
·        Revenue was $61.9 billion and increased 17%
·        Operating income was $27.6 billion and increased 23%
·        Net income was $21.9 billion and increased 20%
·        Diluted earnings per share was $2.94 and increased 20%
“Microsoft Copilot and Copilot stack are orchestrating a new era of AI transformation, driving better business outcomes across every role and industry," said Satya Nadella, chairman and chief executive officer of Microsoft.
“This quarter Microsoft Cloud revenue was $35.1 billion, up 23% year-over-year, driven by strong execution by our sales teams and partners,” said Amy Hood, executive vice president and chief financial officer of Microsoft."""

# Rebinds `doc`: from here on `doc` refers to this text, not the earlier sentence.
doc = nlp(text)

In [15]:
# Coarse POS labels to drop: whitespace tokens, the "X" (other) category,
# and punctuation.
EXCLUDED_POS = {"SPACE", "X", "PUNCT"}

# Keep every token of `doc` whose coarse POS is not excluded, preserving the
# original document order.
filtered_tokens = [token for token in doc if token.pos_ not in EXCLUDED_POS]

In [16]:
# Display the tokens that survived the POS filter (whitespace, "X" and
# punctuation tokens were dropped). The list is long.
filtered_tokens

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 March,
 31,
 2024,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 61.9,
 billion,
 and,
 increased,
 17,
 %,
 Operating,
 income,
 was,
 $,
 27.6,
 billion,
 and,
 increased,
 23,
 %,
 Net,
 income,
 was,
 $,
 21.9,
 billion,
 and,
 increased,
 20,
 %,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 2.94,
 and,
 increased,
 20,
 %,
 Microsoft,
 Copilot,
 and,
 Copilot,
 stack,
 are,
 orchestrating,
 a,
 new,
 era,
 of,
 AI,
 transformation,
 driving,
 better,
 business,
 outcomes,
 across,
 every,
 role,
 and,
 industry,
 said,
 Satya,
 Nadella,
 chairman,
 and,
 chief,
 executive,
 officer,
 of,
 Microsoft,
 This,
 quarter,
 Microsoft,
 Cloud,
 revenue,
 was,
 $,
 35.1,
 billion,
 up,
 23,
 %,
 year,
 over,
 year,
 driven,
 by,
 strong,
 execution,
 by,
 our,
 sales,
 teams,
 and,
 partners,
 said,
 Amy,
 Hood,
 executive,
 vice,
 presi

In [21]:
# Tally how many tokens in `doc` carry each coarse POS value.
# The result is a dict keyed by integer POS IDs, not by label strings.
count = doc.count_by(spacy.attrs.POS)
count

{96: 14,
 92: 36,
 100: 15,
 90: 6,
 85: 11,
 93: 16,
 97: 22,
 98: 1,
 84: 12,
 103: 10,
 87: 6,
 99: 5,
 89: 9,
 86: 1,
 95: 1}

In [22]:
# 96 is one of the integer keys in `count`; looking it up in the vocab
# gives back the readable label for that ID.
doc.vocab[96].text

'PROPN'

In [23]:
# Print each POS label next to its token count, resolving the integer ID
# through the vocab.
for pos_id, n_tokens in count.items():
    print(f"{doc.vocab[pos_id].text} {n_tokens}")

PROPN 14
NOUN 36
VERB 15
DET 6
ADP 11
NUM 16
PUNCT 22
SCONJ 1
ADJ 12
SPACE 10
AUX 6
SYM 5
CCONJ 9
ADV 1
PRON 1
