In [1]:
import spacy

# Name of the spaCy pipeline package that the rest of the notebook relies on.
MODEL_NAME = "en_core_web_sm"

# Load the pipeline once; later cells call `nlp(text)` to get a parsed doc.
nlp = spacy.load(MODEL_NAME)

In [3]:
# Parse two short sentences, then list every token next to its coarse
# part-of-speech tag and spaCy's plain-English description of that tag.
doc = nlp('Elon flew to mars yesterday. He carried masala with him.')

for tok in doc:
    coarse_pos = tok.pos_
    print(tok, coarse_pos, spacy.explain(coarse_pos), sep="  |  ")

Elon  |  PROPN  |  proper noun
flew  |  VERB  |  verb
to  |  ADP  |  adposition
mars  |  NOUN  |  noun
yesterday  |  NOUN  |  noun
.  |  PUNCT  |  punctuation
He  |  PRON  |  pronoun
carried  |  VERB  |  verb
masala  |  NOUN  |  noun
with  |  ADP  |  adposition
him  |  PRON  |  pronoun
.  |  PUNCT  |  punctuation


In [4]:
# Show the names of the components that make up the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
# Show both tag levels for every token: the coarse tag (`pos_`) and the
# fine-grained tag (`tag_`), each followed by spaCy's description of it.
doc = nlp('Wow! Dr. Strange made 265 milion $ on the very first day')

for tok in doc:
    columns = (
        tok,
        tok.pos_,
        spacy.explain(tok.pos_),
        tok.tag_,
        spacy.explain(tok.tag_),
    )
    print(*columns, sep="  |  ")

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
Dr.  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Strange  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
made  |  VERB  |  verb  |  VBD  |  verb, past tense
265  |  NUM  |  numeral  |  CD  |  cardinal number
milion  |  NOUN  |  noun  |  NN  |  noun, singular or mass
$  |  SYM  |  symbol  |  $  |  symbol, currency
on  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  determiner  |  DT  |  determiner
very  |  ADV  |  adverb  |  RB  |  adverb
first  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
day  |  NOUN  |  noun  |  NN  |  noun, singular or mass


In [11]:
doc = nlp("He quit the job!")

# Look at the second token only: its text, its fine-grained tag, and the
# description of that tag, separated by three spaces.
second_token = doc[1]
print(second_token, second_token.tag_, spacy.explain(second_token.tag_), sep="   ")

quit   VBD   verb, past tense


In [14]:
# Sample text for the POS experiments below: a Microsoft quarterly results
# announcement containing bullet points, currency amounts, percentages and quotes.
earnings_text="""Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nadella, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud revenue to $22.1 billion, up 32% year over year” said Amy Hood, executive vice president and chief financial officer of Microsoft."""

doc = nlp(earnings_text)

# Coarse POS tags to discard: whitespace, punctuation, and "X" tokens.
EXCLUDED_POS = ("SPACE", "PUNCT", "X")

# Keep every token whose tag is not excluded, preserving document order.
filtered_tokens = [tok for tok in doc if tok.pos_ not in EXCLUDED_POS]

filtered_tokens

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2021,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 51.7,
 billion,
 and,
 increased,
 20,
 %,
 Operating,
 income,
 was,
 $,
 22.2,
 billion,
 and,
 increased,
 24,
 %,
 Net,
 income,
 was,
 $,
 18.8,
 billion,
 and,
 increased,
 21,
 %,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 2.48,
 and,
 increased,
 22,
 %,
 Digital,
 technology,
 is,
 the,
 most,
 malleable,
 resource,
 at,
 the,
 world,
 ’s,
 disposal,
 to,
 overcome,
 constraints,
 and,
 reimagine,
 everyday,
 work,
 and,
 life,
 said,
 Satya,
 Nadella,
 chairman,
 and,
 chief,
 executive,
 officer,
 of,
 Microsoft,
 As,
 tech,
 as,
 a,
 percentage,
 of,
 global,
 GDP,
 continues,
 to,
 increase,
 we,
 are,
 innovating,
 and,
 investing,
 across,
 diverse,
 and,
 growing,
 markets,
 with,
 a,
 common,
 underlying,
 technology,
 stack,
 and,
 an,
 operating,

In [17]:

# Tally the tokens of `doc` by coarse POS tag. The result is a dict that maps
# each tag's integer ID to the number of tokens carrying that tag.
pos_attr_id = spacy.attrs.POS
count = doc.count_by(pos_attr_id)
count

{96: 13,
 92: 46,
 100: 24,
 90: 9,
 85: 16,
 93: 16,
 97: 27,
 98: 1,
 84: 20,
 103: 10,
 87: 6,
 99: 5,
 89: 12,
 86: 3,
 94: 3,
 95: 2}

In [19]:
# Turn an integer POS ID back into its string name through the vocab;
# 96 is one of the keys of the `count` dict produced by `count_by`.
doc.vocab[96].text

'PROPN'

In [20]:
# Print the POS frequency table with readable tag names instead of integer IDs.
for pos_id, freq in count.items():
    pos_name = doc.vocab[pos_id].text
    print(pos_name, freq, sep=" | ")

PROPN | 13
NOUN | 46
VERB | 24
DET | 9
ADP | 16
NUM | 16
PUNCT | 27
SCONJ | 1
ADJ | 20
SPACE | 10
AUX | 6
SYM | 5
CCONJ | 12
ADV | 3
PART | 3
PRON | 2


# Exercise

In [34]:
# Read the whole news story into one string. The encoding is passed explicitly
# because the text contains non-ASCII characters (e.g. the curly apostrophe in
# "March’s"); without it, open() falls back to a platform-dependent default
# that can garble or reject those characters.
# NOTE(review): assumes news_story.txt is saved as UTF-8 — confirm.
with open("news_story.txt", "r", encoding="utf-8") as file:
    news_text = file.read()


In [35]:
# Preview the first 500 characters instead of dumping the whole story.
news_text[:500]

'Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from March’s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and energ'

In [36]:
# Collect every token tagged as a common noun (NOUN) or a proper noun (PROPN).

doc = nlp(news_text)

NOUN_TAGS = ("PROPN", "NOUN")

nouns = []
for tok in doc:
    if tok.pos_ in NOUN_TAGS:
        nouns.append(tok)

In [37]:
# Display the collected noun tokens.
nouns

[Inflation,
 April,
 climb,
 consumers,
 brink,
 expansion,
 Bureau,
 Labor,
 Statistics,
 Wednesday,
 consumer,
 price,
 index,
 measure,
 prices,
 goods,
 services,
 %,
 year,
 Dow,
 Jones,
 estimate,
 %,
 gain,
 ease,
 March,
 peak,
 level,
 summer,
 food,
 energy,
 prices,
 core,
 CPI,
 %,
 expectations,
 %,
 gain,
 hopes,
 inflation,
 March,
 month,
 month,
 gains,
 expectations,
 %,
 headline,
 CPI,
 %,
 estimate,
 %,
 increase,
 core,
 outlook,
 %,
 gain,
 price,
 gains,
 workers,
 ground,
 wages,
 inflation,
 %,
 month,
 increase,
 %,
 earnings,
 year,
 earnings,
 %,
 earnings,
 %,
 Inflation,
 threat,
 recovery,
 Covid,
 pandemic,
 economy,
 stage,
 year,
 growth,
 level,
 prices,
 pump,
 grocery,
 stores,
 problem,
 inflation,
 areas,
 housing,
 auto,
 sales,
 host,
 areas,
 Federal,
 Reserve,
 officials,
 problem,
 interest,
 rate,
 hikes,
 year,
 pledges,
 inflation,
 bank,
 %,
 goal,
 Wednesday,
 data,
 Fed,
 job,
 Credits]

In [38]:
# Number of noun tokens collected; every occurrence counts, so repeats are included.
len(nouns)

112

In [43]:
# Pair each noun's text with its hash value (`token.orth`).
# Despite its name, `hash_dict` is a list holding one
# {"text": ..., "hash": ...} dict per noun token.
hash_dict = []
for token in nouns:
    hash_dict.append({"text": token.text, "hash": token.orth})

In [49]:
# Display the text/hash records built from the noun tokens.
hash_dict


[{'text': 'Inflation', 'hash': 7352100492262208036},
 {'text': 'April', 'hash': 6762527065225415734},
 {'text': 'climb', 'hash': 11585485096337782939},
 {'text': 'consumers', 'hash': 5077457808099934087},
 {'text': 'brink', 'hash': 5276159128921712413},
 {'text': 'expansion', 'hash': 13978266348776204577},
 {'text': 'Bureau', 'hash': 8593684778044165936},
 {'text': 'Labor', 'hash': 17182204199239135824},
 {'text': 'Statistics', 'hash': 5505243898298316554},
 {'text': 'Wednesday', 'hash': 8020134267071460552},
 {'text': 'consumer', 'hash': 822906622843328326},
 {'text': 'price', 'hash': 10375657863284908246},
 {'text': 'index', 'hash': 4791143286560280626},
 {'text': 'measure', 'hash': 4134526893188418915},
 {'text': 'prices', 'hash': 12620364840477785591},
 {'text': 'goods', 'hash': 12950115817590240826},
 {'text': 'services', 'hash': 15545056930122316503},
 {'text': '%', 'hash': 16590897233515608007},
 {'text': 'year', 'hash': 14889849580704678361},
 {'text': 'Dow', 'hash': 4286818529

In [50]:
# Print each record's fields one per line: key, three spaces, then value.
for record in hash_dict:
    for field, value in record.items():
        print(field, value, sep="   ")

text   Inflation
hash   7352100492262208036
text   April
hash   6762527065225415734
text   climb
hash   11585485096337782939
text   consumers
hash   5077457808099934087
text   brink
hash   5276159128921712413
text   expansion
hash   13978266348776204577
text   Bureau
hash   8593684778044165936
text   Labor
hash   17182204199239135824
text   Statistics
hash   5505243898298316554
text   Wednesday
hash   8020134267071460552
text   consumer
hash   822906622843328326
text   price
hash   10375657863284908246
text   index
hash   4791143286560280626
text   measure
hash   4134526893188418915
text   prices
hash   12620364840477785591
text   goods
hash   12950115817590240826
text   services
hash   15545056930122316503
text   %
hash   16590897233515608007
text   year
hash   14889849580704678361
text   Dow
hash   428681852938282305
text   Jones
hash   3661964486298931468
text   estimate
hash   16474155047323621775
text   %
hash   16590897233515608007
text   gain
hash   7762834187326165067
text   ea

In [54]:
# Count how often each POS tag occurs in the text.

# Method 1: tally into a plain dict (hash map).

counts = {}

for tok in doc:
    tag = tok.pos_
    # .get() supplies 0 the first time a tag is seen.
    counts[tag] = counts.get(tag, 0) + 1

counts

{'NOUN': 96,
 'VERB': 27,
 'ADV': 15,
 'ADP': 39,
 'PROPN': 16,
 'PUNCT': 32,
 'DET': 34,
 'PRON': 4,
 'AUX': 13,
 'CCONJ': 10,
 'ADJ': 23,
 'SPACE': 7,
 'NUM': 19,
 'PART': 4,
 'SCONJ': 8,
 'X': 1}

In [55]:
# Method 2: let collections.Counter do the tallying.
from collections import Counter

pos_tags = [token.pos_ for token in doc]
counts = Counter(pos_tags)

In [56]:
# Display the Counter of POS tag frequencies.
counts

Counter({'NOUN': 96,
         'VERB': 27,
         'ADV': 15,
         'ADP': 39,
         'PROPN': 16,
         'PUNCT': 32,
         'DET': 34,
         'PRON': 4,
         'AUX': 13,
         'CCONJ': 10,
         'ADJ': 23,
         'SPACE': 7,
         'NUM': 19,
         'PART': 4,
         'SCONJ': 8,
         'X': 1})