In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_lg")

#### READ A NEW STORY

In [3]:
with open("news_story.txt", "r") as f:
    new_text = f.read()
new_text[:500]

'Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and ene'

#### Extract NOUN and NUM tokens

In [5]:
doc = nlp(new_text)

numeral_token = []
noun_token = []

for token in doc:
    if token.pos_ == 'NOUN':
        noun_token.append(token)
    elif token.pos_ == 'NUM':
        numeral_token.append(token)

In [6]:
noun_token[:10]

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices]

In [7]:
numeral_token[:10]

[8.3, 8.1, 1982, 6.2, 6, â€, 0.3, 0.2, 0.6, 0.4]

#### Print a count of all POS tags

In [8]:
count = doc.count_by(spacy.attrs.POS)
count

{92: 96,
 100: 27,
 86: 15,
 85: 39,
 96: 16,
 97: 32,
 90: 34,
 95: 4,
 87: 13,
 89: 10,
 84: 24,
 103: 7,
 93: 20,
 94: 4,
 98: 8,
 99: 1,
 101: 2}

In [9]:
for k,v in count.items():
    print(doc.vocab[k].text, "|", v)

NOUN | 96
VERB | 27
ADV | 15
ADP | 39
PROPN | 16
PUNCT | 32
DET | 34
PRON | 4
AUX | 13
CCONJ | 10
ADJ | 24
SPACE | 7
NUM | 20
PART | 4
SCONJ | 8
SYM | 1
X | 2
