In [85]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1MB)
[K    100% |████████████████████████████████| 11.1MB 4.3MB/s ta 0:00:01
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


Who do firms mention as their competitors, peers, customers, and suppliers? In this post, I extract named entities from the Business Description fields of Microsoft's annual report (Form 10-K) to answer this question. Named entities are defined by Wikipedia as real-world objects, "such as persons, locations, organizations, products, etc., that can be denoted with a proper name." There are many approaches and tools for named entity recognition (NER) and extraction. I will be using Python NLTK and SpaCy.

In [86]:
import nltk
import re
import os
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [61]:
#nltk.download("stopwords")
#nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/reggie/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Let's start with Microsoft's 10-K from fiscal-year-ended 2018. And let's peek at the first 1,000 characters of it's Business description section ("Item 1").

In [99]:
text = open("../data/item1/msft-item1-2018","r").read()
text[:1000]

'ITEM 1. BUSINESS\n\nGENERAL\n\nEmbracing Our Future\n\nMicrosoft is a technology company whose mission is to empower every person and every organization on the planet to achieve more. We strive to create local opportunity, growth, and impact in every country around the world. Our platforms and tools help drive small business productivity, large business competitiveness, and public-sector efficiency. They also support new startups, improve educational and health outcomes, and empower human ingenuity.\n\nWe continue to transform our business to lead in the new era of the intelligent cloud and intelligent edge. We bring technology and products together into experiences and solutions that unlock value for our customers. In this next phase of innovation, computing is more powerful and ubiquitous from the cloud to the edge. Artificial intelligence (“AI”) capabilities are rapidly advancing, fueled by data and knowledge of the world. Physical and virtual worlds are coming together to create r

In [None]:
Next, we'll tokenize the text and tag the various parts-of-speech. 

In [79]:
## tokenize text
tokens = nltk.word_tokenize(text)
tokens_tagged = nltk.pos_tag(tokens)
tokens_tagged[:20]

[('ITEM', 'NNP'),
 ('1.', 'CD'),
 ('BUSINESS', 'NNP'),
 ('GENERAL', 'NNP'),
 ('Embracing', 'NNP'),
 ('Our', 'PRP$'),
 ('Future', 'NN'),
 ('Microsoft', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('technology', 'NN'),
 ('company', 'NN'),
 ('whose', 'WP$'),
 ('mission', 'NN'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('empower', 'VB'),
 ('every', 'DT'),
 ('person', 'NN'),
 ('and', 'CC')]

In [40]:
## Remove punctuation and tokenize
text_tokens = text.translate(str.maketrans('', '', string.punctuation)).lower().split()
text_tokens[:10]

['item',
 '1',
 'business',
 'general',
 'embracing',
 'our',
 'future',
 'microsoft',
 'is',
 'a']

In [82]:
chunk_parser = nltk.RegexpParser(r"NP: {<DT>?<JJ>*<NN>}")
chunk_tokens = chunk_parser.parse(tokens_tagged[:20])
print(chunk_tokens)

(S
  ITEM/NNP
  1./CD
  BUSINESS/NNP
  GENERAL/NNP
  Embracing/NNP
  Our/PRP$
  (NP Future/NN)
  Microsoft/NNP
  is/VBZ
  (NP a/DT technology/NN)
  (NP company/NN)
  whose/WP$
  (NP mission/NN)
  is/VBZ
  to/TO
  empower/VB
  (NP every/DT person/NN)
  and/CC)


In [41]:
## remove stopwords
text_tokens2 = [word for word in text_tokens if word not in stopwords.words("english")]
text_tokens2[:50]

['item',
 '1',
 'business',
 'general',
 'embracing',
 'future',
 'microsoft',
 'technology',
 'company',
 'whose',
 'mission',
 'empower',
 'every',
 'person',
 'every',
 'organization',
 'planet',
 'achieve',
 'strive',
 'create',
 'local',
 'opportunity',
 'growth',
 'impact',
 'every',
 'country',
 'around',
 'world',
 'platforms',
 'tools',
 'help',
 'drive',
 'small',
 'business',
 'productivity',
 'large',
 'business',
 'competitiveness',
 'publicsector',
 'efficiency',
 'also',
 'support',
 'new',
 'startups',
 'improve',
 'educational',
 'health',
 'outcomes',
 'empower',
 'human']

In [42]:
from collections import Counter

In [47]:
tokens_count = Counter(text_tokens2)

In [54]:
tokens_count.most_common(50)

[('services', 100),
 ('products', 57),
 ('microsoft', 53),
 ('business', 51),
 ('devices', 46),
 ('software', 45),
 ('windows', 45),
 ('cloud', 42),
 ('customers', 39),
 ('•', 37),
 ('organizations', 37),
 ('also', 36),
 ('including', 34),
 ('development', 33),
 ('office', 32),
 ('solutions', 31),
 ('server', 31),
 ('new', 30),
 ('applications', 29),
 ('president', 29),
 ('sales', 25),
 ('vice', 22),
 ('marketing', 21),
 ('product', 21),
 ('gaming', 20),
 ('provide', 20),
 ('xbox', 20),
 ('research', 19),
 ('licenses', 19),
 ('executive', 19),
 ('productivity', 18),
 ('support', 18),
 ('operating', 18),
 ('content', 18),
 ('revenue', 18),
 ('technology', 17),
 ('computing', 17),
 ('enterprise', 17),
 ('commercial', 17),
 ('designed', 17),
 ('oems', 17),
 ('system', 17),
 ('information', 17),
 ('licensing', 17),
 ('agreements', 17),
 ('tools', 16),
 ('intelligent', 16),
 ('experiences', 16),
 ('management', 16),
 ('online', 16)]

In [87]:
corpus = nlp(text)

In [89]:
len(corpus.ents)

561

In [90]:
labels = [x.label_ for x in corpus.ents]
Counter(labels)

Counter({'ORG': 270,
         'DATE': 69,
         'CARDINAL': 49,
         'PERSON': 73,
         'ORDINAL': 11,
         'PRODUCT': 8,
         'WORK_OF_ART': 4,
         'TIME': 1,
         'FAC': 8,
         'LOC': 18,
         'GPE': 38,
         'LAW': 3,
         'EVENT': 1,
         'NORP': 3,
         'MONEY': 3,
         'PERCENT': 2})

In [91]:
displacy.render(nlp(str(text)), jupyter=True, style='ent')

In [None]:
## Using Stanford NER Tagger

In [92]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

In [97]:
st = StanfordNERTagger("/home/reggie/Documents/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz",
                      "/home/reggie/Documents/stanford-ner-2018-10-16/stanford-ner.jar",
                      encoding="utf-8")

In [102]:
tokenized_text = word_tokenize(text[:10000])
classified_text = st.tag(tokenized_text)

print(classified_text)

[('ITEM', 'O'), ('1.', 'O'), ('BUSINESS', 'O'), ('GENERAL', 'O'), ('Embracing', 'O'), ('Our', 'O'), ('Future', 'O'), ('Microsoft', 'ORGANIZATION'), ('is', 'O'), ('a', 'O'), ('technology', 'O'), ('company', 'O'), ('whose', 'O'), ('mission', 'O'), ('is', 'O'), ('to', 'O'), ('empower', 'O'), ('every', 'O'), ('person', 'O'), ('and', 'O'), ('every', 'O'), ('organization', 'O'), ('on', 'O'), ('the', 'O'), ('planet', 'O'), ('to', 'O'), ('achieve', 'O'), ('more', 'O'), ('.', 'O'), ('We', 'O'), ('strive', 'O'), ('to', 'O'), ('create', 'O'), ('local', 'O'), ('opportunity', 'O'), (',', 'O'), ('growth', 'O'), (',', 'O'), ('and', 'O'), ('impact', 'O'), ('in', 'O'), ('every', 'O'), ('country', 'O'), ('around', 'O'), ('the', 'O'), ('world', 'O'), ('.', 'O'), ('Our', 'O'), ('platforms', 'O'), ('and', 'O'), ('tools', 'O'), ('help', 'O'), ('drive', 'O'), ('small', 'O'), ('business', 'O'), ('productivity', 'O'), (',', 'O'), ('large', 'O'), ('business', 'O'), ('competitiveness', 'O'), (',', 'O'), ('and', 

In [103]:
for item in classified_text:
    if item[1] != "O":
        print(item)

('Microsoft', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Office', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Dynamics', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('LinkedIn', 'ORGANIZATION')
('Sales', 'ORGANIZATION')
('Navigator', 'ORGANIZATION')
('and', 'ORGANIZATION')
('Dynamics', 'ORGANIZATION')
('LinkedIn', 'ORGANIZATION')
('Recruiter', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Azure', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('Azure', 'ORGANIZATION')
('Machine', 'ORGANIZATION')
('Learning', 'ORGANIZATION')
('Studio', 'ORGANIZATION')
('Azure', 'ORGANIZATION')
('Machine', 'ORGANIZATION')
('Learning', 'ORGANIZATION')
('Workbench', 'ORGANIZATION')
('Microsoft', 'ORGANIZATION')
('GitHub', 'ORGANIZATION')
(',', 'ORGANIZATION')
('Inc.', 'ORGANIZATION')
('Cortana', 'PERSON')
('Mixer', 'PERSON')
