# Chapter 9: Sample Notebook

This notebook contains all code from Chapter 9: _Sentence Structure and Classification_.

In [1]:
import re

## 9.1 Identifying Forward-Looking Sentences

In [2]:
# To identify FLS, we need a dictionary file that
# includes future-oriented verbs and their
# conjugations as well as terms that identify
# references to the future. In our case, this
# file is "fls_terms.txt".

# file path (location) to a text file with FLS
# terms (dictionary structure: one term per line);
# forward slashes are used because open() accepts
# them on Windows as well as on macOS/Linux, whereas
# a backslash-separated path only resolves on Windows
fls_terms_file = "./dictionaries/fls_terms.txt"

# next, create a list of regex expressions that
# match FLS terms
def create_fls_regex_list(fls_terms_file:str):
    """Creates a list of compiled regex expressions, one per FLS term.

    Parameters
    ----------
    fls_terms_file : str
        Path to a text file holding one FLS term per line.

    Returns
    -------
    list
        One compiled pattern per non-blank line, in file order. Each
        term is wrapped in word-boundary anchors so that it only
        matches whole words. Terms are inserted as-is (not escaped),
        so regex syntax inside a term stays active.
    """

    # opens the specified dictionary file in "r" (read) mode and
    # reads its content line-by-line
    with open(fls_terms_file, "r") as file:
        raw_lines = file.read().splitlines()

    # strips surrounding whitespace and skips blank lines: an empty
    # term would compile to r'\b\b', which matches at every word
    # boundary and would flag every sentence as forward-looking
    fls_terms = [line.strip() for line in raw_lines if line.strip()]

    # adds word boundary (\b) anchors to the beginning and the
    # ending of each FLS term
    fls_terms_regex = [re.compile(r'\b' + term + r'\b') for term in fls_terms]
    return fls_terms_regex

# compiles the FLS dictionary and previews the first
# three compiled patterns
fls_terms_regex = create_fls_regex_list(fls_terms_file)
print(fls_terms_regex[:3])

[re.compile('\\bwill\\b'), re.compile('\\bfuture\\b'), re.compile('\\bnext fiscal\\b')]


In [3]:
def is_forward_looking(sentence:str, year:int, fls_regexes=None, horizon:int=10):
    """Returns whether sentence is forward-looking.

    A sentence is forward-looking when it matches at least one FLS
    term regex or mentions a calendar year that lies 1 to `horizon`
    years after `year`.

    Parameters
    ----------
    sentence : str
        The sentence to classify.
    year : int
        The current (reporting) year; later years count as "future".
    fls_regexes : list of compiled patterns, optional
        FLS term patterns to test. When omitted, the notebook-level
        list `fls_terms_regex` is used.
    horizon : int, optional
        How many years into the future to look for (default 10).

    Returns
    -------
    bool
        True if any FLS term or future year is found, False otherwise.
    """
    if fls_regexes is None:
        # falls back to the list built earlier in the notebook
        fls_regexes = fls_terms_regex

    # a single FLS term match is enough
    if any(fls_term.search(sentence) for fls_term in fls_regexes):
        return True

    # years year+1 ... year+horizon (inclusive)
    future_years = [str(y) for y in range(year + 1, year + horizon + 1)]
    if not future_years:
        # an empty alternation would match everywhere
        return False

    # matches a future year that is a stand-alone number:
    #  - (?<![$,]) : not preceded by "$" or "," (dollar amounts,
    #    thousands groups); a lookbehind consumes no character, so a
    #    year at the very start of the sentence still matches
    #  - (?!%|,\d|\.\d) : not followed by "%", ",<digit>" or
    #    ".<digit>" (percentages, larger or decimal numbers)
    future_year_regex = re.compile(r"(?<![$,])\b(?:" +
                                   "|".join(future_years) +
                                   r")\b(?!%|,\d|\.\d)")
    return future_year_regex.search(sentence) is not None
    
#Input text - excerpt from Apple's Q4 2018 
# Earnings Conference Call Transcript    
text = """Finally, we launched a completely new website 
experience for Atlanta. The new online experience 
provides a modern and fresh brand look and includes 
enhanced simplicity and flexibility for shopping and 
buying that easily transitions to a home delivery or 
in-store experience. We are excited to put the customer 
in the driver seat. This experience is a unique and 
powerful integration of our own in-store and online 
capabilities. Keep in mind, we will continue to improve 
both the customer and associate experience in Atlanta 
and use these earnings to inform how we roll out into 
other markets. As we previously announced, we 
anticipate having the omni channel experience available
to the majority of our customers by February 2020. To 
expand omni channel, we anticipate opening additional 
customer experience centers. We're currently in the 
process of planning the next locations while taking 
state regulations into consideration."""

# a sentence starts at a capital letter on a word boundary and ends at
# ".", "!" or "?"; a period directly followed by a digit (e.g., "4.5")
# is normally kept inside the sentence
sentence_regex = re.compile(r"\b[A-Z](?:[^\.!?]|\.\d)*[\.!?]")
def identify_sentences(input_text:str):
    """Returns the list of sentences found in input_text."""
    return sentence_regex.findall(input_text)
    
# splits the excerpt into sentences and prints each one with its
# forward-looking flag, taking 2018 as the current year
sentences = identify_sentences(text)
for sentence in sentences:
    print(is_forward_looking(sentence, 2018), ":", sentence)

False : Finally, we launched a completely new website 
experience for Atlanta.
False : The new online experience 
provides a modern and fresh brand look and includes 
enhanced simplicity and flexibility for shopping and 
buying that easily transitions to a home delivery or 
in-store experience.
False : We are excited to put the customer 
in the driver seat.
False : This experience is a unique and 
powerful integration of our own in-store and online 
capabilities.
True : Keep in mind, we will continue to improve 
both the customer and associate experience in Atlanta 
and use these earnings to inform how we roll out into 
other markets.
True : As we previously announced, we 
anticipate having the omni channel experience available
to the majority of our customers by February 2020.
True : To 
expand omni channel, we anticipate opening additional 
customer experience centers.
False : We're currently in the 
process of planning the next locations while taking 
state regulations into consideration.

## 9.2 Dictionary Approach to Sentence Classification

In [4]:
# This code implements a simplified version of 
# sentence classification as earnings-oriented or
# not and quantitative or not, as in Bozanic et 
# al. (2018)

# regex for identifying sentences: capital letter at a word boundary,
# then anything up to a closing ".", "!" or "?" (a period followed
# by a digit does not normally close the sentence)
sentence_regex = re.compile(r"\b[A-Z](?:[^\.!?]|\.\d)*[\.!?]")

def identify_sentences(input_text:str):
    """Returns all sentences in the input text"""
    # the pattern has no capturing groups, so each match's full text
    # is exactly one sentence
    return [match.group(0) for match in sentence_regex.finditer(input_text)]

# dictionary of earnings-related terms
earn_terms = ["earnings", "EPS", "income", "loss", 
              "losses", "profit", "profits"]
# dictionary of quantitative terms (words and symbols)
quant_terms = ["thousand", "thousands", "million", 
               "millions", "billion", "billions", 
               "percent", "%", "dollar", "dollars", 
               "$"] 

def _compile_term(term:str):
    """Compiles one dictionary term into a case-insensitive regex."""
    # a word boundary anchor is only added on a side where the term
    # starts/ends with a word character; around symbols such as "%"
    # or "$" the anchor would prevent matches like "8%," or " $18"
    prefix = r'\b' if re.match(r'\w', term) else ''
    suffix = r'\b' if re.search(r'\w\Z', term) else ''
    # re.escape keeps regex metacharacters literal ("$" would
    # otherwise be the end-of-string anchor); the IGNORECASE flag
    # has to be given at compile time
    return re.compile(prefix + re.escape(term) + suffix, re.IGNORECASE)

# creates a list of earnings regex expressions 
earn_terms_regex = [_compile_term(term) for term in earn_terms]
# creates a list of regexes for quantitative terms
quant_terms_regex = [_compile_term(term) for term in quant_terms]

# checks if there is a match for at least one earnings 
# term in the input sentence
def is_earn_oriented(sentence:str):
    """Checks whether a sentence is earnings-oriented.

    Returns True if at least one earnings term occurs in the
    sentence (case-insensitive), False otherwise.
    """
    # note: the second positional argument of a compiled pattern's
    # search() is the start position, not a flag, so only the
    # sentence is passed here
    return any(term.search(sentence) for term in earn_terms_regex)

# checks if there is a match for at least one quantitative
# term in the input sentence
def is_quantitative(sentence:str):
    """Checks whether a sentence is quantitative in nature.

    Returns True if at least one quantitative term or symbol occurs
    in the sentence (case-insensitive), False otherwise.
    """
    return any(term.search(sentence) for term in quant_terms_regex)

# input text
text = """Operating income margins, excluding the 
restructuring charges, are projected to be in the 
range of 4.5% to 4.8%, and interest expense and 
other income are forecasted to be approximately 
$18 million and $6 million, respectively. While 
operating performance is expected to remain 
strong, Agribusiness profits are expected to be 
lower in the third and fourth quarters as pricing 
for subsequent sales will not match the high level 
of the June delivery. The Company expects its 
capital expenditures in 2008 to be approximately 
$300 million, an 8% reduction from 2007 capital 
expenditures of $326 million. During the third 
quarter, the company made further progress 
implementing the strategic cost reductions that 
will support the targeted growth investments 
announced in July 2005."""

sentences = identify_sentences(text)

# next, we classify each sentence as earnings-
# oriented or not, quantitative or not, and print
# both flags followed by the sentence itself
for sentence in sentences:
    print(f"***Earnings-oriented: {is_earn_oriented(sentence)} "
          f"***Quantitative: {is_quantitative(sentence)} "
          f"--- {sentence}")

***Earnings-oriented: True ***Quantitative: True --- Operating income margins, excluding the 
restructuring charges, are projected to be in the 
range of 4.5% to 4.8%, and interest expense and 
other income are forecasted to be approximately 
$18 million and $6 million, respectively.
***Earnings-oriented: True ***Quantitative: False --- While 
operating performance is expected to remain 
strong, Agribusiness profits are expected to be 
lower in the third and fourth quarters as pricing 
for subsequent sales will not match the high level 
of the June delivery.
***Earnings-oriented: False ***Quantitative: True --- The Company expects its 
capital expenditures in 2008 to be approximately 
$300 million, an 8% reduction from 2007 capital 
expenditures of $326 million.
***Earnings-oriented: False ***Quantitative: False --- During the third 
quarter, the company made further progress 
implementing the strategic cost reductions that 
will support the targeted growth investments 
announced in July 2005.

## 9.3 Identifying Sentence Subjects and Objects

In [5]:
import spacy

# load spaCy's English language model "en_core_web_sm"; the returned
# object is called on raw text below to parse it
# NOTE(review): presumably the model package must already be installed
# in the kernel's environment for this call to succeed -- confirm
nlp = spacy.load("en_core_web_sm") 

# a sample text
text = """Q1 revenue reached $12.7 billion. We are 
thrilled with the continued growth of Apple Card. 
We experienced some product shortages due to very 
strong customer demand for both Apple Watch and 
AirPod during the quarter. Apple is looking at 
buying U.K. startup for $1 billion."""

# runs the loaded spaCy pipeline on the sample text
parsed_text = nlp(text)

# property "sents" yields the sentences identified by spaCy;
# they are collected into a list so they can be reused below
sentences = [sent for sent in parsed_text.sents]

# enumerate(..., start=1) pairs every sentence with a
# 1-based running number
for num, sentence in enumerate(sentences, start=1):
    print("Sentence", num, ":", sentence)

Sentence 1 : Q1 revenue reached $12.7 billion.
Sentence 2 : We are 
thrilled with the continued growth of Apple Card. 

Sentence 3 : We experienced some product shortages due to very 
strong customer demand for both Apple Watch and 
AirPod during the quarter.
Sentence 4 : Apple is looking at 
buying U.K. startup for $1 billion.


In [6]:
def sentence_subj_obj(sentence):
    """Identifies subjects and objects in a sentence

    Iterates over the tokens of `sentence` and keeps those whose
    dependency code contains "subj" or "obj" (e.g., "nsubj", "dobj",
    "pobj"). Each kept token is returned as a dictionary with the
    keys "Token" (the token's text) and "Dependency" (its dependency
    code), in sentence order.
    """
    def _is_subj_or_obj(dep_code):
        # dependency codes of subjects/objects embed "subj"/"obj"
        return 'obj' in dep_code or 'subj' in dep_code

    return [{"Token": token.text, "Dependency": token.dep_}
            for token in sentence
            if _is_subj_or_obj(token.dep_)]

# prints the subject/object tokens of every sentence,
# numbering the sentences from 1
for num, sentence in enumerate(sentences, start=1):
    print("Sentence", num, ":", sentence_subj_obj(sentence))

Sentence 1 : [{'Token': 'revenue', 'Dependency': 'nsubj'}, {'Token': 'billion', 'Dependency': 'dobj'}]
Sentence 2 : [{'Token': 'We', 'Dependency': 'nsubj'}, {'Token': 'growth', 'Dependency': 'pobj'}, {'Token': 'Card', 'Dependency': 'pobj'}]
Sentence 3 : [{'Token': 'We', 'Dependency': 'nsubj'}, {'Token': 'shortages', 'Dependency': 'dobj'}, {'Token': 'demand', 'Dependency': 'pobj'}, {'Token': 'Watch', 'Dependency': 'pobj'}, {'Token': 'quarter', 'Dependency': 'pobj'}]
Sentence 4 : [{'Token': 'Apple', 'Dependency': 'nsubj'}, {'Token': 'startup', 'Dependency': 'dobj'}, {'Token': 'billion', 'Dependency': 'pobj'}]


In [7]:
# displacy lets us visualize the structure of a sentence
from spacy import displacy 

# tags all (word) tokens in an input sentence
def sentence_tagging(sentence):
    """Returns one dictionary per token of the sentence.

    Each dictionary holds the token's text ("Token"), its lemma
    ("Lemma_Token"), part of speech ("POS"), dependency code
    ("Dependency"), and whether it is a stop word ("Stop_word").
    """
    return [
        {
            "Token": token.text,
            "Lemma_Token": token.lemma_,
            "POS": token.pos_,
            "Dependency": token.dep_,
            "Stop_word": token.is_stop,
        }
        for token in sentence
    ]

# tags every sentence identified earlier
tagged_sentences = list(map(sentence_tagging, sentences))

# shows the tags of the first sentence only
print(tagged_sentences[0])

# draws the dependency structure of the parsed text
displacy.render(parsed_text, style="dep")

[{'Token': 'Q1', 'Lemma_Token': 'Q1', 'POS': 'PROPN', 'Dependency': 'compound', 'Stop_word': False}, {'Token': 'revenue', 'Lemma_Token': 'revenue', 'POS': 'NOUN', 'Dependency': 'nsubj', 'Stop_word': False}, {'Token': 'reached', 'Lemma_Token': 'reach', 'POS': 'VERB', 'Dependency': 'ROOT', 'Stop_word': False}, {'Token': '$', 'Lemma_Token': '$', 'POS': 'SYM', 'Dependency': 'quantmod', 'Stop_word': False}, {'Token': '12.7', 'Lemma_Token': '12.7', 'POS': 'NUM', 'Dependency': 'compound', 'Stop_word': False}, {'Token': 'billion', 'Lemma_Token': 'billion', 'POS': 'NUM', 'Dependency': 'dobj', 'Stop_word': False}, {'Token': '.', 'Lemma_Token': '.', 'POS': 'PUNCT', 'Dependency': 'punct', 'Stop_word': False}]


## 9.4 Identifying Named Entities

In [8]:
# create a dictionary with descriptions for spacy's 
# entity type codes; the list is available on spacy.io
# (keys are the entity type codes, values their descriptions)
entity_type_descriptions = {
    'PERSON':'People, including fictional.',
    'NORP':'Nationalities or religious or political groups.',
    'FAC':'Buildings, airports, highways, bridges, etc.',
    'ORG':'Companies, agencies, institutions, etc.',
    'GPE':'Countries, cities, states.',
    'LOC':'Non-GPE locations, mountain ranges, bodies of water.',
    'PRODUCT':'Objects, vehicles, foods, etc. (Not services.)',
    'EVENT':'Named hurricanes, battles, wars, sports events, etc.',
    # the type code is WORK_OF_ART; it used to be split into the key
    # 'WORK' and a value starting with 'OF_ART<tab>'
    'WORK_OF_ART':'Titles of books, songs, etc.',
    'LAW':'Named documents made into laws.',
    'LANGUAGE':'Any named language.',
    'DATE':'Absolute or relative dates or periods.',
    'TIME':'Times smaller than a day.',
    'PERCENT':'Percentage, including "%".',
    'MONEY':'Monetary values, including unit.',
    'QUANTITY':'Measurements, as of weight or distance.',
    'ORDINAL':'"first", "second", etc.',
    'CARDINAL':'Numerals that do not fall under another type.'}

# gets a list of all named entities identified 
# by spacy, and outputs them
# property "ents" returns all identified named 
# entities in the text
named_entities = parsed_text.ents 
for ent in named_entities: 
    # gets the named entity (ent.text)
    entity = ent.text 
    # gets the named entity type code 
    # (e.g., PERSON, ORG, etc.)
    entity_type = ent.label_ 
    # gets the named entity description from 
    # entity_type_descriptions dictionary using 
    # its type code; .get() with a fallback keeps 
    # the loop from stopping with a KeyError when a 
    # type code is missing from the dictionary
    entity_desc = entity_type_descriptions.get(
        entity_type, "(no description available)") 
    
    # left-aligned columns: entity (15 chars), 
    # type code (10 chars), description
    print(f'{entity:<15}{entity_type:<10}{entity_desc}') 

Q1             CARDINAL  Numerals that do not fall under another type.
$12.7 billion  MONEY     Monetary values, including unit.
Apple Card     ORG       Companies, agencies, institutions, etc.
Apple Watch    ORG       Companies, agencies, institutions, etc.
AirPod         ORG       Companies, agencies, institutions, etc.
the quarter    DATE      Absolute or relative dates or periods.
Apple          ORG       Companies, agencies, institutions, etc.
U.K.           GPE       Countries, cities, states.
$1 billion     MONEY     Monetary values, including unit.


In [9]:
# counts the number of all words
# we assume that every token in a sentence is a word 
# unless it is punctuation.
num_words = sum(1 for token in parsed_text 
                if not token.is_punct) 

num_entities = len(named_entities) 

# specificity score as computed here: words per named 
# entity; it is undefined (NaN) when the text contains 
# no named entities, which avoids a ZeroDivisionError
if num_entities:
    specificity_score = num_words / num_entities 
else:
    specificity_score = float("nan")

print('Number of named entities:', num_entities) 
print('Number of words:', num_words)
print('Specificity score:', specificity_score) 

Number of named entities: 9
Number of words: 52
Specificity score: 5.777777777777778
