# Chapter 7: Sample Notebook

This notebook contains all code from Chapter 7: _Dictionary-Based Textual Analysis_.

In [1]:
import re

## 7.3 Identifying Words and Sentences in Text

In [2]:
# input text
text = """We invested in six areas of the business that account for nearly 40% of total Macy's sales. 
Dresses, fine jewelry, big ticket, men's tailored, women's shoes and beauty, these investments were aimed 
at driving growth through great products, top-performing colleagues, improved environment and enhanced 
marketing. All six areas continued to outperform the balance of the business on market share, return on 
investment and profitability. And we capture approximately 9% of the market in these categories."""

# Regex "\b[a-zA-Z\'\-]+\b" searches for all words in text, allowing apostrophes and hyphens in words, 
# e.g., company's, state-of-the-art
x = re.findall(r"\b[a-zA-Z\'\-]+\b", text)

print(x)
print(len(x))

['We', 'invested', 'in', 'six', 'areas', 'of', 'the', 'business', 'that', 'account', 'for', 'nearly', 'of', 'total', "Macy's", 'sales', 'Dresses', 'fine', 'jewelry', 'big', 'ticket', "men's", 'tailored', "women's", 'shoes', 'and', 'beauty', 'these', 'investments', 'were', 'aimed', 'at', 'driving', 'growth', 'through', 'great', 'products', 'top-performing', 'colleagues', 'improved', 'environment', 'and', 'enhanced', 'marketing', 'All', 'six', 'areas', 'continued', 'to', 'outperform', 'the', 'balance', 'of', 'the', 'business', 'on', 'market', 'share', 'return', 'on', 'investment', 'and', 'profitability', 'And', 'we', 'capture', 'approximately', 'of', 'the', 'market', 'in', 'these', 'categories']
73


In [3]:
# Regex pattern that identifies a sentence
# re.compile compile a regular expression pattern into a regular expression object in Python
sentence_regex = re.compile(r"\b[A-Z](?:[^\.!?]|\.\d)*[\.!?]") 

# function that identifies sentences
def identify_sentences (input_text:str): 
    # finds all matches of sentence_regex in input_text
    sentences = re.findall(sentence_regex, input_text) 
    return sentences

# applies identify_sentences function on text in the previous example
sentences = identify_sentences(text) 

# enumerate is a Python function that when applied to a list, returns list 
# elements along with their indexes (counter); 1 indicates that the counter
# should start from 1 instead of default 0
for counter, sentence in enumerate(sentences, 1): 
    print(counter, sentence)
    print()

1 We invested in six areas of the business that account for nearly 40% of total Macy's sales.

2 Dresses, fine jewelry, big ticket, men's tailored, women's shoes and beauty, these investments were aimed 
at driving growth through great products, top-performing colleagues, improved environment and enhanced 
marketing.

3 All six areas continued to outperform the balance of the business on market share, return on 
investment and profitability.

4 And we capture approximately 9% of the market in these categories.



In [4]:
import spacy

# load the English language model in spacy
nlp = spacy.load('en_core_web_sm') 
# create an "nlp" object that parses a textual document
a_text = nlp(text) 

# create a list of word tokens; note, this list will include punctuation marks and other symbols
token_list = [] # start with an empty list
for token in a_text:
    token_list.append(token.text) # add a token to the token_list
print(token_list) # print all identified tokens in text

sentences = list(a_text.sents) # extract sentences

# print all sentences
for counter, sentence in enumerate(sentences, 1):
    print(counter, sentence)
    print()

['We', 'invested', 'in', 'six', 'areas', 'of', 'the', 'business', 'that', 'account', 'for', 'nearly', '40', '%', 'of', 'total', 'Macy', "'s", 'sales', '.', '\n', 'Dresses', ',', 'fine', 'jewelry', ',', 'big', 'ticket', ',', 'men', "'s", 'tailored', ',', 'women', "'s", 'shoes', 'and', 'beauty', ',', 'these', 'investments', 'were', 'aimed', '\n', 'at', 'driving', 'growth', 'through', 'great', 'products', ',', 'top', '-', 'performing', 'colleagues', ',', 'improved', 'environment', 'and', 'enhanced', '\n', 'marketing', '.', 'All', 'six', 'areas', 'continued', 'to', 'outperform', 'the', 'balance', 'of', 'the', 'business', 'on', 'market', 'share', ',', 'return', 'on', '\n', 'investment', 'and', 'profitability', '.', 'And', 'we', 'capture', 'approximately', '9', '%', 'of', 'the', 'market', 'in', 'these', 'categories', '.']
1 We invested in six areas of the business that account for nearly 40% of total Macy's sales. 


2 Dresses, fine jewelry, big ticket, men's tailored, women's shoes and beau

## 7.4 Stemming and Lemmatization

In [5]:
# import Porter stemmer Module
from nltk.stem import PorterStemmer 
# import WordNet lemmatization Module
from nltk.stem import WordNetLemmatizer 

# object for Porter stemmer
stemmer = PorterStemmer() 
# object for WordNet lemmatizer
lemmatizer = WordNetLemmatizer() 

# Then, performing stemming on single words is as simple as:
print(f"Stemming for 'increasing' is {stemmer.stem('increasing')}")
print(f"Stemming for 'increases' is {stemmer.stem('increases')}")
print(f"Stemming for 'increased' is {stemmer.stem('increased')}")

# To improve the accuracy of lemmatization, we need to provide each word's part of the speech (POS)
print(f"Lemmatization for 'increasing' is {lemmatizer.lemmatize('increasing', pos='v')}") # specifying POS as verb "v"
print(f"Lemmatization for 'increases' is {lemmatizer.lemmatize('increases', pos='v')}")
print(f"Lemmatization for 'increased' is {lemmatizer.lemmatize('increased', pos='v')}")

Stemming for 'increasing' is increas
Stemming for 'increases' is increas
Stemming for 'increased' is increas
Lemmatization for 'increasing' is increase
Lemmatization for 'increases' is increase
Lemmatization for 'increased' is increase


In [6]:
from nltk.corpus import wordnet # WordNet is just another NLTK corpus reader
# nltk.download('averaged_perceptron_tagger') # uncomment this line if 'averaged_perceptron_tagger' has not been yet downloaded
from nltk import word_tokenize, pos_tag # import NLTK tokenizer and (part of speech) POS tagger
from nltk.stem import PorterStemmer # import Porter stemmer class
from nltk.stem import WordNetLemmatizer # import WordNet lemmatizer class
from collections import defaultdict # default dictionary is similar to Python's regular dictionary, but allows the dictionary to return a default value if a requested key does not exist in the dictionary

stemmer = PorterStemmer() # object for Porter stemmer
lemmatizer = WordNetLemmatizer() # object for WordNet lemmatizer

tag_map = defaultdict(lambda: wordnet.NOUN) # create a dictionary where single-letter keys are mapped to part of speech (noun, adjective, etc.) WordNet identifiers; by default, if a key does not exists the dictionary, return noun (wordnet.NOUN)
tag_map['J'] = wordnet.ADJ # add key 'J' to the dictionary indicating adjective
tag_map['V'] = wordnet.VERB # add key 'V' to the dictionary indicating verb
tag_map['R'] = wordnet.ADV # add key 'R' to the dictionary indicating adverb

text = "We delivered adjusted earnings per share of $2.12. For the year, comparable sales were down 0.7% on an owned plus licensed basis, and we delivered adjusted earnings per share of $2.91."

# function that stems text
def stem_text(text:str):
    tokens = word_tokenize(text) # split text into (word) tokens
    stemmed_text = [] # start with an empty list
    for token in tokens:
        stem = stemmer.stem(token) # stem token
        stemmed_text.append(stem) # append stemmed token to the stemmed_text list
    return " ".join(stemmed_text) # concatenate stemmed tokens elements with space (" ") in-between

# function that to lemmatizes text
def lemmatize_text(text:str):
    tokens = word_tokenize(text) # splits text into tokens
    lemmatized_text = [] # start with an empty list
    for token, tag in pos_tag(tokens):
        lemma = lemmatizer.lemmatize(token, tag_map[tag[0]]) # lemmatize word tokens, tag[0] returns POS letter identifier
        lemmatized_text.append(lemma) # append lemmatized token to the lemmatized_text list
    return " ".join(lemmatized_text) # concatenate lemmatized tokens elements with space in-between

# print stemmed version of text
print(stem_text(text))
# print lemmatized version of text
print(lemmatize_text(text))

We deliv adjust earn per share of $ 2.12 . for the year , compar sale were down 0.7 % on an own plu licens basi , and we deliv adjust earn per share of $ 2.91 .
We deliver adjusted earnings per share of $ 2.12 . For the year , comparable sale be down 0.7 % on an owned plus licensed basis , and we deliver adjusted earnings per share of $ 2.91 .


## 7.6 Dictionary-Based Word-Count Functions

In [7]:
import re
# Let us start with a simple tone analysis, where each word is equally-weighted and we do not account for negators
# First, we need to specify the locations of our dictionary files
positive_words_dict = r"./dictionaries/positive.txt" # file path (location) to a text file with positive words; every word is in a separate line in the file
negative_words_dict = r"./dictionaries/negative.txt" # file path to a text file with negative words

# To be able to match all positive and negative words from the dictionaries, we need to create a list of regular expressions corresponding to these words
# The following function reads all dictionary terms to a Python list, and converts the terms regular expressions
def create_dict_regex_list(dict_file:str):
    """Creates a list of regex expressions of dictionary terms.""" # function description (optional)
    with open(dict_file,"r") as file:  # opens the specified dict_file in "r" (read) mode 
        dict_terms = file.read().splitlines() # reads the content of the file line-by-line and creates a list of dictionary phrases
    dict_terms_regex = [re.compile(r'\b' + term + r'\b') for term in dict_terms] 
    # re.compile(pattern) in Python compiles a regular expression pattern, which can be used for matching using its re.search, re.findall, etc.
    # by adding "\b" (i.e., word boundary) on each side of a dictionary term in Regex, we force an exact match that dictionary term
    return dict_terms_regex # specifies the output of the function - in our case, a list of Regex expressions that correspond to the input dictionary file

# Now we can apply our function to create Regex lists for positive and negative dictionary terms
positive_dict_regex = create_dict_regex_list(positive_words_dict)
negative_dict_regex = create_dict_regex_list(negative_words_dict)

# print the first three entries of each Regex dictionary
print(positive_dict_regex[0:3])
print(negative_dict_regex[0:3])

[re.compile('\\bable\\b'), re.compile('\\babundance\\b'), re.compile('\\babundant\\b')]
[re.compile('\\babandon\\b'), re.compile('\\babandoned\\b'), re.compile('\\babandoning\\b')]


In [8]:
def get_tone (input_text:str):
    """Counts All and Specific Words in Text""" # function description (optional)
    
    ### Positive Words ###
    
    positive_words_matches = [re.findall(regex, input_text) for regex in positive_dict_regex] 
    # finds all regex matches and returns them as a list of lists
    # so, the output of this search will be of the following format: [['able'], [], ['abundant','abundant'], [], ... ]
    
    positive_words_counts = [len(match) for match in positive_words_matches]
    # len() measures the length of each list match
    # so, the output of this list transformation will be of the following format: [1, 0, 2, 0, ...]
    
    positive_words_sum = sum(positive_words_counts) # sums all positive word counts in the counts list above
    
    ### Negative Words ###
    
    # in similar manner, we can get word counts for negative words
    negative_words_matches = [re.findall(regex, input_text) for regex in negative_dict_regex] # finds all matches of negative words' regular expressions
    negative_words_counts = [len(match) for match in negative_words_matches] # calculates the number of matches for each dictionary term regex
    negative_words_sum = sum(negative_words_counts) # sums all negative word counts
    
    ### Total Words ###
    total_words = re.findall(r"\b[a-zA-Z\'\-]+\b", input_text) # searches for all words in text, allowing apostrophes and hyphens in words, e.g., "company's", "state-of-the-art"
    total_words_count = len(total_words) # calculates the number of all words in text
    
    # Finally, we can calculate Tone (expressed in % terms) as:
    tone = 100 * (positive_words_sum - negative_words_sum)/total_words_count
    return (total_words_count, positive_words_sum, negative_words_sum, tone)
    
# Applying our count_words function to an input text:
counts = get_tone("At FedEx Ground, we have the market leading e-commerce portfolio. We continue to see strong demand across all customer segments with our new seven-day service. We will increase our speed advantage during the New Year. Our Sunday roll-out will speed up some lanes by one and two full transit days. This will increase our advantage significantly. And as you know, we are already faster by at least one day when compared to UPS's ground service in 25% of lanes. It is also really important to note our speed advantage and seven-day service is also very valuable for the premium B2B sectors, including healthcare and perishables shippers. Now, turning to Q2, I'm not pleased with our financial results.")
# output the results as (Total Word Count, Number of Positive Words, Number of Negative Words, Tone)
print(counts)

(114, 7, 0, 6.140350877192983)


In [9]:
# First, we update our function that compiles regular expressions
def create_dict_regex_list_with_negators(dict_file:str):
    """Creates a list of regex expressions of dictionary terms."""
    with open(dict_file,"r") as file: 
        dict_terms = file.read().splitlines() # reads dictionary lines one-by-one
    dict_terms_regex =[re.compile(r"(not|never|no|none|nobody|nothing|don\'t|doesn\'t|won\'t|shan\'t|didn\'t|shouldn\'t|wouldn\'t|couldn\'t|can\'t|cannot|neither|nor)?\s(" + term + r")\b") for term in dict_terms] # the first capturing group in this Regex captures all possible negators, allowing for zero or one match as indicated by ? after the group; the second group captures dictionary terms
    return dict_terms_regex # returns a list of Regex expressions that correspond to the input dictionary file, allowing for negators

# Now we can apply our function to create Regex lists for positive and negative dictionary terms
positive_dict_regex = create_dict_regex_list_with_negators(positive_words_dict)
negative_dict_regex = create_dict_regex_list_with_negators(negative_words_dict)

# prints the first entries of each Regex dictionary
print(positive_dict_regex[0])
print(negative_dict_regex[0])

re.compile("(not|never|no|none|nobody|nothing|don\\'t|doesn\\'t|won\\'t|shan\\'t|didn\\'t|shouldn\\'t|wouldn\\'t|couldn\\'t|can\\'t|cannot|neither|nor)?\\s(able)\\b")
re.compile("(not|never|no|none|nobody|nothing|don\\'t|doesn\\'t|won\\'t|shan\\'t|didn\\'t|shouldn\\'t|wouldn\\'t|couldn\\'t|can\\'t|cannot|neither|nor)?\\s(abandon)\\b")


In [10]:
# calculates tone with negators
def get_tone2 (input_text:str):
    """Counts All and Specific Words in Text, and checks for the presence of negators""" # function description (optional)
    
    total_words = re.findall(r"\b[a-zA-Z\'\-]+\b", input_text) # find all words in text
    total_words_count = len(total_words) # calculate the number of all words
    
    # Positive Words #
    # To account for negators, we can separately count positive and negated positive words
    positive_word_count = 0 # initial values
    negated_positive_word_count = 0 # initial values
    
    for regex in positive_dict_regex:
        matches = re.findall(regex, input_text) # searches for all occurences of Regex
        for match in matches:
            if len(match)>0: # if match is not empty
                print(match) # prints the match output; this is for illustration purposes (i.e., optional)
            if match[0] == '': # if the first element of the match is empty, no negator is present
                positive_word_count += 1 # so, increase the count of positive words by 1 
            else:
                negated_positive_word_count += 1 # otherwise, a negator is present, so increase the count of negated positive words by 1
                
   # If we are simply shifting the sentiment of negated positive words (from +1 to -1), then the final positive word count is just:
    positive_words_sum = positive_word_count # i.e., count without negators
    
    # Repeat the same for Negative Words:
    negative_word_count = 0 # initial values
    negated_negative_word_count = 0 # initial values
    
    for regex in negative_dict_regex:
        matches = re.findall(regex, input_text) # search for all occurences of Regex
        for match in matches:
            if len(match)>0: # if match is not empty
                print(match) # check the match output         
            if match[0] == '': # if the first element of the match is empty, no negator is present
                negative_word_count += 1  # so, increase the count of negative words by 1
            else:
                negated_negative_word_count += 1 # otherwise, a negator is present, so increase the count of negated negative words by 1
                
   # If we are simply shifting the sentiment of negated negative words (from -1 to +1), then the final negative word count is just:
    negative_words_sum = negative_word_count # i.e., count without negators
    
    # Then, Tone is:
    tone = 100 * (positive_words_sum - negative_words_sum)/total_words_count
    return (total_words_count, positive_words_sum, negative_words_sum, tone)

# Applying function get_tone2 function to an example text:
counts = get_tone2("At FedEx Ground, we have the market leading e-commerce portfolio. We continue to see strong demand across all customer segments with our new seven-day service. We will increase our speed advantage during the New Year. Our Sunday roll-out will speed up some lanes by one and two full transit days. This will increase our advantage significantly. And as you know, we are already faster by at least one day when compared to UPS's ground service in 25% of lanes. It is also really important to note our speed advantage and seven-day service is also very valuable for the premium B2B sectors, including healthcare and perishables shippers. Now, turning to Q2, I'm not pleased with our financial results.")
# output results
print(counts)

('', 'advantage')
('', 'advantage')
('', 'advantage')
('', 'leading')
('not', 'pleased')
('', 'strong')
('', 'valuable')
(114, 6, 0, 5.2631578947368425)
