In [1]:
import pandas as pd
from collections import Counter 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams

### Reading in the data

In [2]:
filings = pd.read_csv('data/nc_validation_filings.csv')
filings.head()

Unnamed: 0,ticker,accession_number
0,ACAD,0001564590-18-003526
1,ACC,0001283630-18-000024
2,AFSI,0001365555-18-000052
3,AHL,0001267395-18-000024
4,AKRX,0001628280-18-002518


In [3]:
purchase = pd.read_csv('data/share_repurchase_paragraphs.csv')
aapl = purchase[purchase['ticker'] == 'AAPL']
aapl

Unnamed: 0,ticker,accession_number,data_key_friendly_name,text,data_value,reported_data_value,reported_units,paragraph_text
15,AAPL,0000320193-17-000070,Share Repurchase Authorization,share repurchase authorization,210000000000,210.0,billions,"Share Repurchase Program In May 2017, the Co..."
16,AAPL,0000320193-17-000070,Share Repurchase Utilization,utilized,166000000000,166.0,billions,"Share Repurchase Program In May 2017, the Co..."
17,AAPL,0000320193-17-000070,Amount Spent on Share Repurchases,Total open market common stock repurchases,18001000000,18001.0,millions,Number of Average ...
18,AAPL,0000320193-17-000070,Amount Spent on Share Repurchases,February 2017 ASR,3000000000,3000.0,millions,Number of Average ASR ...
19,AAPL,0000320193-17-000070,Share Repurchase Count,May 2017 ASR,20108000,20108.0,thousands,Number of Average ASR ...
20,AAPL,0000320193-17-000070,Share Repurchase Count,August 2017 ASR,15069000,15069.0,thousands,Number of Average ASR ...
21,AAPL,0000320193-17-000070,Share Repurchase Count,Total open market common stock repurchases,134832000,134832.0,thousands,Number of Average ...
22,AAPL,0000320193-17-000070,Share Repurchase Count,November 2016 ASR,51157000,51157.0,thousands,Number of Average ASR ...
23,AAPL,0000320193-17-000070,Share Repurchase Count,February 2017 ASR,20949000,20949.0,thousands,Number of Average ASR ...
24,AAPL,0000320193-17-000070,Share Repurchase Intention,Total,44023000000,44023.0,millions,Total Number of Approximate Dollar Valu...


### Exploratory Data Analysis

In [4]:
text = purchase[purchase["data_key_friendly_name"] == 'Share Repurchase Authorization'].text
text

1       The 2015 share repurchase program authorizes t...
6                   share repurchase programs aggregating
13                                     repurchase program
15                         share repurchase authorization
32                                             authorized
34      the Company's board of directors authorized a ...
38                          authorization was in addition
39                              authorized the repurchase
46                                             repurchase
50      In the aggregate, the Board of Directors has a...
58          total stock repurchase authorization of up to
80              the Company's Board of Directors approved
87      our Board of Directors authorized an additiona...
94      publicly announced share repurchase authorizat...
100                   the Board of Directors authorized a
109     we announced that our Board of Directors had a...
115                   the Board of Directors authorized a
122           

In [5]:
key_text = purchase["data_key_friendly_name"].unique()
key_text

array(['Share Repurchase Authorization Date',
       'Share Repurchase Authorization', 'Share Repurchase Intention',
       'Share Repurchase Count', 'Amount Spent on Share Repurchases',
       'Share Repurchase Utilization', 'Unknown Share Repurchase Data'], dtype=object)

### Processing the Data : Tokenizing, lowercasing and removing stop words for each of the data_key_friendly_name and picking the most common words

In [6]:
tokens =   [word_tokenize(t.lower()) for t in text]
def flatten(lst):
    return sum( ([x] if not isinstance(x, list) else flatten(x)
         for x in lst), [] )
list_tokens = flatten(tokens)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_tokens if t not in english_stops]
repurchase_text = Counter(no_stops)  
repurchase_text.most_common(20)



[('repurchase', 260),
 ('board', 227),
 ('authorized', 202),
 ('directors', 188),
 ('program', 126),
 ('company', 93),
 ('share', 78),
 ("'s", 66),
 ('approved', 63),
 ('stock', 54),
 ('additional', 52),
 ('authorization', 50),
 (',', 42),
 ('new', 37),
 ('announced', 31),
 ('purchase', 29),
 ('common', 25),
 ('aggregate', 19),
 ('authorizing', 18),
 ('total', 16)]

In [7]:
para_text = purchase[purchase["data_key_friendly_name"] == 'Share Repurchase Authorization'].paragraph_text
para_text.head()

1     On May 28, 2015 we  announced that our board  ...
6     4. Share Repurchase Programs and Dividends   S...
13    The Company's stock repurchase program allows ...
15    Share Repurchase Program   In May 2017, the Co...
32    On February 15, 2018, AbbVie's board of direct...
Name: paragraph_text, dtype: object

In [8]:
# alpha_token =  [t for t in list_tokens if t.isalpha()]
# alpha_token

In [9]:
para_tokens =   [word_tokenize(t.lower()) for t in para_text]
def flatten(lst):
    return sum( ([x] if not isinstance(x, list) else flatten(x)
         for x in lst), [] )
list_tokens = flatten(para_tokens)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_tokens if t not in english_stops]
repurchase_para = Counter(no_stops)  
repurchase_para.most_common(20)


[(',', 2207),
 ('$', 1063),
 ('repurchase', 1046),
 ('stock', 788),
 ('program', 759),
 ('.', 728),
 ('2017', 573),
 ('common', 548),
 ('board', 529),
 ('company', 525),
 ('million', 522),
 ('share', 511),
 ('billion', 501),
 ('shares', 491),
 ('directors', 441),
 ('authorized', 412),
 ("'s", 349),
 ('2016', 256),
 ('december', 254),
 ('repurchases', 244)]

In [10]:
intent_text = purchase[purchase["data_key_friendly_name"] == 'Share Repurchase Intention'].text
util_text = purchase[purchase["data_key_friendly_name"] == 'Share Repurchase Utilization'].text
unk_repurchase_text = purchase[purchase["data_key_friendly_name"] == 'Unknown Share Repurchase Data'].text


In [11]:
# bag_repurchase = ['Board', 'Directors','authoriz','purchase','approve','program']

In [12]:
intent_tokens =   [word_tokenize(t.lower()) for t in intent_text]
def flatten(lst):
    return sum( ([x] if not isinstance(x, list) else flatten(x)
         for x in lst), [] )
list_tokens = flatten(intent_tokens)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_tokens if t not in english_stops]
intent_text = Counter(no_stops)  
intent_text.most_common(20)

[('repurchase', 104),
 ('2017', 76),
 (',', 72),
 ('program', 62),
 ('remaining', 58),
 ('december', 49),
 ('authorization', 44),
 ('available', 40),
 ('31', 39),
 ('share', 38),
 ('remained', 34),
 ('board', 34),
 ('authorized', 34),
 ('stock', 29),
 ('total', 26),
 ('company', 22),
 ('directors', 22),
 ('1', 21),
 ('-', 17),
 ('common', 16)]

In [13]:
util_tokens =   [word_tokenize(t.lower()) for t in util_text]
def flatten(lst):
    return sum( ([x] if not isinstance(x, list) else flatten(x)
         for x in lst), [] )
list_tokens = flatten(util_tokens)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_tokens if t not in english_stops]
util_text = Counter(no_stops)  
util_text.most_common(20)

[('repurchased', 14),
 ('total', 12),
 (',', 12),
 ('cost', 11),
 ('aggregate', 9),
 ('2017', 8),
 ('company', 7),
 ('approximately', 5),
 ('program', 5),
 ('repurchase', 5),
 ('december', 4),
 ('purchased', 4),
 ('share', 4),
 ('shares', 3),
 ('31', 3),
 ('authorization', 3),
 ('inception', 2),
 ('amount', 2),
 ('board', 2),
 ('directors', 2)]

In [14]:
unk_tokens =   [word_tokenize(t.lower()) for t in unk_repurchase_text]
def flatten(lst):
    return sum( ([x] if not isinstance(x, list) else flatten(x)
         for x in lst), [] )
list_tokens = flatten(unk_tokens)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_tokens if t not in english_stops]
unk_text = Counter(no_stops)  
unk_text.most_common(20)

[('repurchase', 79),
 ('board', 58),
 ('authorized', 56),
 ('directors', 53),
 ('program', 31),
 ('company', 27),
 ('shares', 25),
 ("'s", 21),
 ('authorization', 18),
 ('share', 15),
 (',', 13),
 ('approved', 12),
 ('available', 12),
 ('purchase', 12),
 ('repurchased', 11),
 ('total', 11),
 ('december', 11),
 ('2017', 9),
 ('stock', 9),
 ('additional', 8)]

### ngrams NLP analysis for data_key_friendly_name text field

In [15]:
    
def get_ngrams(text, n ):
        n_grams = ngrams(word_tokenize(t.lower()), n)
        return [ ' '.join(grams) for grams in n_grams]
ngrams_text = []    
for t in text:
        ngrams_text.append(get_ngrams(t, 3 )) 
# bigrams_text
list_ngrams = flatten(ngrams_text)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_ngrams if t not in english_stops]
repurchase_text_tri = Counter(no_stops)  


  after removing the cwd from sys.path.


### Creating a Dictionary

In [16]:
trigram_dict = dict(repurchase_text_tri.most_common(20))
trigram_dict.keys()

dict_keys(['board of directors', 'of directors authorized', 'of up to', 'our board of', 'the repurchase of', "'s board of", 'share repurchase program', "the company 's", 'the board of', 'of directors approved', 'authorized the repurchase', "company 's board", 'directors authorized a', 'repurchase up to', 'repurchase of up', 'directors authorized the', 'to repurchase up', 'directors approved a', 'stock repurchase program', 'authorized a new'])

In [17]:
# for filename in os.listdir(path):
#     if filename.endswith('.html'):
#        fname = os.path.join(path,filename)
#        with open(fname, 'r', encoding="utf8") as f:
#            soup = BeautifulSoup(f.read(),'html.parser')
#            soup.head.extract()
#            soup = soup.get_text().strip()
# util_text

In [18]:

util_text = purchase[purchase["data_key_friendly_name"] == 'Share Repurchase Utilization'].text

def get_ngrams(text, n ):
        n_grams = ngrams(word_tokenize(t.lower()), n)
        return [ ' '.join(grams) for grams in n_grams]
ngrams_text = []    
for t in util_text:
        ngrams_text.append(get_ngrams(t, 3 )) 
# bigrams_text
list_ngrams = flatten(ngrams_text)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_ngrams if t not in english_stops]
util_text_tri = Counter(no_stops)  
util_text_tri.most_common(20)

  


[('a total of', 4),
 ('for a total', 4),
 (', 2017 ,', 4),
 ('2017 , the', 4),
 (', the company', 4),
 ('the company had', 3),
 ('a total cost', 3),
 ('total cost of', 3),
 ('december 31 ,', 3),
 ('31 , 2017', 3),
 ('aggregate cost of', 3),
 ('company had repurchased', 2),
 ('repurchased a total', 2),
 ('as of december', 2),
 ('of december 31', 2),
 ('the company has', 2),
 (', through december', 2),
 ('an aggregate cost', 2),
 ('board of directors', 2),
 ('share repurchase program', 2)]

In [30]:
trigram_dict_util = dict(util_text_tri.most_common(20))
trigram_dict_util.keys()
# trigram_dict_util.keys()

dict_keys(['a total of', 'for a total', ', 2017 ,', '2017 , the', ', the company', 'the company had', 'a total cost', 'total cost of', 'december 31 ,', '31 , 2017', 'aggregate cost of', 'company had repurchased', 'repurchased a total', 'as of december', 'of december 31', 'the company has', ', through december', 'an aggregate cost', 'board of directors', 'share repurchase program'])

In [20]:
intent_text = purchase[purchase["data_key_friendly_name"] == 'Share Repurchase Intention'].text
def get_ngrams(text, n ):
        n_grams = ngrams(word_tokenize(t.lower()), n)
        return [ ' '.join(grams) for grams in n_grams]
ngrams_text = []    
for t in intent_text:
        ngrams_text.append(get_ngrams(t, 3 )) 
# bigrams_text
list_ngrams = flatten(ngrams_text)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_ngrams if t not in english_stops]
intent_text_tri = Counter(no_stops)  
intent_text_tri.most_common(20)

  after removing the cwd from sys.path.


[('31 , 2017', 32),
 ('december 31 ,', 22),
 ('board of directors', 22),
 ('share repurchase program', 16),
 ('stock repurchase program', 14),
 ("the company 's", 13),
 ('remained available for', 11),
 ('our board of', 11),
 ('repurchase up to', 10),
 ('share repurchase authorization', 10),
 ('available under the', 10),
 ('to repurchase up', 9),
 ('repurchase under the', 8),
 ('under the program', 8),
 ('1 , 2017', 8),
 ('as of december', 7),
 ('for repurchase under', 7),
 ('of directors authorized', 7),
 ('30 , 2017', 6),
 ('available for repurchase', 6)]

In [29]:
trigram_dict_intent = dict(intent_text_tri.most_common(20))
trigram_dict_intent.keys()

dict_keys(['31 , 2017', 'december 31 ,', 'board of directors', 'share repurchase program', 'stock repurchase program', "the company 's", 'remained available for', 'our board of', 'repurchase up to', 'share repurchase authorization', 'available under the', 'to repurchase up', 'repurchase under the', 'under the program', '1 , 2017', 'as of december', 'for repurchase under', 'of directors authorized', '30 , 2017', 'available for repurchase'])

In [22]:
unk_repurchase_text = purchase[purchase["data_key_friendly_name"] == 'Unknown Share Repurchase Data'].text
def get_ngrams(text, n ):
        n_grams = ngrams(word_tokenize(t.lower()), n)
        return [ ' '.join(grams) for grams in n_grams]
ngrams_text = []    
for t in unk_repurchase_text:
        ngrams_text.append(get_ngrams(t, 3 )) 
# bigrams_text
list_ngrams = flatten(ngrams_text)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_ngrams if t not in english_stops]
unk_text_tri = Counter(no_stops)  
unk_text_tri.most_common(20)

  after removing the cwd from sys.path.


[('board of directors', 53),
 ('of directors authorized', 25),
 ('of up to', 24),
 ("'s board of", 19),
 ('authorized the repurchase', 18),
 ('the repurchase of', 18),
 ("the company 's", 16),
 ('the board of', 15),
 ('directors authorized the', 15),
 ("company 's board", 14),
 ('repurchase up to', 14),
 ('our board of', 14),
 ('repurchase of up', 12),
 ('to repurchase up', 12),
 ('share repurchase program', 12),
 ('of directors approved', 8),
 ('available for repurchase', 8),
 ('the purchase of', 8),
 ('for repurchase under', 7),
 ('repurchase under the', 7)]

In [28]:
trigram_dict_unk = dict(unk_text_tri.most_common(20))
trigram_dict_unk.keys()

dict_keys(['board of directors', 'of directors authorized', 'of up to', "'s board of", 'authorized the repurchase', 'the repurchase of', "the company 's", 'the board of', 'directors authorized the', "company 's board", 'repurchase up to', 'our board of', 'repurchase of up', 'to repurchase up', 'share repurchase program', 'of directors approved', 'available for repurchase', 'the purchase of', 'for repurchase under', 'repurchase under the'])

In [36]:
#  Share Repurchase Authorization Date'

amt_repurchase_text = purchase[purchase["data_key_friendly_name"] == 'Amount Spent on Share Repurchases'].text
def get_ngrams(text, n ):
        n_grams = ngrams(word_tokenize(t.lower()), n)
        return [ ' '.join(grams) for grams in n_grams]
ngrams_text = []    
for t in date_repurchase_text:
        ngrams_text.append(get_ngrams(t, 3 )) 
# bigrams_text
list_ngrams = flatten(ngrams_text)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_ngrams if t not in english_stops]
amt_text_tri = Counter(no_stops)  
dict(amt_text_tri.most_common(20)).keys()


  


dict_keys(['a cost of', 'total cost of', 'at a cost', 'of common stock', 'a total cost', 'aggregate cost of', 'for a total', 'at a total', 'a total of', 'cost of shares', 'of shares repurchased', 'at an aggregate', 'an aggregate cost', 'in millions )', '( in millions', 'repurchase of common', 'december 31 ,', '31 , 2017', 'aggregate purchase price', 'cost of repurchases'])

In [37]:
count_repurchase_text = purchase[purchase["data_key_friendly_name"] == 'Share Repurchase Count'].text
def get_ngrams(text, n ):
        n_grams = ngrams(word_tokenize(t.lower()), n)
        return [ ' '.join(grams) for grams in n_grams]
ngrams_text = []    
for t in date_repurchase_text:
        ngrams_text.append(get_ngrams(t, 3 )) 
# bigrams_text
list_ngrams = flatten(ngrams_text)
english_stops = set(stopwords.words('english'))
no_stops = [t for t in list_ngrams if t not in english_stops]
count_text_tri = Counter(no_stops)  
dict(count_text_tri.most_common(20)).keys()

  after removing the cwd from sys.path.


dict_keys(['a cost of', 'total cost of', 'at a cost', 'of common stock', 'a total cost', 'aggregate cost of', 'for a total', 'at a total', 'a total of', 'cost of shares', 'of shares repurchased', 'at an aggregate', 'an aggregate cost', 'in millions )', '( in millions', 'repurchase of common', 'december 31 ,', '31 , 2017', 'aggregate purchase price', 'cost of repurchases'])

In [25]:
share_repurchase_auth = ['board of directors', 'of directors authoriz', 'of up to', 'our board of', 'the repurchase of', "'s board of", 
 'share repurchase program', "the company 's", 'the board of', 'of directors approved', 'authorized the repurchase',
 "company 's board", 'directors authorized a', 'repurchase up to', 'repurchase of up', 'directors authorized the', 
 'to repurchase up', 'directors approved a', 'stock repurchase program', 'authorized a new']

In [None]:
Unknown_Share_Repurchase_Data = ['board of director', 'of directors authoriz', 'of up to', "'s board of", 'authorized the repurchase', 
 'the repurchase of', "the company 's", 'the board of', 'directors authorized the', "company 's board",'repurchase up to', 'our board of', 'repurchase of up', 'to repurchase up', 'share repurchase program', 
 'of directors approv', 'available for repurchase', 'the purchase of', 'for repurchase under', 'repurchase under the']

In [None]:
Share_Repurchase_Intention = ['31 , 2017', 'december 31 ,', 'board of directors', 'share repurchase program', 'stock repurchase program', 
 "the company 's", 'remained available for', 'our board of', 'repurchase up to', 'share repurchase authorization',
 'available under the', 'to repurchase up', 'repurchase under the', 'under the program', '1 , 2017', 
 'as of december', 'for repurchase under', 'of directors authorized', '30 , 2017', 'available for repurchase']


In [None]:
Share_Repurchase_Utilization = ['a total of', 'for a total', ', 2017 ,', '2017 , the', ', the company', 
'the company had', 'a total cost', 'total cost of', 'december 31 ,', '31 , 2017', 'aggregate cost of', 
'company had repurchased', 'repurchased a total', 'as of december', 'of december 31', 'the company has', 
', through december', 'an aggregate cost', 'board of directors', 'share repurchase program']


In [None]:
Amount_Spent_on_Share_Repurchases = ['a cost of', 'total cost of', 'at a cost', 'of common stock', 'a total cost', 'aggregate cost of', 
 'for a total', 'at a total', 'a total of', 'cost of shares', 'of shares repurchased', 'at an aggregate', 'an aggregate cost', 
 'in millions )', '( in millions', 'repurchase of common', 'december 31 ,', '31 , 2017', 'aggregate purchase price', 'cost of repurchases']



In [None]:
Share_Repurchase_Count = ['a cost of', 'total cost of', 'at a cost', 'of common stock', 'a total cost',
'aggregate cost of', 'for a total', 'at a total', 'a total of', 'cost of shares', 'of shares repurchased', 
'at an aggregate', 'an aggregate cost', 'in millions )', '( in millions', 'repurchase of common', 'december 31 ,', 
'31 , 2017', 'aggregate purchase price', 'cost of repurchases']
